{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 838, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.0434608134535555, "learning_rate": 2e-05, "loss": 0.7234, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.9514763639201144, "learning_rate": 4e-05, "loss": 0.5916, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.8903336235741379, "learning_rate": 6e-05, "loss": 0.5827, "step": 3 }, { "epoch": 0.0, "grad_norm": 0.764603046688215, "learning_rate": 8e-05, "loss": 0.5555, "step": 4 }, { "epoch": 0.0, "grad_norm": 0.8956967835024867, "learning_rate": 0.0001, "loss": 0.6073, "step": 5 }, { "epoch": 0.0, "grad_norm": 1.378526599131456, "learning_rate": 0.00012, "loss": 0.6204, "step": 6 }, { "epoch": 0.0, "grad_norm": 1.1442512069955262, "learning_rate": 0.00014, "loss": 0.5452, "step": 7 }, { "epoch": 0.0, "grad_norm": 0.9100913748328603, "learning_rate": 0.00016, "loss": 0.5858, "step": 8 }, { "epoch": 0.01, "grad_norm": 0.6859557442210851, "learning_rate": 0.00018, "loss": 0.5182, "step": 9 }, { "epoch": 0.01, "grad_norm": 0.8288606055568941, "learning_rate": 0.0002, "loss": 0.6083, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.0417686973414615, "learning_rate": 0.0001999999558168346, "loss": 0.65, "step": 11 }, { "epoch": 0.01, "grad_norm": 0.8756017006672853, "learning_rate": 0.00019999982326737747, "loss": 0.5837, "step": 12 }, { "epoch": 0.01, "grad_norm": 0.7284541589078422, "learning_rate": 0.0001999996023517457, "loss": 0.5738, "step": 13 }, { "epoch": 0.01, "grad_norm": 0.8602799120413903, "learning_rate": 0.0001999992930701345, "loss": 0.595, "step": 14 }, { "epoch": 0.01, "grad_norm": 0.7938840633060059, "learning_rate": 0.00019999889542281728, "loss": 0.5907, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.7022632853348306, "learning_rate": 0.00019999840941014525, "loss": 0.5513, "step": 16 }, { "epoch": 0.01, "grad_norm": 0.6531781263691616, "learning_rate": 0.00019999783503254803, "loss": 0.5475, "step": 17 }, { "epoch": 0.01, "grad_norm": 0.5808528104992969, "learning_rate": 0.0001999971722905331, "loss": 0.519, "step": 18 }, { "epoch": 0.01, "grad_norm": 0.5452643486331965, "learning_rate": 0.00019999642118468614, "loss": 0.5421, "step": 19 }, { "epoch": 0.01, "grad_norm": 0.5893567274117093, "learning_rate": 0.00019999558171567082, "loss": 0.6016, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.5039905113559068, "learning_rate": 0.000199994653884229, "loss": 0.6096, "step": 21 }, { "epoch": 0.01, "grad_norm": 0.5236847425188783, "learning_rate": 0.00019999363769118055, "loss": 0.5845, "step": 22 }, { "epoch": 0.01, "grad_norm": 0.4403777461558745, "learning_rate": 0.00019999253313742344, "loss": 0.5657, "step": 23 }, { "epoch": 0.01, "grad_norm": 0.411935443568472, "learning_rate": 0.00019999134022393375, "loss": 0.5619, "step": 24 }, { "epoch": 0.01, "grad_norm": 0.3669921362459581, "learning_rate": 0.0001999900589517656, "loss": 0.6115, "step": 25 }, { "epoch": 0.02, "grad_norm": 0.3613613842516578, "learning_rate": 0.0001999886893220512, "loss": 0.5286, "step": 26 }, { "epoch": 0.02, "grad_norm": 0.378560233146017, "learning_rate": 0.0001999872313360008, "loss": 0.5887, "step": 27 }, { "epoch": 0.02, "grad_norm": 0.3562687135057843, "learning_rate": 0.00019998568499490283, "loss": 0.5598, "step": 28 }, { "epoch": 0.02, "grad_norm": 0.34581851507208355, "learning_rate": 0.00019998405030012371, "loss": 0.5772, "step": 29 }, { "epoch": 0.02, "grad_norm": 0.3812400800902662, "learning_rate": 0.00019998232725310796, "loss": 0.6154, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.2876811822631032, "learning_rate": 0.00019998051585537818, "loss": 0.4949, "step": 31 }, { "epoch": 0.02, "grad_norm": 0.3291934945038139, "learning_rate": 0.00019997861610853503, "loss": 0.5388, "step": 32 }, { "epoch": 0.02, "grad_norm": 0.35220229516562385, "learning_rate": 0.00019997662801425725, "loss": 0.5801, "step": 33 }, { "epoch": 0.02, "grad_norm": 0.3908550087374589, "learning_rate": 0.00019997455157430165, "loss": 0.5783, "step": 34 }, { "epoch": 0.02, "grad_norm": 0.38564043955621646, "learning_rate": 0.00019997238679050308, "loss": 0.5628, "step": 35 }, { "epoch": 0.02, "grad_norm": 0.3248792879576579, "learning_rate": 0.00019997013366477453, "loss": 0.5896, "step": 36 }, { "epoch": 0.02, "grad_norm": 0.33243708146621687, "learning_rate": 0.00019996779219910696, "loss": 0.5618, "step": 37 }, { "epoch": 0.02, "grad_norm": 0.3589452470436555, "learning_rate": 0.00019996536239556942, "loss": 0.5387, "step": 38 }, { "epoch": 0.02, "grad_norm": 0.3635189263065437, "learning_rate": 0.0001999628442563091, "loss": 0.629, "step": 39 }, { "epoch": 0.02, "grad_norm": 0.36761442942017947, "learning_rate": 0.00019996023778355113, "loss": 0.6133, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.30331424568033827, "learning_rate": 0.00019995754297959882, "loss": 0.5377, "step": 41 }, { "epoch": 0.03, "grad_norm": 0.3157701200247212, "learning_rate": 0.0001999547598468334, "loss": 0.6249, "step": 42 }, { "epoch": 0.03, "grad_norm": 0.3112819252913729, "learning_rate": 0.00019995188838771425, "loss": 0.5424, "step": 43 }, { "epoch": 0.03, "grad_norm": 0.33384944369487113, "learning_rate": 0.0001999489286047788, "loss": 0.6014, "step": 44 }, { "epoch": 0.03, "grad_norm": 0.3330564524921197, "learning_rate": 0.00019994588050064243, "loss": 0.5469, "step": 45 }, { "epoch": 0.03, "grad_norm": 0.2889450580843479, "learning_rate": 0.00019994274407799872, "loss": 0.512, "step": 46 }, { "epoch": 0.03, "grad_norm": 0.3273617072745067, "learning_rate": 0.00019993951933961913, "loss": 0.5456, "step": 47 }, { "epoch": 0.03, "grad_norm": 0.3228418979730564, "learning_rate": 0.00019993620628835332, "loss": 0.5716, "step": 48 }, { "epoch": 0.03, "grad_norm": 0.3439278828333003, "learning_rate": 0.0001999328049271289, "loss": 0.5177, "step": 49 }, { "epoch": 0.03, "grad_norm": 0.3186271172935729, "learning_rate": 0.0001999293152589515, "loss": 0.5502, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.38357922086488366, "learning_rate": 0.0001999257372869048, "loss": 0.6178, "step": 51 }, { "epoch": 0.03, "grad_norm": 0.36013497860303273, "learning_rate": 0.00019992207101415053, "loss": 0.6278, "step": 52 }, { "epoch": 0.03, "grad_norm": 0.3146595251755829, "learning_rate": 0.00019991831644392848, "loss": 0.5348, "step": 53 }, { "epoch": 0.03, "grad_norm": 0.3986948949803995, "learning_rate": 0.00019991447357955639, "loss": 0.6331, "step": 54 }, { "epoch": 0.03, "grad_norm": 0.3194176204715625, "learning_rate": 0.00019991054242443008, "loss": 0.5817, "step": 55 }, { "epoch": 0.03, "grad_norm": 0.29564051537116465, "learning_rate": 0.00019990652298202335, "loss": 0.545, "step": 56 }, { "epoch": 0.03, "grad_norm": 0.2908246398716361, "learning_rate": 0.00019990241525588804, "loss": 0.5294, "step": 57 }, { "epoch": 0.03, "grad_norm": 0.3480952622658696, "learning_rate": 0.000199898219249654, "loss": 0.6282, "step": 58 }, { "epoch": 0.04, "grad_norm": 0.4278607132919695, "learning_rate": 0.00019989393496702907, "loss": 0.7008, "step": 59 }, { "epoch": 0.04, "grad_norm": 0.3088760407735635, "learning_rate": 0.00019988956241179912, "loss": 0.5747, "step": 60 }, { "epoch": 0.04, "grad_norm": 0.3549589308890128, "learning_rate": 0.00019988510158782804, "loss": 0.615, "step": 61 }, { "epoch": 0.04, "grad_norm": 0.28349959678885256, "learning_rate": 0.00019988055249905767, "loss": 0.577, "step": 62 }, { "epoch": 0.04, "grad_norm": 0.35001480138074803, "learning_rate": 0.00019987591514950787, "loss": 0.5551, "step": 63 }, { "epoch": 0.04, "grad_norm": 0.32895401860950285, "learning_rate": 0.00019987118954327654, "loss": 0.5617, "step": 64 }, { "epoch": 0.04, "grad_norm": 0.347007326906862, "learning_rate": 0.00019986637568453945, "loss": 0.5935, "step": 65 }, { "epoch": 0.04, "grad_norm": 0.30223641676037666, "learning_rate": 0.00019986147357755048, "loss": 0.5355, "step": 66 }, { "epoch": 0.04, "grad_norm": 0.302279046184156, "learning_rate": 0.00019985648322664145, "loss": 0.5571, "step": 67 }, { "epoch": 0.04, "grad_norm": 0.3181910281320864, "learning_rate": 0.00019985140463622215, "loss": 0.5198, "step": 68 }, { "epoch": 0.04, "grad_norm": 0.32334719229096776, "learning_rate": 0.0001998462378107803, "loss": 0.5063, "step": 69 }, { "epoch": 0.04, "grad_norm": 0.31038499461943353, "learning_rate": 0.0001998409827548817, "loss": 0.5805, "step": 70 }, { "epoch": 0.04, "grad_norm": 0.3032049786542595, "learning_rate": 0.00019983563947316996, "loss": 0.564, "step": 71 }, { "epoch": 0.04, "grad_norm": 0.3345643555445713, "learning_rate": 0.00019983020797036683, "loss": 0.5442, "step": 72 }, { "epoch": 0.04, "grad_norm": 0.32583720357675877, "learning_rate": 0.00019982468825127187, "loss": 0.5674, "step": 73 }, { "epoch": 0.04, "grad_norm": 0.30278224625704836, "learning_rate": 0.0001998190803207627, "loss": 0.569, "step": 74 }, { "epoch": 0.04, "grad_norm": 0.29996902392483177, "learning_rate": 0.0001998133841837948, "loss": 0.6142, "step": 75 }, { "epoch": 0.05, "grad_norm": 0.2947151895973628, "learning_rate": 0.00019980759984540168, "loss": 0.5084, "step": 76 }, { "epoch": 0.05, "grad_norm": 0.33309005605837944, "learning_rate": 0.0001998017273106947, "loss": 0.5807, "step": 77 }, { "epoch": 0.05, "grad_norm": 0.31281343399912853, "learning_rate": 0.00019979576658486325, "loss": 0.6299, "step": 78 }, { "epoch": 0.05, "grad_norm": 0.30980839190781245, "learning_rate": 0.00019978971767317457, "loss": 0.5521, "step": 79 }, { "epoch": 0.05, "grad_norm": 0.2988115356408324, "learning_rate": 0.00019978358058097388, "loss": 0.5645, "step": 80 }, { "epoch": 0.05, "grad_norm": 0.2864799366004751, "learning_rate": 0.0001997773553136843, "loss": 0.5604, "step": 81 }, { "epoch": 0.05, "grad_norm": 0.28284272149262185, "learning_rate": 0.00019977104187680688, "loss": 0.5964, "step": 82 }, { "epoch": 0.05, "grad_norm": 0.2807639116477172, "learning_rate": 0.00019976464027592053, "loss": 0.5441, "step": 83 }, { "epoch": 0.05, "grad_norm": 0.3169919212395633, "learning_rate": 0.00019975815051668217, "loss": 0.5672, "step": 84 }, { "epoch": 0.05, "grad_norm": 0.2827524094344841, "learning_rate": 0.0001997515726048265, "loss": 0.5631, "step": 85 }, { "epoch": 0.05, "grad_norm": 0.29538492598974014, "learning_rate": 0.00019974490654616625, "loss": 0.609, "step": 86 }, { "epoch": 0.05, "grad_norm": 0.29397994414783907, "learning_rate": 0.0001997381523465919, "loss": 0.5723, "step": 87 }, { "epoch": 0.05, "grad_norm": 0.2824005824065347, "learning_rate": 0.00019973131001207195, "loss": 0.5209, "step": 88 }, { "epoch": 0.05, "grad_norm": 0.2847018611508931, "learning_rate": 0.00019972437954865265, "loss": 0.5617, "step": 89 }, { "epoch": 0.05, "grad_norm": 0.2908040007926844, "learning_rate": 0.00019971736096245825, "loss": 0.5624, "step": 90 }, { "epoch": 0.05, "grad_norm": 0.27754759063410545, "learning_rate": 0.00019971025425969083, "loss": 0.5353, "step": 91 }, { "epoch": 0.05, "grad_norm": 0.2885964599083646, "learning_rate": 0.0001997030594466303, "loss": 0.5181, "step": 92 }, { "epoch": 0.06, "grad_norm": 0.29372725474160666, "learning_rate": 0.00019969577652963444, "loss": 0.5757, "step": 93 }, { "epoch": 0.06, "grad_norm": 0.32149915053639194, "learning_rate": 0.0001996884055151389, "loss": 0.552, "step": 94 }, { "epoch": 0.06, "grad_norm": 0.2816717380191918, "learning_rate": 0.00019968094640965717, "loss": 0.4968, "step": 95 }, { "epoch": 0.06, "grad_norm": 0.2719020140724135, "learning_rate": 0.00019967339921978062, "loss": 0.5503, "step": 96 }, { "epoch": 0.06, "grad_norm": 0.28166729851780475, "learning_rate": 0.00019966576395217837, "loss": 0.5546, "step": 97 }, { "epoch": 0.06, "grad_norm": 0.27817598279558775, "learning_rate": 0.0001996580406135975, "loss": 0.6145, "step": 98 }, { "epoch": 0.06, "grad_norm": 0.3471492732103861, "learning_rate": 0.00019965022921086275, "loss": 0.6464, "step": 99 }, { "epoch": 0.06, "grad_norm": 0.2838977359279957, "learning_rate": 0.00019964232975087687, "loss": 0.5576, "step": 100 }, { "epoch": 0.06, "grad_norm": 0.2989119777268752, "learning_rate": 0.00019963434224062025, "loss": 0.5747, "step": 101 }, { "epoch": 0.06, "grad_norm": 0.26079447242968457, "learning_rate": 0.0001996262666871512, "loss": 0.5144, "step": 102 }, { "epoch": 0.06, "grad_norm": 0.2904578974664885, "learning_rate": 0.00019961810309760577, "loss": 0.5623, "step": 103 }, { "epoch": 0.06, "grad_norm": 0.2682051539483259, "learning_rate": 0.00019960985147919778, "loss": 0.5722, "step": 104 }, { "epoch": 0.06, "grad_norm": 0.3004061592870477, "learning_rate": 0.00019960151183921897, "loss": 0.5526, "step": 105 }, { "epoch": 0.06, "grad_norm": 0.27675115608209533, "learning_rate": 0.00019959308418503877, "loss": 0.5859, "step": 106 }, { "epoch": 0.06, "grad_norm": 0.26526760651496173, "learning_rate": 0.00019958456852410433, "loss": 0.5395, "step": 107 }, { "epoch": 0.06, "grad_norm": 0.29513224606753785, "learning_rate": 0.0001995759648639406, "loss": 0.59, "step": 108 }, { "epoch": 0.07, "grad_norm": 0.27848396362609984, "learning_rate": 0.00019956727321215044, "loss": 0.6076, "step": 109 }, { "epoch": 0.07, "grad_norm": 0.29804931563512865, "learning_rate": 0.00019955849357641424, "loss": 0.5555, "step": 110 }, { "epoch": 0.07, "grad_norm": 0.2756038079003531, "learning_rate": 0.00019954962596449024, "loss": 0.5542, "step": 111 }, { "epoch": 0.07, "grad_norm": 0.28433942136507906, "learning_rate": 0.0001995406703842145, "loss": 0.5527, "step": 112 }, { "epoch": 0.07, "grad_norm": 0.646736414676863, "learning_rate": 0.0001995316268435007, "loss": 0.7024, "step": 113 }, { "epoch": 0.07, "grad_norm": 0.25030459489112267, "learning_rate": 0.00019952249535034025, "loss": 0.4928, "step": 114 }, { "epoch": 0.07, "grad_norm": 0.30977646038996587, "learning_rate": 0.00019951327591280236, "loss": 0.5883, "step": 115 }, { "epoch": 0.07, "grad_norm": 0.2854791945432696, "learning_rate": 0.0001995039685390339, "loss": 0.6318, "step": 116 }, { "epoch": 0.07, "grad_norm": 0.3264119003161445, "learning_rate": 0.00019949457323725946, "loss": 0.5654, "step": 117 }, { "epoch": 0.07, "grad_norm": 0.266512900378873, "learning_rate": 0.0001994850900157813, "loss": 0.5457, "step": 118 }, { "epoch": 0.07, "grad_norm": 0.30259231663288877, "learning_rate": 0.0001994755188829794, "loss": 0.5828, "step": 119 }, { "epoch": 0.07, "grad_norm": 0.33798187117798184, "learning_rate": 0.00019946585984731142, "loss": 0.5669, "step": 120 }, { "epoch": 0.07, "grad_norm": 0.2769767197030659, "learning_rate": 0.00019945611291731274, "loss": 0.5619, "step": 121 }, { "epoch": 0.07, "grad_norm": 0.2657775119925365, "learning_rate": 0.00019944627810159632, "loss": 0.59, "step": 122 }, { "epoch": 0.07, "grad_norm": 0.2918370192930011, "learning_rate": 0.00019943635540885279, "loss": 0.5816, "step": 123 }, { "epoch": 0.07, "grad_norm": 0.33417238245851544, "learning_rate": 0.00019942634484785052, "loss": 0.5921, "step": 124 }, { "epoch": 0.07, "grad_norm": 0.2482914745160091, "learning_rate": 0.00019941624642743548, "loss": 0.5113, "step": 125 }, { "epoch": 0.08, "grad_norm": 0.28272991564412037, "learning_rate": 0.0001994060601565313, "loss": 0.5543, "step": 126 }, { "epoch": 0.08, "grad_norm": 0.35389083143384653, "learning_rate": 0.00019939578604413912, "loss": 0.5921, "step": 127 }, { "epoch": 0.08, "grad_norm": 0.3426787529883331, "learning_rate": 0.00019938542409933787, "loss": 0.6073, "step": 128 }, { "epoch": 0.08, "grad_norm": 0.25621145606980844, "learning_rate": 0.000199374974331284, "loss": 0.5639, "step": 129 }, { "epoch": 0.08, "grad_norm": 0.2996022840977615, "learning_rate": 0.00019936443674921158, "loss": 0.5874, "step": 130 }, { "epoch": 0.08, "grad_norm": 0.3283726508140091, "learning_rate": 0.0001993538113624323, "loss": 0.6295, "step": 131 }, { "epoch": 0.08, "grad_norm": 0.2878523809947236, "learning_rate": 0.00019934309818033544, "loss": 0.5565, "step": 132 }, { "epoch": 0.08, "grad_norm": 0.2764010556696928, "learning_rate": 0.0001993322972123878, "loss": 0.554, "step": 133 }, { "epoch": 0.08, "grad_norm": 0.28827215996506156, "learning_rate": 0.0001993214084681338, "loss": 0.5788, "step": 134 }, { "epoch": 0.08, "grad_norm": 0.2945105689397871, "learning_rate": 0.00019931043195719548, "loss": 0.5197, "step": 135 }, { "epoch": 0.08, "grad_norm": 0.25439488455073433, "learning_rate": 0.00019929936768927232, "loss": 0.509, "step": 136 }, { "epoch": 0.08, "grad_norm": 0.2746742447613063, "learning_rate": 0.00019928821567414144, "loss": 0.5479, "step": 137 }, { "epoch": 0.08, "grad_norm": 0.3033960913538536, "learning_rate": 0.00019927697592165747, "loss": 0.5859, "step": 138 }, { "epoch": 0.08, "grad_norm": 0.28486244663010424, "learning_rate": 0.00019926564844175256, "loss": 0.5951, "step": 139 }, { "epoch": 0.08, "grad_norm": 0.3208016816012168, "learning_rate": 0.00019925423324443638, "loss": 0.5823, "step": 140 }, { "epoch": 0.08, "grad_norm": 0.3005362808247367, "learning_rate": 0.00019924273033979613, "loss": 0.5652, "step": 141 }, { "epoch": 0.08, "grad_norm": 0.26910338931514155, "learning_rate": 0.0001992311397379965, "loss": 0.5463, "step": 142 }, { "epoch": 0.09, "grad_norm": 0.28718562659091934, "learning_rate": 0.00019921946144927966, "loss": 0.5245, "step": 143 }, { "epoch": 0.09, "grad_norm": 0.2616953117781411, "learning_rate": 0.0001992076954839653, "loss": 0.5358, "step": 144 }, { "epoch": 0.09, "grad_norm": 0.3107257337368987, "learning_rate": 0.00019919584185245062, "loss": 0.5536, "step": 145 }, { "epoch": 0.09, "grad_norm": 0.28427245738065265, "learning_rate": 0.00019918390056521018, "loss": 0.6126, "step": 146 }, { "epoch": 0.09, "grad_norm": 0.25398310452304734, "learning_rate": 0.00019917187163279605, "loss": 0.5068, "step": 147 }, { "epoch": 0.09, "grad_norm": 0.2556256469730818, "learning_rate": 0.00019915975506583778, "loss": 0.5416, "step": 148 }, { "epoch": 0.09, "grad_norm": 0.2611702329742577, "learning_rate": 0.00019914755087504236, "loss": 0.5276, "step": 149 }, { "epoch": 0.09, "grad_norm": 0.2789045987166575, "learning_rate": 0.00019913525907119418, "loss": 0.5591, "step": 150 }, { "epoch": 0.09, "grad_norm": 0.26837658503581957, "learning_rate": 0.000199122879665155, "loss": 0.6581, "step": 151 }, { "epoch": 0.09, "grad_norm": 0.34601396912277804, "learning_rate": 0.0001991104126678641, "loss": 0.5394, "step": 152 }, { "epoch": 0.09, "grad_norm": 0.25684957257052443, "learning_rate": 0.00019909785809033806, "loss": 0.5392, "step": 153 }, { "epoch": 0.09, "grad_norm": 0.2906797315813485, "learning_rate": 0.00019908521594367098, "loss": 0.5185, "step": 154 }, { "epoch": 0.09, "grad_norm": 0.2852843546202924, "learning_rate": 0.0001990724862390342, "loss": 0.5436, "step": 155 }, { "epoch": 0.09, "grad_norm": 0.2875355300862882, "learning_rate": 0.0001990596689876765, "loss": 0.6009, "step": 156 }, { "epoch": 0.09, "grad_norm": 0.32052910212305513, "learning_rate": 0.00019904676420092404, "loss": 0.5831, "step": 157 }, { "epoch": 0.09, "grad_norm": 0.266884162852661, "learning_rate": 0.00019903377189018024, "loss": 0.5459, "step": 158 }, { "epoch": 0.09, "grad_norm": 0.2957365744895018, "learning_rate": 0.000199020692066926, "loss": 0.5211, "step": 159 }, { "epoch": 0.1, "grad_norm": 0.24951992931808137, "learning_rate": 0.00019900752474271945, "loss": 0.497, "step": 160 }, { "epoch": 0.1, "grad_norm": 0.30509964150122953, "learning_rate": 0.0001989942699291961, "loss": 0.5812, "step": 161 }, { "epoch": 0.1, "grad_norm": 0.2790293776337124, "learning_rate": 0.0001989809276380687, "loss": 0.5856, "step": 162 }, { "epoch": 0.1, "grad_norm": 0.24940387850774506, "learning_rate": 0.00019896749788112737, "loss": 0.5281, "step": 163 }, { "epoch": 0.1, "grad_norm": 0.2664890107453781, "learning_rate": 0.0001989539806702395, "loss": 0.524, "step": 164 }, { "epoch": 0.1, "grad_norm": 0.2896608423493073, "learning_rate": 0.0001989403760173497, "loss": 0.5171, "step": 165 }, { "epoch": 0.1, "grad_norm": 0.2544937836162412, "learning_rate": 0.00019892668393447997, "loss": 0.5546, "step": 166 }, { "epoch": 0.1, "grad_norm": 0.2626626371027326, "learning_rate": 0.00019891290443372944, "loss": 0.5498, "step": 167 }, { "epoch": 0.1, "grad_norm": 0.281410490858952, "learning_rate": 0.0001988990375272746, "loss": 0.5377, "step": 168 }, { "epoch": 0.1, "grad_norm": 0.3376943176164128, "learning_rate": 0.0001988850832273691, "loss": 0.5469, "step": 169 }, { "epoch": 0.1, "grad_norm": 0.2507691377758427, "learning_rate": 0.0001988710415463439, "loss": 0.549, "step": 170 }, { "epoch": 0.1, "grad_norm": 0.27178996752570117, "learning_rate": 0.00019885691249660702, "loss": 0.5636, "step": 171 }, { "epoch": 0.1, "grad_norm": 0.3359421766962587, "learning_rate": 0.00019884269609064386, "loss": 0.5957, "step": 172 }, { "epoch": 0.1, "grad_norm": 0.2638709645045905, "learning_rate": 0.0001988283923410169, "loss": 0.5793, "step": 173 }, { "epoch": 0.1, "grad_norm": 0.25585919726912226, "learning_rate": 0.00019881400126036582, "loss": 0.5817, "step": 174 }, { "epoch": 0.1, "grad_norm": 0.2905067973645414, "learning_rate": 0.00019879952286140754, "loss": 0.5585, "step": 175 }, { "epoch": 0.11, "grad_norm": 0.24197399766587002, "learning_rate": 0.0001987849571569361, "loss": 0.507, "step": 176 }, { "epoch": 0.11, "grad_norm": 0.28898034252731664, "learning_rate": 0.0001987703041598226, "loss": 0.5981, "step": 177 }, { "epoch": 0.11, "grad_norm": 0.26516349701479863, "learning_rate": 0.00019875556388301543, "loss": 0.56, "step": 178 }, { "epoch": 0.11, "grad_norm": 0.27235027968517367, "learning_rate": 0.00019874073633953997, "loss": 0.5872, "step": 179 }, { "epoch": 0.11, "grad_norm": 0.2692241567318253, "learning_rate": 0.00019872582154249884, "loss": 0.5397, "step": 180 }, { "epoch": 0.11, "grad_norm": 0.2560507155942398, "learning_rate": 0.00019871081950507163, "loss": 0.5431, "step": 181 }, { "epoch": 0.11, "grad_norm": 0.26691224099103567, "learning_rate": 0.00019869573024051517, "loss": 0.5608, "step": 182 }, { "epoch": 0.11, "grad_norm": 0.2961375506924155, "learning_rate": 0.00019868055376216323, "loss": 0.5784, "step": 183 }, { "epoch": 0.11, "grad_norm": 0.26055755072015874, "learning_rate": 0.00019866529008342673, "loss": 0.5369, "step": 184 }, { "epoch": 0.11, "grad_norm": 0.2525359310079611, "learning_rate": 0.00019864993921779361, "loss": 0.5438, "step": 185 }, { "epoch": 0.11, "grad_norm": 0.249327141855566, "learning_rate": 0.0001986345011788289, "loss": 0.5668, "step": 186 }, { "epoch": 0.11, "grad_norm": 0.2983950007732028, "learning_rate": 0.00019861897598017457, "loss": 0.5271, "step": 187 }, { "epoch": 0.11, "grad_norm": 0.25610455444964525, "learning_rate": 0.00019860336363554973, "loss": 0.6012, "step": 188 }, { "epoch": 0.11, "grad_norm": 0.24760444410184018, "learning_rate": 0.0001985876641587504, "loss": 0.5066, "step": 189 }, { "epoch": 0.11, "grad_norm": 0.2614264060863463, "learning_rate": 0.00019857187756364958, "loss": 0.5792, "step": 190 }, { "epoch": 0.11, "grad_norm": 0.27219045408444215, "learning_rate": 0.00019855600386419744, "loss": 0.543, "step": 191 }, { "epoch": 0.11, "grad_norm": 0.24606131498871828, "learning_rate": 0.00019854004307442088, "loss": 0.5676, "step": 192 }, { "epoch": 0.12, "grad_norm": 0.28394763236035964, "learning_rate": 0.0001985239952084239, "loss": 0.6032, "step": 193 }, { "epoch": 0.12, "grad_norm": 0.28350170917034406, "learning_rate": 0.0001985078602803874, "loss": 0.6264, "step": 194 }, { "epoch": 0.12, "grad_norm": 0.24011552907338696, "learning_rate": 0.00019849163830456922, "loss": 0.4793, "step": 195 }, { "epoch": 0.12, "grad_norm": 0.2561209086280576, "learning_rate": 0.00019847532929530415, "loss": 0.6198, "step": 196 }, { "epoch": 0.12, "grad_norm": 0.23712886628255178, "learning_rate": 0.00019845893326700384, "loss": 0.4989, "step": 197 }, { "epoch": 0.12, "grad_norm": 0.26720592489417233, "learning_rate": 0.00019844245023415685, "loss": 0.4934, "step": 198 }, { "epoch": 0.12, "grad_norm": 0.2753251397417421, "learning_rate": 0.0001984258802113287, "loss": 0.5544, "step": 199 }, { "epoch": 0.12, "grad_norm": 0.2557869713293877, "learning_rate": 0.0001984092232131616, "loss": 0.5643, "step": 200 }, { "epoch": 0.12, "grad_norm": 0.2669651919314609, "learning_rate": 0.0001983924792543748, "loss": 0.5879, "step": 201 }, { "epoch": 0.12, "grad_norm": 0.25579187132615644, "learning_rate": 0.00019837564834976432, "loss": 0.5742, "step": 202 }, { "epoch": 0.12, "grad_norm": 0.2550207949421237, "learning_rate": 0.000198358730514203, "loss": 0.574, "step": 203 }, { "epoch": 0.12, "grad_norm": 0.23565090455665763, "learning_rate": 0.0001983417257626405, "loss": 0.5299, "step": 204 }, { "epoch": 0.12, "grad_norm": 0.236980034600526, "learning_rate": 0.00019832463411010331, "loss": 0.5199, "step": 205 }, { "epoch": 0.12, "grad_norm": 0.2434029841189093, "learning_rate": 0.0001983074555716947, "loss": 0.5477, "step": 206 }, { "epoch": 0.12, "grad_norm": 0.24771511082227154, "learning_rate": 0.00019829019016259468, "loss": 0.5697, "step": 207 }, { "epoch": 0.12, "grad_norm": 0.23705880771864213, "learning_rate": 0.00019827283789806011, "loss": 0.521, "step": 208 }, { "epoch": 0.12, "grad_norm": 0.24167295291353477, "learning_rate": 0.0001982553987934245, "loss": 0.558, "step": 209 }, { "epoch": 0.13, "grad_norm": 0.2535406245356529, "learning_rate": 0.0001982378728640982, "loss": 0.5693, "step": 210 }, { "epoch": 0.13, "grad_norm": 0.24865334136075806, "learning_rate": 0.00019822026012556818, "loss": 0.5499, "step": 211 }, { "epoch": 0.13, "grad_norm": 0.2544751551481819, "learning_rate": 0.0001982025605933982, "loss": 0.5449, "step": 212 }, { "epoch": 0.13, "grad_norm": 0.2599391794330939, "learning_rate": 0.0001981847742832287, "loss": 0.6222, "step": 213 }, { "epoch": 0.13, "grad_norm": 0.23171934920449544, "learning_rate": 0.00019816690121077674, "loss": 0.5448, "step": 214 }, { "epoch": 0.13, "grad_norm": 0.24380268930715565, "learning_rate": 0.00019814894139183614, "loss": 0.5773, "step": 215 }, { "epoch": 0.13, "grad_norm": 0.2518755786374484, "learning_rate": 0.00019813089484227732, "loss": 0.5479, "step": 216 }, { "epoch": 0.13, "grad_norm": 0.23133984467720642, "learning_rate": 0.00019811276157804733, "loss": 0.471, "step": 217 }, { "epoch": 0.13, "grad_norm": 0.250968947574734, "learning_rate": 0.00019809454161516993, "loss": 0.5738, "step": 218 }, { "epoch": 0.13, "grad_norm": 0.25976015596485974, "learning_rate": 0.00019807623496974537, "loss": 0.5592, "step": 219 }, { "epoch": 0.13, "grad_norm": 0.24400199531999783, "learning_rate": 0.0001980578416579506, "loss": 0.5266, "step": 220 }, { "epoch": 0.13, "grad_norm": 0.24001572180370875, "learning_rate": 0.00019803936169603912, "loss": 0.5843, "step": 221 }, { "epoch": 0.13, "grad_norm": 0.22867208326764507, "learning_rate": 0.00019802079510034096, "loss": 0.518, "step": 222 }, { "epoch": 0.13, "grad_norm": 0.2381724022911579, "learning_rate": 0.00019800214188726276, "loss": 0.5175, "step": 223 }, { "epoch": 0.13, "grad_norm": 0.2700455397530704, "learning_rate": 0.00019798340207328766, "loss": 0.5804, "step": 224 }, { "epoch": 0.13, "grad_norm": 0.24320219539003604, "learning_rate": 0.00019796457567497537, "loss": 0.5304, "step": 225 }, { "epoch": 0.13, "grad_norm": 0.2370472610839002, "learning_rate": 0.0001979456627089621, "loss": 0.5671, "step": 226 }, { "epoch": 0.14, "grad_norm": 0.26756212643991917, "learning_rate": 0.0001979266631919605, "loss": 0.5528, "step": 227 }, { "epoch": 0.14, "grad_norm": 0.24929490389202372, "learning_rate": 0.00019790757714075979, "loss": 0.5407, "step": 228 }, { "epoch": 0.14, "grad_norm": 0.23090595152280974, "learning_rate": 0.00019788840457222556, "loss": 0.5258, "step": 229 }, { "epoch": 0.14, "grad_norm": 0.2979564627406142, "learning_rate": 0.0001978691455033, "loss": 0.5367, "step": 230 }, { "epoch": 0.14, "grad_norm": 0.24228845587479894, "learning_rate": 0.0001978497999510015, "loss": 0.5344, "step": 231 }, { "epoch": 0.14, "grad_norm": 0.25363482164729867, "learning_rate": 0.00019783036793242516, "loss": 0.5669, "step": 232 }, { "epoch": 0.14, "grad_norm": 0.23622712417060854, "learning_rate": 0.00019781084946474226, "loss": 0.5797, "step": 233 }, { "epoch": 0.14, "grad_norm": 0.21594682302559634, "learning_rate": 0.00019779124456520056, "loss": 0.5011, "step": 234 }, { "epoch": 0.14, "grad_norm": 0.24211833950801223, "learning_rate": 0.0001977715532511242, "loss": 0.5164, "step": 235 }, { "epoch": 0.14, "grad_norm": 0.2693820157715391, "learning_rate": 0.0001977517755399137, "loss": 0.5806, "step": 236 }, { "epoch": 0.14, "grad_norm": 0.24734981937576542, "learning_rate": 0.00019773191144904586, "loss": 0.5233, "step": 237 }, { "epoch": 0.14, "grad_norm": 0.2741663737268136, "learning_rate": 0.00019771196099607386, "loss": 0.5402, "step": 238 }, { "epoch": 0.14, "grad_norm": 0.2521489075033339, "learning_rate": 0.00019769192419862716, "loss": 0.5862, "step": 239 }, { "epoch": 0.14, "grad_norm": 0.2633255671236209, "learning_rate": 0.0001976718010744116, "loss": 0.548, "step": 240 }, { "epoch": 0.14, "grad_norm": 0.2691399721238696, "learning_rate": 0.00019765159164120916, "loss": 0.5648, "step": 241 }, { "epoch": 0.14, "grad_norm": 0.25501545746966797, "learning_rate": 0.00019763129591687827, "loss": 0.5602, "step": 242 }, { "epoch": 0.14, "grad_norm": 0.3049976908839563, "learning_rate": 0.00019761091391935347, "loss": 0.5508, "step": 243 }, { "epoch": 0.15, "grad_norm": 0.24467331400916031, "learning_rate": 0.00019759044566664558, "loss": 0.5229, "step": 244 }, { "epoch": 0.15, "grad_norm": 0.27011009612786374, "learning_rate": 0.00019756989117684164, "loss": 0.5448, "step": 245 }, { "epoch": 0.15, "grad_norm": 0.24427143044387528, "learning_rate": 0.00019754925046810493, "loss": 0.5435, "step": 246 }, { "epoch": 0.15, "grad_norm": 0.22753961311031143, "learning_rate": 0.00019752852355867486, "loss": 0.5369, "step": 247 }, { "epoch": 0.15, "grad_norm": 0.23865046003559778, "learning_rate": 0.00019750771046686704, "loss": 0.5354, "step": 248 }, { "epoch": 0.15, "grad_norm": 0.2736283930569903, "learning_rate": 0.00019748681121107325, "loss": 0.5588, "step": 249 }, { "epoch": 0.15, "grad_norm": 0.24727127426749082, "learning_rate": 0.00019746582580976136, "loss": 0.5753, "step": 250 }, { "epoch": 0.15, "grad_norm": 0.2828829340227291, "learning_rate": 0.00019744475428147546, "loss": 0.6793, "step": 251 }, { "epoch": 0.15, "grad_norm": 0.21818184663896711, "learning_rate": 0.00019742359664483563, "loss": 0.5248, "step": 252 }, { "epoch": 0.15, "grad_norm": 0.2320708306833192, "learning_rate": 0.00019740235291853812, "loss": 0.5461, "step": 253 }, { "epoch": 0.15, "grad_norm": 0.25703347930088793, "learning_rate": 0.00019738102312135523, "loss": 0.5713, "step": 254 }, { "epoch": 0.15, "grad_norm": 0.24399588128874033, "learning_rate": 0.0001973596072721353, "loss": 0.5178, "step": 255 }, { "epoch": 0.15, "grad_norm": 0.2229881452119291, "learning_rate": 0.00019733810538980281, "loss": 0.5144, "step": 256 }, { "epoch": 0.15, "grad_norm": 0.23889465035265364, "learning_rate": 0.0001973165174933581, "loss": 0.5727, "step": 257 }, { "epoch": 0.15, "grad_norm": 0.25790569964877214, "learning_rate": 0.0001972948436018776, "loss": 0.5659, "step": 258 }, { "epoch": 0.15, "grad_norm": 0.22511338701135042, "learning_rate": 0.00019727308373451377, "loss": 0.5292, "step": 259 }, { "epoch": 0.16, "grad_norm": 0.23111498863739158, "learning_rate": 0.000197251237910495, "loss": 0.5267, "step": 260 }, { "epoch": 0.16, "grad_norm": 0.23740021982137896, "learning_rate": 0.00019722930614912563, "loss": 0.5499, "step": 261 }, { "epoch": 0.16, "grad_norm": 0.24020258332985106, "learning_rate": 0.00019720728846978598, "loss": 0.5604, "step": 262 }, { "epoch": 0.16, "grad_norm": 0.23947573439011133, "learning_rate": 0.00019718518489193225, "loss": 0.5638, "step": 263 }, { "epoch": 0.16, "grad_norm": 0.23526187217481284, "learning_rate": 0.00019716299543509654, "loss": 0.5436, "step": 264 }, { "epoch": 0.16, "grad_norm": 0.2505831003191156, "learning_rate": 0.00019714072011888686, "loss": 0.5039, "step": 265 }, { "epoch": 0.16, "grad_norm": 0.21592487431784965, "learning_rate": 0.00019711835896298713, "loss": 0.484, "step": 266 }, { "epoch": 0.16, "grad_norm": 0.26528309878122613, "learning_rate": 0.00019709591198715707, "loss": 0.539, "step": 267 }, { "epoch": 0.16, "grad_norm": 0.21635789850987178, "learning_rate": 0.00019707337921123221, "loss": 0.5553, "step": 268 }, { "epoch": 0.16, "grad_norm": 0.23671151623321054, "learning_rate": 0.00019705076065512398, "loss": 0.4968, "step": 269 }, { "epoch": 0.16, "grad_norm": 0.25400871793456326, "learning_rate": 0.00019702805633881957, "loss": 0.5982, "step": 270 }, { "epoch": 0.16, "grad_norm": 0.2622810971314154, "learning_rate": 0.0001970052662823819, "loss": 0.5879, "step": 271 }, { "epoch": 0.16, "grad_norm": 0.22931814830456296, "learning_rate": 0.00019698239050594977, "loss": 0.5611, "step": 272 }, { "epoch": 0.16, "grad_norm": 0.213695866193263, "learning_rate": 0.0001969594290297376, "loss": 0.5386, "step": 273 }, { "epoch": 0.16, "grad_norm": 0.2431252328609808, "learning_rate": 0.00019693638187403563, "loss": 0.6039, "step": 274 }, { "epoch": 0.16, "grad_norm": 0.23108667253454973, "learning_rate": 0.00019691324905920984, "loss": 0.5579, "step": 275 }, { "epoch": 0.16, "grad_norm": 0.22718831415064272, "learning_rate": 0.0001968900306057018, "loss": 0.5196, "step": 276 }, { "epoch": 0.17, "grad_norm": 0.23632362967796033, "learning_rate": 0.0001968667265340288, "loss": 0.5336, "step": 277 }, { "epoch": 0.17, "grad_norm": 0.2421878973201691, "learning_rate": 0.00019684333686478383, "loss": 0.5928, "step": 278 }, { "epoch": 0.17, "grad_norm": 0.225775487602821, "learning_rate": 0.00019681986161863542, "loss": 0.552, "step": 279 }, { "epoch": 0.17, "grad_norm": 0.23037375759338816, "learning_rate": 0.00019679630081632782, "loss": 0.4983, "step": 280 }, { "epoch": 0.17, "grad_norm": 0.24684136612832333, "learning_rate": 0.00019677265447868086, "loss": 0.5655, "step": 281 }, { "epoch": 0.17, "grad_norm": 0.2412756534364674, "learning_rate": 0.0001967489226265899, "loss": 0.5063, "step": 282 }, { "epoch": 0.17, "grad_norm": 0.22005765622474396, "learning_rate": 0.00019672510528102597, "loss": 0.5188, "step": 283 }, { "epoch": 0.17, "grad_norm": 0.25071531725514384, "learning_rate": 0.0001967012024630355, "loss": 0.5938, "step": 284 }, { "epoch": 0.17, "grad_norm": 0.22139592405512468, "learning_rate": 0.00019667721419374065, "loss": 0.4917, "step": 285 }, { "epoch": 0.17, "grad_norm": 0.23067244251762076, "learning_rate": 0.00019665314049433888, "loss": 0.5584, "step": 286 }, { "epoch": 0.17, "grad_norm": 0.23829875535152545, "learning_rate": 0.00019662898138610323, "loss": 0.5264, "step": 287 }, { "epoch": 0.17, "grad_norm": 0.2641034663020514, "learning_rate": 0.00019660473689038228, "loss": 0.5805, "step": 288 }, { "epoch": 0.17, "grad_norm": 0.22321690487140503, "learning_rate": 0.00019658040702859997, "loss": 0.5529, "step": 289 }, { "epoch": 0.17, "grad_norm": 0.2502632163555589, "learning_rate": 0.00019655599182225565, "loss": 0.5347, "step": 290 }, { "epoch": 0.17, "grad_norm": 0.2392608020883604, "learning_rate": 0.00019653149129292426, "loss": 0.5263, "step": 291 }, { "epoch": 0.17, "grad_norm": 0.2539237490519494, "learning_rate": 0.00019650690546225592, "loss": 0.5156, "step": 292 }, { "epoch": 0.17, "grad_norm": 0.21964099103511592, "learning_rate": 0.00019648223435197627, "loss": 0.5101, "step": 293 }, { "epoch": 0.18, "grad_norm": 0.24992985700157416, "learning_rate": 0.00019645747798388628, "loss": 0.5621, "step": 294 }, { "epoch": 0.18, "grad_norm": 0.2559439615345381, "learning_rate": 0.0001964326363798622, "loss": 0.5753, "step": 295 }, { "epoch": 0.18, "grad_norm": 0.2504368010690795, "learning_rate": 0.00019640770956185567, "loss": 0.5558, "step": 296 }, { "epoch": 0.18, "grad_norm": 0.21022187251218089, "learning_rate": 0.0001963826975518936, "loss": 0.5322, "step": 297 }, { "epoch": 0.18, "grad_norm": 0.2422143856532352, "learning_rate": 0.00019635760037207817, "loss": 0.538, "step": 298 }, { "epoch": 0.18, "grad_norm": 0.23174554470584352, "learning_rate": 0.00019633241804458687, "loss": 0.5545, "step": 299 }, { "epoch": 0.18, "grad_norm": 0.267070426953347, "learning_rate": 0.00019630715059167238, "loss": 0.5632, "step": 300 }, { "epoch": 0.18, "grad_norm": 0.22256523603127878, "learning_rate": 0.0001962817980356626, "loss": 0.545, "step": 301 }, { "epoch": 0.18, "grad_norm": 0.24403300497950306, "learning_rate": 0.0001962563603989607, "loss": 0.5448, "step": 302 }, { "epoch": 0.18, "grad_norm": 0.2193103621019292, "learning_rate": 0.00019623083770404492, "loss": 0.5064, "step": 303 }, { "epoch": 0.18, "grad_norm": 0.23299992325798072, "learning_rate": 0.0001962052299734688, "loss": 0.5192, "step": 304 }, { "epoch": 0.18, "grad_norm": 0.2371054150083945, "learning_rate": 0.00019617953722986096, "loss": 0.5157, "step": 305 }, { "epoch": 0.18, "grad_norm": 0.2436064901189273, "learning_rate": 0.00019615375949592504, "loss": 0.5672, "step": 306 }, { "epoch": 0.18, "grad_norm": 0.25098365347678436, "learning_rate": 0.00019612789679443997, "loss": 0.5548, "step": 307 }, { "epoch": 0.18, "grad_norm": 0.2319425382974216, "learning_rate": 0.00019610194914825962, "loss": 0.5293, "step": 308 }, { "epoch": 0.18, "grad_norm": 0.24156576209403272, "learning_rate": 0.000196075916580313, "loss": 0.5672, "step": 309 }, { "epoch": 0.18, "grad_norm": 0.2337383575844323, "learning_rate": 0.0001960497991136041, "loss": 0.5509, "step": 310 }, { "epoch": 0.19, "grad_norm": 0.23799692988502053, "learning_rate": 0.00019602359677121199, "loss": 0.5604, "step": 311 }, { "epoch": 0.19, "grad_norm": 0.2296728275122706, "learning_rate": 0.0001959973095762907, "loss": 0.5371, "step": 312 }, { "epoch": 0.19, "grad_norm": 0.22381626870035518, "learning_rate": 0.00019597093755206936, "loss": 0.5465, "step": 313 }, { "epoch": 0.19, "grad_norm": 0.23335681761234933, "learning_rate": 0.00019594448072185182, "loss": 0.5386, "step": 314 }, { "epoch": 0.19, "grad_norm": 0.22582265649304345, "learning_rate": 0.00019591793910901707, "loss": 0.543, "step": 315 }, { "epoch": 0.19, "grad_norm": 0.2439330072441743, "learning_rate": 0.00019589131273701894, "loss": 0.5177, "step": 316 }, { "epoch": 0.19, "grad_norm": 0.2138593422237162, "learning_rate": 0.00019586460162938622, "loss": 0.5157, "step": 317 }, { "epoch": 0.19, "grad_norm": 0.24003613679646058, "learning_rate": 0.00019583780580972253, "loss": 0.5611, "step": 318 }, { "epoch": 0.19, "grad_norm": 0.2552582800734971, "learning_rate": 0.00019581092530170633, "loss": 0.5922, "step": 319 }, { "epoch": 0.19, "grad_norm": 0.21898423827197905, "learning_rate": 0.00019578396012909092, "loss": 0.5272, "step": 320 }, { "epoch": 0.19, "grad_norm": 0.22013525107478477, "learning_rate": 0.00019575691031570446, "loss": 0.5184, "step": 321 }, { "epoch": 0.19, "grad_norm": 0.21113180640163418, "learning_rate": 0.00019572977588544986, "loss": 0.5134, "step": 322 }, { "epoch": 0.19, "grad_norm": 0.22335860079943387, "learning_rate": 0.00019570255686230485, "loss": 0.5227, "step": 323 }, { "epoch": 0.19, "grad_norm": 0.23006684721287293, "learning_rate": 0.00019567525327032187, "loss": 0.5885, "step": 324 }, { "epoch": 0.19, "grad_norm": 0.21933564641390155, "learning_rate": 0.0001956478651336281, "loss": 0.5598, "step": 325 }, { "epoch": 0.19, "grad_norm": 0.21770749652400337, "learning_rate": 0.00019562039247642546, "loss": 0.5082, "step": 326 }, { "epoch": 0.2, "grad_norm": 0.22800344133653658, "learning_rate": 0.00019559283532299043, "loss": 0.5539, "step": 327 }, { "epoch": 0.2, "grad_norm": 0.2385574924163353, "learning_rate": 0.00019556519369767438, "loss": 0.5497, "step": 328 }, { "epoch": 0.2, "grad_norm": 0.23099538598172079, "learning_rate": 0.0001955374676249031, "loss": 0.5138, "step": 329 }, { "epoch": 0.2, "grad_norm": 0.21517217725478144, "learning_rate": 0.0001955096571291772, "loss": 0.5051, "step": 330 }, { "epoch": 0.2, "grad_norm": 0.21535771106277588, "learning_rate": 0.0001954817622350717, "loss": 0.524, "step": 331 }, { "epoch": 0.2, "grad_norm": 0.20361747402971658, "learning_rate": 0.00019545378296723635, "loss": 0.4989, "step": 332 }, { "epoch": 0.2, "grad_norm": 0.24644921068325687, "learning_rate": 0.0001954257193503954, "loss": 0.5927, "step": 333 }, { "epoch": 0.2, "grad_norm": 0.24765268362172385, "learning_rate": 0.0001953975714093476, "loss": 0.5451, "step": 334 }, { "epoch": 0.2, "grad_norm": 0.20846277824915477, "learning_rate": 0.00019536933916896633, "loss": 0.5259, "step": 335 }, { "epoch": 0.2, "grad_norm": 0.2457371199220107, "learning_rate": 0.00019534102265419932, "loss": 0.5784, "step": 336 }, { "epoch": 0.2, "grad_norm": 0.23029745387228598, "learning_rate": 0.00019531262189006882, "loss": 0.5918, "step": 337 }, { "epoch": 0.2, "grad_norm": 0.2387820151516941, "learning_rate": 0.0001952841369016716, "loss": 0.5576, "step": 338 }, { "epoch": 0.2, "grad_norm": 0.226451643448924, "learning_rate": 0.00019525556771417875, "loss": 0.5241, "step": 339 }, { "epoch": 0.2, "grad_norm": 0.22086691724075064, "learning_rate": 0.00019522691435283585, "loss": 0.5392, "step": 340 }, { "epoch": 0.2, "grad_norm": 0.2259720671796772, "learning_rate": 0.00019519817684296285, "loss": 0.516, "step": 341 }, { "epoch": 0.2, "grad_norm": 0.2244741513315317, "learning_rate": 0.00019516935520995393, "loss": 0.569, "step": 342 }, { "epoch": 0.2, "grad_norm": 0.23890602213836412, "learning_rate": 0.0001951404494792778, "loss": 0.524, "step": 343 }, { "epoch": 0.21, "grad_norm": 0.22136745892767679, "learning_rate": 0.00019511145967647737, "loss": 0.5472, "step": 344 }, { "epoch": 0.21, "grad_norm": 0.22275740066078306, "learning_rate": 0.00019508238582716984, "loss": 0.5553, "step": 345 }, { "epoch": 0.21, "grad_norm": 0.21225155652808625, "learning_rate": 0.00019505322795704676, "loss": 0.5302, "step": 346 }, { "epoch": 0.21, "grad_norm": 0.22704101844750724, "learning_rate": 0.0001950239860918738, "loss": 0.5485, "step": 347 }, { "epoch": 0.21, "grad_norm": 0.2135110250199134, "learning_rate": 0.00019499466025749097, "loss": 0.5343, "step": 348 }, { "epoch": 0.21, "grad_norm": 0.22772242632973722, "learning_rate": 0.00019496525047981242, "loss": 0.5159, "step": 349 }, { "epoch": 0.21, "grad_norm": 0.4444297049160113, "learning_rate": 0.00019493575678482649, "loss": 0.5121, "step": 350 }, { "epoch": 0.21, "grad_norm": 0.226632712040011, "learning_rate": 0.0001949061791985957, "loss": 0.5304, "step": 351 }, { "epoch": 0.21, "grad_norm": 0.22132303156586286, "learning_rate": 0.00019487651774725663, "loss": 0.4817, "step": 352 }, { "epoch": 0.21, "grad_norm": 0.23206581340772667, "learning_rate": 0.00019484677245702004, "loss": 0.5258, "step": 353 }, { "epoch": 0.21, "grad_norm": 0.2374903834541946, "learning_rate": 0.0001948169433541708, "loss": 0.5318, "step": 354 }, { "epoch": 0.21, "grad_norm": 0.22896458770920267, "learning_rate": 0.00019478703046506773, "loss": 0.4806, "step": 355 }, { "epoch": 0.21, "grad_norm": 0.21040301706147688, "learning_rate": 0.00019475703381614375, "loss": 0.5144, "step": 356 }, { "epoch": 0.21, "grad_norm": 0.21179618454444762, "learning_rate": 0.00019472695343390585, "loss": 0.524, "step": 357 }, { "epoch": 0.21, "grad_norm": 0.20436614333218908, "learning_rate": 0.00019469678934493488, "loss": 0.501, "step": 358 }, { "epoch": 0.21, "grad_norm": 0.2478256130980173, "learning_rate": 0.0001946665415758858, "loss": 0.5386, "step": 359 }, { "epoch": 0.21, "grad_norm": 0.226116084636948, "learning_rate": 0.00019463621015348748, "loss": 0.5101, "step": 360 }, { "epoch": 0.22, "grad_norm": 0.21838947264457534, "learning_rate": 0.00019460579510454263, "loss": 0.5296, "step": 361 }, { "epoch": 0.22, "grad_norm": 0.2152879498375444, "learning_rate": 0.00019457529645592792, "loss": 0.512, "step": 362 }, { "epoch": 0.22, "grad_norm": 0.22514971802642378, "learning_rate": 0.00019454471423459389, "loss": 0.5593, "step": 363 }, { "epoch": 0.22, "grad_norm": 0.23402386101532432, "learning_rate": 0.00019451404846756494, "loss": 0.555, "step": 364 }, { "epoch": 0.22, "grad_norm": 0.2244514913016572, "learning_rate": 0.00019448329918193927, "loss": 0.5689, "step": 365 }, { "epoch": 0.22, "grad_norm": 0.22260707231596893, "learning_rate": 0.00019445246640488893, "loss": 0.6062, "step": 366 }, { "epoch": 0.22, "grad_norm": 0.21791090145253736, "learning_rate": 0.00019442155016365965, "loss": 0.531, "step": 367 }, { "epoch": 0.22, "grad_norm": 0.21895889257567258, "learning_rate": 0.00019439055048557101, "loss": 0.5538, "step": 368 }, { "epoch": 0.22, "grad_norm": 0.21306696799872818, "learning_rate": 0.00019435946739801633, "loss": 0.5673, "step": 369 }, { "epoch": 0.22, "grad_norm": 0.2294607768810707, "learning_rate": 0.00019432830092846253, "loss": 0.5855, "step": 370 }, { "epoch": 0.22, "grad_norm": 0.22758409665267085, "learning_rate": 0.0001942970511044503, "loss": 0.5783, "step": 371 }, { "epoch": 0.22, "grad_norm": 0.21334100614200935, "learning_rate": 0.00019426571795359398, "loss": 0.5056, "step": 372 }, { "epoch": 0.22, "grad_norm": 0.24187624093875965, "learning_rate": 0.0001942343015035815, "loss": 0.543, "step": 373 }, { "epoch": 0.22, "grad_norm": 0.2275714845035408, "learning_rate": 0.00019420280178217443, "loss": 0.5329, "step": 374 }, { "epoch": 0.22, "grad_norm": 0.23237641477505608, "learning_rate": 0.00019417121881720793, "loss": 0.5134, "step": 375 }, { "epoch": 0.22, "grad_norm": 0.25196886008416386, "learning_rate": 0.0001941395526365907, "loss": 0.6023, "step": 376 }, { "epoch": 0.22, "grad_norm": 0.22418514390796682, "learning_rate": 0.00019410780326830498, "loss": 0.5529, "step": 377 }, { "epoch": 0.23, "grad_norm": 0.21438856736265666, "learning_rate": 0.0001940759707404065, "loss": 0.5134, "step": 378 }, { "epoch": 0.23, "grad_norm": 0.2331754234870151, "learning_rate": 0.00019404405508102455, "loss": 0.5406, "step": 379 }, { "epoch": 0.23, "grad_norm": 0.24908239322819828, "learning_rate": 0.00019401205631836178, "loss": 0.5377, "step": 380 }, { "epoch": 0.23, "grad_norm": 0.21332745391417657, "learning_rate": 0.00019397997448069435, "loss": 0.5025, "step": 381 }, { "epoch": 0.23, "grad_norm": 0.20749658696001225, "learning_rate": 0.00019394780959637177, "loss": 0.5257, "step": 382 }, { "epoch": 0.23, "grad_norm": 0.2237716482529178, "learning_rate": 0.000193915561693817, "loss": 0.5, "step": 383 }, { "epoch": 0.23, "grad_norm": 0.25234282015654147, "learning_rate": 0.00019388323080152633, "loss": 0.5753, "step": 384 }, { "epoch": 0.23, "grad_norm": 0.2252939326339829, "learning_rate": 0.00019385081694806936, "loss": 0.5662, "step": 385 }, { "epoch": 0.23, "grad_norm": 0.21979629294660186, "learning_rate": 0.00019381832016208904, "loss": 0.5141, "step": 386 }, { "epoch": 0.23, "grad_norm": 0.24762535901866153, "learning_rate": 0.0001937857404723016, "loss": 0.6193, "step": 387 }, { "epoch": 0.23, "grad_norm": 0.25032044234085526, "learning_rate": 0.00019375307790749647, "loss": 0.5024, "step": 388 }, { "epoch": 0.23, "grad_norm": 0.22892425302508923, "learning_rate": 0.0001937203324965364, "loss": 0.5401, "step": 389 }, { "epoch": 0.23, "grad_norm": 0.2461599771002527, "learning_rate": 0.0001936875042683573, "loss": 0.5301, "step": 390 }, { "epoch": 0.23, "grad_norm": 0.22363255721865732, "learning_rate": 0.00019365459325196825, "loss": 0.5538, "step": 391 }, { "epoch": 0.23, "grad_norm": 0.22482667580972365, "learning_rate": 0.00019362159947645152, "loss": 0.4928, "step": 392 }, { "epoch": 0.23, "grad_norm": 0.22869596173751142, "learning_rate": 0.00019358852297096253, "loss": 0.5546, "step": 393 }, { "epoch": 0.24, "grad_norm": 0.2274546469780496, "learning_rate": 0.00019355536376472972, "loss": 0.5763, "step": 394 }, { "epoch": 0.24, "grad_norm": 0.21284874650406885, "learning_rate": 0.0001935221218870547, "loss": 0.5778, "step": 395 }, { "epoch": 0.24, "grad_norm": 0.23158847478661296, "learning_rate": 0.0001934887973673121, "loss": 0.5654, "step": 396 }, { "epoch": 0.24, "grad_norm": 0.24510006704514478, "learning_rate": 0.0001934553902349496, "loss": 0.5053, "step": 397 }, { "epoch": 0.24, "grad_norm": 0.20330878586204656, "learning_rate": 0.00019342190051948777, "loss": 0.5171, "step": 398 }, { "epoch": 0.24, "grad_norm": 0.2131804710318274, "learning_rate": 0.0001933883282505203, "loss": 0.5286, "step": 399 }, { "epoch": 0.24, "grad_norm": 0.23297933515492006, "learning_rate": 0.00019335467345771377, "loss": 0.5593, "step": 400 }, { "epoch": 0.24, "grad_norm": 0.24611434220143105, "learning_rate": 0.0001933209361708077, "loss": 0.604, "step": 401 }, { "epoch": 0.24, "grad_norm": 0.23281321736249425, "learning_rate": 0.00019328711641961445, "loss": 0.5579, "step": 402 }, { "epoch": 0.24, "grad_norm": 0.21399822113981087, "learning_rate": 0.00019325321423401933, "loss": 0.5661, "step": 403 }, { "epoch": 0.24, "grad_norm": 0.22113107520723113, "learning_rate": 0.00019321922964398046, "loss": 0.5789, "step": 404 }, { "epoch": 0.24, "grad_norm": 0.23262880002546846, "learning_rate": 0.00019318516267952874, "loss": 0.5447, "step": 405 }, { "epoch": 0.24, "grad_norm": 0.24962941770082592, "learning_rate": 0.00019315101337076792, "loss": 0.5512, "step": 406 }, { "epoch": 0.24, "grad_norm": 0.22210049422713798, "learning_rate": 0.0001931167817478745, "loss": 0.5427, "step": 407 }, { "epoch": 0.24, "grad_norm": 0.22647809883332484, "learning_rate": 0.0001930824678410977, "loss": 0.4888, "step": 408 }, { "epoch": 0.24, "grad_norm": 0.23660763255678552, "learning_rate": 0.00019304807168075944, "loss": 0.5755, "step": 409 }, { "epoch": 0.24, "grad_norm": 0.2354103448271752, "learning_rate": 0.00019301359329725436, "loss": 0.5265, "step": 410 }, { "epoch": 0.25, "grad_norm": 0.24322261128085423, "learning_rate": 0.00019297903272104977, "loss": 0.5291, "step": 411 }, { "epoch": 0.25, "grad_norm": 0.20525199182278092, "learning_rate": 0.00019294438998268554, "loss": 0.4996, "step": 412 }, { "epoch": 0.25, "grad_norm": 0.24678535182755174, "learning_rate": 0.00019290966511277422, "loss": 0.567, "step": 413 }, { "epoch": 0.25, "grad_norm": 0.22165331172413838, "learning_rate": 0.00019287485814200087, "loss": 0.5348, "step": 414 }, { "epoch": 0.25, "grad_norm": 0.24541020782476444, "learning_rate": 0.00019283996910112318, "loss": 0.5432, "step": 415 }, { "epoch": 0.25, "grad_norm": 0.2255959168063083, "learning_rate": 0.00019280499802097126, "loss": 0.5891, "step": 416 }, { "epoch": 0.25, "grad_norm": 0.21159018099714821, "learning_rate": 0.0001927699449324478, "loss": 0.5003, "step": 417 }, { "epoch": 0.25, "grad_norm": 0.21379995902020923, "learning_rate": 0.00019273480986652794, "loss": 0.5314, "step": 418 }, { "epoch": 0.25, "grad_norm": 0.2853169518220406, "learning_rate": 0.0001926995928542592, "loss": 0.6108, "step": 419 }, { "epoch": 0.25, "grad_norm": 0.22738285867292138, "learning_rate": 0.00019266429392676164, "loss": 0.5217, "step": 420 }, { "epoch": 0.25, "grad_norm": 0.23835369502554374, "learning_rate": 0.00019262891311522755, "loss": 0.5318, "step": 421 }, { "epoch": 0.25, "grad_norm": 0.20671557324330114, "learning_rate": 0.0001925934504509217, "loss": 0.5234, "step": 422 }, { "epoch": 0.25, "grad_norm": 0.205212164360302, "learning_rate": 0.00019255790596518112, "loss": 0.5023, "step": 423 }, { "epoch": 0.25, "grad_norm": 0.21664090577036152, "learning_rate": 0.00019252227968941522, "loss": 0.5452, "step": 424 }, { "epoch": 0.25, "grad_norm": 0.22146041084684798, "learning_rate": 0.00019248657165510556, "loss": 0.5474, "step": 425 }, { "epoch": 0.25, "grad_norm": 0.2338997589574809, "learning_rate": 0.00019245078189380604, "loss": 0.5516, "step": 426 }, { "epoch": 0.25, "grad_norm": 0.2313978280927526, "learning_rate": 0.0001924149104371428, "loss": 0.5831, "step": 427 }, { "epoch": 0.26, "grad_norm": 0.2098577112814155, "learning_rate": 0.00019237895731681408, "loss": 0.5452, "step": 428 }, { "epoch": 0.26, "grad_norm": 0.26497439164374026, "learning_rate": 0.0001923429225645904, "loss": 0.5666, "step": 429 }, { "epoch": 0.26, "grad_norm": 0.21859970576834997, "learning_rate": 0.00019230680621231425, "loss": 0.5069, "step": 430 }, { "epoch": 0.26, "grad_norm": 0.20509380886351694, "learning_rate": 0.0001922706082919004, "loss": 0.4573, "step": 431 }, { "epoch": 0.26, "grad_norm": 0.2182328366507935, "learning_rate": 0.0001922343288353356, "loss": 0.6133, "step": 432 }, { "epoch": 0.26, "grad_norm": 0.2822350271273954, "learning_rate": 0.00019219796787467867, "loss": 0.5709, "step": 433 }, { "epoch": 0.26, "grad_norm": 0.24487543268473794, "learning_rate": 0.00019216152544206049, "loss": 0.546, "step": 434 }, { "epoch": 0.26, "grad_norm": 0.24221176090281485, "learning_rate": 0.00019212500156968383, "loss": 0.5507, "step": 435 }, { "epoch": 0.26, "grad_norm": 0.22053929296251015, "learning_rate": 0.00019208839628982358, "loss": 0.5473, "step": 436 }, { "epoch": 0.26, "grad_norm": 0.22975415570737245, "learning_rate": 0.00019205170963482643, "loss": 0.5181, "step": 437 }, { "epoch": 0.26, "grad_norm": 0.22969105575505203, "learning_rate": 0.00019201494163711104, "loss": 0.5463, "step": 438 }, { "epoch": 0.26, "grad_norm": 0.23764087103158363, "learning_rate": 0.00019197809232916795, "loss": 0.55, "step": 439 }, { "epoch": 0.26, "grad_norm": 0.21997498488474826, "learning_rate": 0.00019194116174355954, "loss": 0.5421, "step": 440 }, { "epoch": 0.26, "grad_norm": 0.22225824990596896, "learning_rate": 0.00019190414991291998, "loss": 0.5439, "step": 441 }, { "epoch": 0.26, "grad_norm": 0.243391488050543, "learning_rate": 0.00019186705686995533, "loss": 0.6289, "step": 442 }, { "epoch": 0.26, "grad_norm": 0.222494273038652, "learning_rate": 0.0001918298826474433, "loss": 0.5088, "step": 443 }, { "epoch": 0.26, "grad_norm": 0.22114450997419682, "learning_rate": 0.0001917926272782334, "loss": 0.5624, "step": 444 }, { "epoch": 0.27, "grad_norm": 0.21964760504534894, "learning_rate": 0.00019175529079524687, "loss": 0.5289, "step": 445 }, { "epoch": 0.27, "grad_norm": 0.3042847973140014, "learning_rate": 0.00019171787323147654, "loss": 0.5328, "step": 446 }, { "epoch": 0.27, "grad_norm": 0.22425571202210934, "learning_rate": 0.00019168037461998695, "loss": 0.5699, "step": 447 }, { "epoch": 0.27, "grad_norm": 0.23406959191320909, "learning_rate": 0.00019164279499391427, "loss": 0.5147, "step": 448 }, { "epoch": 0.27, "grad_norm": 0.3604500123158513, "learning_rate": 0.00019160513438646617, "loss": 0.5697, "step": 449 }, { "epoch": 0.27, "grad_norm": 0.2501436029131694, "learning_rate": 0.00019156739283092205, "loss": 0.6015, "step": 450 }, { "epoch": 0.27, "grad_norm": 0.21928141490521824, "learning_rate": 0.00019152957036063265, "loss": 0.5111, "step": 451 }, { "epoch": 0.27, "grad_norm": 0.257908225365161, "learning_rate": 0.00019149166700902032, "loss": 0.5132, "step": 452 }, { "epoch": 0.27, "grad_norm": 0.2713678867101362, "learning_rate": 0.0001914536828095789, "loss": 0.5995, "step": 453 }, { "epoch": 0.27, "grad_norm": 0.2398794022246256, "learning_rate": 0.0001914156177958736, "loss": 0.4993, "step": 454 }, { "epoch": 0.27, "grad_norm": 0.2373981477389832, "learning_rate": 0.0001913774720015411, "loss": 0.5064, "step": 455 }, { "epoch": 0.27, "grad_norm": 0.2188011093608266, "learning_rate": 0.00019133924546028942, "loss": 0.5606, "step": 456 }, { "epoch": 0.27, "grad_norm": 0.24077263566935142, "learning_rate": 0.00019130093820589791, "loss": 0.5606, "step": 457 }, { "epoch": 0.27, "grad_norm": 0.23519919814487683, "learning_rate": 0.00019126255027221735, "loss": 0.5307, "step": 458 }, { "epoch": 0.27, "grad_norm": 0.21480730775028578, "learning_rate": 0.00019122408169316976, "loss": 0.526, "step": 459 }, { "epoch": 0.27, "grad_norm": 0.2161668548042441, "learning_rate": 0.00019118553250274832, "loss": 0.5657, "step": 460 }, { "epoch": 0.28, "grad_norm": 0.22318400428439122, "learning_rate": 0.00019114690273501765, "loss": 0.513, "step": 461 }, { "epoch": 0.28, "grad_norm": 0.22252447744680176, "learning_rate": 0.00019110819242411337, "loss": 0.5247, "step": 462 }, { "epoch": 0.28, "grad_norm": 0.21358818358042153, "learning_rate": 0.00019106940160424244, "loss": 0.556, "step": 463 }, { "epoch": 0.28, "grad_norm": 0.2121229259271081, "learning_rate": 0.0001910305303096828, "loss": 0.5138, "step": 464 }, { "epoch": 0.28, "grad_norm": 0.22636146624511622, "learning_rate": 0.0001909915785747836, "loss": 0.5111, "step": 465 }, { "epoch": 0.28, "grad_norm": 0.20571954917028099, "learning_rate": 0.00019095254643396512, "loss": 0.5125, "step": 466 }, { "epoch": 0.28, "grad_norm": 0.21968966730793454, "learning_rate": 0.0001909134339217186, "loss": 0.5358, "step": 467 }, { "epoch": 0.28, "grad_norm": 0.21910723327372644, "learning_rate": 0.00019087424107260627, "loss": 0.5382, "step": 468 }, { "epoch": 0.28, "grad_norm": 0.2153841373499183, "learning_rate": 0.00019083496792126153, "loss": 0.5375, "step": 469 }, { "epoch": 0.28, "grad_norm": 0.23479205084160673, "learning_rate": 0.00019079561450238854, "loss": 0.5984, "step": 470 }, { "epoch": 0.28, "grad_norm": 0.21595571362737268, "learning_rate": 0.00019075618085076247, "loss": 0.5417, "step": 471 }, { "epoch": 0.28, "grad_norm": 0.24550770571804625, "learning_rate": 0.00019071666700122946, "loss": 0.5306, "step": 472 }, { "epoch": 0.28, "grad_norm": 0.21802243564456578, "learning_rate": 0.00019067707298870638, "loss": 0.5157, "step": 473 }, { "epoch": 0.28, "grad_norm": 0.2068796190094572, "learning_rate": 0.00019063739884818103, "loss": 0.5254, "step": 474 }, { "epoch": 0.28, "grad_norm": 0.24034732867281272, "learning_rate": 0.000190597644614712, "loss": 0.6204, "step": 475 }, { "epoch": 0.28, "grad_norm": 0.2260836607650634, "learning_rate": 0.00019055781032342864, "loss": 0.5492, "step": 476 }, { "epoch": 0.28, "grad_norm": 0.2476351525598878, "learning_rate": 0.00019051789600953102, "loss": 0.5157, "step": 477 }, { "epoch": 0.29, "grad_norm": 0.2280151093681579, "learning_rate": 0.00019047790170829003, "loss": 0.4984, "step": 478 }, { "epoch": 0.29, "grad_norm": 0.2217333524292061, "learning_rate": 0.00019043782745504711, "loss": 0.5149, "step": 479 }, { "epoch": 0.29, "grad_norm": 0.2356369467654302, "learning_rate": 0.00019039767328521442, "loss": 0.5724, "step": 480 }, { "epoch": 0.29, "grad_norm": 0.21541809863677616, "learning_rate": 0.0001903574392342747, "loss": 0.5138, "step": 481 }, { "epoch": 0.29, "grad_norm": 0.21722431891543054, "learning_rate": 0.00019031712533778137, "loss": 0.5536, "step": 482 }, { "epoch": 0.29, "grad_norm": 0.2370708268417489, "learning_rate": 0.00019027673163135827, "loss": 0.5038, "step": 483 }, { "epoch": 0.29, "grad_norm": 0.22809310323516838, "learning_rate": 0.00019023625815069989, "loss": 0.5713, "step": 484 }, { "epoch": 0.29, "grad_norm": 0.22374988575329294, "learning_rate": 0.00019019570493157114, "loss": 0.5549, "step": 485 }, { "epoch": 0.29, "grad_norm": 0.20510711707245072, "learning_rate": 0.0001901550720098074, "loss": 0.46, "step": 486 }, { "epoch": 0.29, "grad_norm": 0.2621551195786783, "learning_rate": 0.00019011435942131448, "loss": 0.5546, "step": 487 }, { "epoch": 0.29, "grad_norm": 0.20503054358781417, "learning_rate": 0.00019007356720206865, "loss": 0.5547, "step": 488 }, { "epoch": 0.29, "grad_norm": 0.23586140447856616, "learning_rate": 0.00019003269538811647, "loss": 0.6075, "step": 489 }, { "epoch": 0.29, "grad_norm": 0.2828040872125889, "learning_rate": 0.00018999174401557488, "loss": 0.602, "step": 490 }, { "epoch": 0.29, "grad_norm": 0.2023429982220119, "learning_rate": 0.00018995071312063105, "loss": 0.4975, "step": 491 }, { "epoch": 0.29, "grad_norm": 0.2054777673202953, "learning_rate": 0.00018990960273954254, "loss": 0.5295, "step": 492 }, { "epoch": 0.29, "grad_norm": 0.1982185225446849, "learning_rate": 0.00018986841290863704, "loss": 0.5461, "step": 493 }, { "epoch": 0.29, "grad_norm": 0.23248022218099268, "learning_rate": 0.0001898271436643125, "loss": 0.5924, "step": 494 }, { "epoch": 0.3, "grad_norm": 0.2235279893303581, "learning_rate": 0.00018978579504303706, "loss": 0.5598, "step": 495 }, { "epoch": 0.3, "grad_norm": 0.21675084465821123, "learning_rate": 0.000189744367081349, "loss": 0.5012, "step": 496 }, { "epoch": 0.3, "grad_norm": 0.2041881848681654, "learning_rate": 0.00018970285981585662, "loss": 0.526, "step": 497 }, { "epoch": 0.3, "grad_norm": 0.23258761727278376, "learning_rate": 0.00018966127328323842, "loss": 0.553, "step": 498 }, { "epoch": 0.3, "grad_norm": 0.23066266735191, "learning_rate": 0.00018961960752024288, "loss": 0.5506, "step": 499 }, { "epoch": 0.3, "grad_norm": 0.20634958879584178, "learning_rate": 0.0001895778625636885, "loss": 0.5006, "step": 500 }, { "epoch": 0.3, "grad_norm": 0.21082421656186934, "learning_rate": 0.00018953603845046378, "loss": 0.5279, "step": 501 }, { "epoch": 0.3, "grad_norm": 0.2057560041730304, "learning_rate": 0.00018949413521752713, "loss": 0.5598, "step": 502 }, { "epoch": 0.3, "grad_norm": 0.2096114347066206, "learning_rate": 0.00018945215290190693, "loss": 0.5113, "step": 503 }, { "epoch": 0.3, "grad_norm": 0.23218477255443984, "learning_rate": 0.00018941009154070136, "loss": 0.5169, "step": 504 }, { "epoch": 0.3, "grad_norm": 0.20857717653678057, "learning_rate": 0.00018936795117107855, "loss": 0.5149, "step": 505 }, { "epoch": 0.3, "grad_norm": 0.24006448825761761, "learning_rate": 0.0001893257318302764, "loss": 0.5228, "step": 506 }, { "epoch": 0.3, "grad_norm": 0.2146671098435255, "learning_rate": 0.00018928343355560258, "loss": 0.5257, "step": 507 }, { "epoch": 0.3, "grad_norm": 0.20608859556559073, "learning_rate": 0.00018924105638443452, "loss": 0.527, "step": 508 }, { "epoch": 0.3, "grad_norm": 0.2336814919363686, "learning_rate": 0.0001891986003542194, "loss": 0.5461, "step": 509 }, { "epoch": 0.3, "grad_norm": 0.2409130946928026, "learning_rate": 0.00018915606550247397, "loss": 0.5493, "step": 510 }, { "epoch": 0.3, "grad_norm": 0.21371348825911873, "learning_rate": 0.0001891134518667848, "loss": 0.572, "step": 511 }, { "epoch": 0.31, "grad_norm": 0.2014364828041311, "learning_rate": 0.000189070759484808, "loss": 0.5109, "step": 512 }, { "epoch": 0.31, "grad_norm": 0.2290945612613713, "learning_rate": 0.0001890279883942692, "loss": 0.5493, "step": 513 }, { "epoch": 0.31, "grad_norm": 0.22127732127756986, "learning_rate": 0.0001889851386329637, "loss": 0.5387, "step": 514 }, { "epoch": 0.31, "grad_norm": 0.20564079598559082, "learning_rate": 0.00018894221023875622, "loss": 0.5192, "step": 515 }, { "epoch": 0.31, "grad_norm": 0.213993086214796, "learning_rate": 0.00018889920324958106, "loss": 0.5044, "step": 516 }, { "epoch": 0.31, "grad_norm": 0.21506249939577854, "learning_rate": 0.00018885611770344185, "loss": 0.4969, "step": 517 }, { "epoch": 0.31, "grad_norm": 0.22792164808811663, "learning_rate": 0.00018881295363841174, "loss": 0.5564, "step": 518 }, { "epoch": 0.31, "grad_norm": 0.1978731923118128, "learning_rate": 0.00018876971109263324, "loss": 0.4898, "step": 519 }, { "epoch": 0.31, "grad_norm": 0.22394451521984352, "learning_rate": 0.00018872639010431822, "loss": 0.5586, "step": 520 }, { "epoch": 0.31, "grad_norm": 0.20009625678598073, "learning_rate": 0.0001886829907117478, "loss": 0.5399, "step": 521 }, { "epoch": 0.31, "grad_norm": 0.20448355507434923, "learning_rate": 0.00018863951295327244, "loss": 0.5263, "step": 522 }, { "epoch": 0.31, "grad_norm": 0.1967777231547204, "learning_rate": 0.00018859595686731187, "loss": 0.4904, "step": 523 }, { "epoch": 0.31, "grad_norm": 0.2052388343929957, "learning_rate": 0.00018855232249235498, "loss": 0.4951, "step": 524 }, { "epoch": 0.31, "grad_norm": 0.1970956590240829, "learning_rate": 0.00018850860986695985, "loss": 0.5112, "step": 525 }, { "epoch": 0.31, "grad_norm": 0.2102143499682878, "learning_rate": 0.00018846481902975377, "loss": 0.5234, "step": 526 }, { "epoch": 0.31, "grad_norm": 0.23384214794287286, "learning_rate": 0.00018842095001943306, "loss": 0.5387, "step": 527 }, { "epoch": 0.32, "grad_norm": 0.20133953340775343, "learning_rate": 0.00018837700287476316, "loss": 0.4995, "step": 528 }, { "epoch": 0.32, "grad_norm": 0.2238467486071384, "learning_rate": 0.00018833297763457858, "loss": 0.5709, "step": 529 }, { "epoch": 0.32, "grad_norm": 0.26170161234282546, "learning_rate": 0.00018828887433778278, "loss": 0.6314, "step": 530 }, { "epoch": 0.32, "grad_norm": 0.2317819906199683, "learning_rate": 0.00018824469302334822, "loss": 0.5333, "step": 531 }, { "epoch": 0.32, "grad_norm": 0.21538390925414544, "learning_rate": 0.0001882004337303163, "loss": 0.5603, "step": 532 }, { "epoch": 0.32, "grad_norm": 0.23053571801246284, "learning_rate": 0.0001881560964977974, "loss": 0.593, "step": 533 }, { "epoch": 0.32, "grad_norm": 0.21173642276584706, "learning_rate": 0.0001881116813649706, "loss": 0.5539, "step": 534 }, { "epoch": 0.32, "grad_norm": 0.24587290888576793, "learning_rate": 0.00018806718837108402, "loss": 0.5408, "step": 535 }, { "epoch": 0.32, "grad_norm": 0.22324082101473863, "learning_rate": 0.00018802261755545443, "loss": 0.5857, "step": 536 }, { "epoch": 0.32, "grad_norm": 0.21827653101692504, "learning_rate": 0.0001879779689574674, "loss": 0.5451, "step": 537 }, { "epoch": 0.32, "grad_norm": 0.2146222856243753, "learning_rate": 0.00018793324261657737, "loss": 0.5007, "step": 538 }, { "epoch": 0.32, "grad_norm": 0.20994383183759666, "learning_rate": 0.00018788843857230726, "loss": 0.5039, "step": 539 }, { "epoch": 0.32, "grad_norm": 0.23384168426304514, "learning_rate": 0.00018784355686424876, "loss": 0.5329, "step": 540 }, { "epoch": 0.32, "grad_norm": 0.20284382518697272, "learning_rate": 0.00018779859753206225, "loss": 0.5383, "step": 541 }, { "epoch": 0.32, "grad_norm": 0.22307014132513725, "learning_rate": 0.00018775356061547662, "loss": 0.5766, "step": 542 }, { "epoch": 0.32, "grad_norm": 0.21675879523474215, "learning_rate": 0.00018770844615428932, "loss": 0.4994, "step": 543 }, { "epoch": 0.32, "grad_norm": 0.2200785983728407, "learning_rate": 0.00018766325418836637, "loss": 0.5615, "step": 544 }, { "epoch": 0.33, "grad_norm": 0.20895654400479502, "learning_rate": 0.00018761798475764224, "loss": 0.4993, "step": 545 }, { "epoch": 0.33, "grad_norm": 0.22152937631276676, "learning_rate": 0.00018757263790211988, "loss": 0.5275, "step": 546 }, { "epoch": 0.33, "grad_norm": 0.209333487906431, "learning_rate": 0.0001875272136618706, "loss": 0.4911, "step": 547 }, { "epoch": 0.33, "grad_norm": 0.2123519912763275, "learning_rate": 0.00018748171207703417, "loss": 0.5662, "step": 548 }, { "epoch": 0.33, "grad_norm": 0.2147346642469028, "learning_rate": 0.00018743613318781868, "loss": 0.5651, "step": 549 }, { "epoch": 0.33, "grad_norm": 0.2017789732342509, "learning_rate": 0.00018739047703450048, "loss": 0.5573, "step": 550 }, { "epoch": 0.33, "grad_norm": 0.2084087089737107, "learning_rate": 0.00018734474365742428, "loss": 0.562, "step": 551 }, { "epoch": 0.33, "grad_norm": 0.22130968599178, "learning_rate": 0.00018729893309700295, "loss": 0.5729, "step": 552 }, { "epoch": 0.33, "grad_norm": 0.22736172090948445, "learning_rate": 0.0001872530453937176, "loss": 0.5548, "step": 553 }, { "epoch": 0.33, "grad_norm": 0.21738577850339916, "learning_rate": 0.0001872070805881176, "loss": 0.5191, "step": 554 }, { "epoch": 0.33, "grad_norm": 0.20994273135857797, "learning_rate": 0.00018716103872082026, "loss": 0.5153, "step": 555 }, { "epoch": 0.33, "grad_norm": 0.25944295362906805, "learning_rate": 0.00018711491983251113, "loss": 0.5471, "step": 556 }, { "epoch": 0.33, "grad_norm": 0.2138519097360962, "learning_rate": 0.00018706872396394376, "loss": 0.4875, "step": 557 }, { "epoch": 0.33, "grad_norm": 0.23586915663527888, "learning_rate": 0.00018702245115593974, "loss": 0.5224, "step": 558 }, { "epoch": 0.33, "grad_norm": 0.20477148046499385, "learning_rate": 0.0001869761014493887, "loss": 0.5466, "step": 559 }, { "epoch": 0.33, "grad_norm": 0.21783175505387284, "learning_rate": 0.00018692967488524812, "loss": 0.5557, "step": 560 }, { "epoch": 0.33, "grad_norm": 0.20442177589984145, "learning_rate": 0.0001868831715045435, "loss": 0.507, "step": 561 }, { "epoch": 0.34, "grad_norm": 0.21291324212369495, "learning_rate": 0.00018683659134836813, "loss": 0.5779, "step": 562 }, { "epoch": 0.34, "grad_norm": 0.22670486875141618, "learning_rate": 0.00018678993445788323, "loss": 0.5831, "step": 563 }, { "epoch": 0.34, "grad_norm": 0.2431493116309222, "learning_rate": 0.00018674320087431768, "loss": 0.5389, "step": 564 }, { "epoch": 0.34, "grad_norm": 0.22102091260855142, "learning_rate": 0.00018669639063896836, "loss": 0.5569, "step": 565 }, { "epoch": 0.34, "grad_norm": 0.20001951850669827, "learning_rate": 0.0001866495037931997, "loss": 0.486, "step": 566 }, { "epoch": 0.34, "grad_norm": 0.22781103196427857, "learning_rate": 0.00018660254037844388, "loss": 0.4973, "step": 567 }, { "epoch": 0.34, "grad_norm": 0.21129685691062433, "learning_rate": 0.00018655550043620073, "loss": 0.5459, "step": 568 }, { "epoch": 0.34, "grad_norm": 0.20363805081315986, "learning_rate": 0.0001865083840080378, "loss": 0.4997, "step": 569 }, { "epoch": 0.34, "grad_norm": 0.22269838654252982, "learning_rate": 0.00018646119113559006, "loss": 0.5406, "step": 570 }, { "epoch": 0.34, "grad_norm": 0.20307002281681275, "learning_rate": 0.00018641392186056016, "loss": 0.4861, "step": 571 }, { "epoch": 0.34, "grad_norm": 0.20146261628709675, "learning_rate": 0.0001863665762247182, "loss": 0.561, "step": 572 }, { "epoch": 0.34, "grad_norm": 0.21049257054009352, "learning_rate": 0.00018631915426990184, "loss": 0.5257, "step": 573 }, { "epoch": 0.34, "grad_norm": 0.2245482792823418, "learning_rate": 0.00018627165603801605, "loss": 0.5441, "step": 574 }, { "epoch": 0.34, "grad_norm": 0.2106578436256788, "learning_rate": 0.0001862240815710333, "loss": 0.5125, "step": 575 }, { "epoch": 0.34, "grad_norm": 0.2091435884054145, "learning_rate": 0.0001861764309109934, "loss": 0.523, "step": 576 }, { "epoch": 0.34, "grad_norm": 0.21256854318600532, "learning_rate": 0.00018612870410000354, "loss": 0.4851, "step": 577 }, { "epoch": 0.34, "grad_norm": 0.24387962798982954, "learning_rate": 0.00018608090118023808, "loss": 0.5423, "step": 578 }, { "epoch": 0.35, "grad_norm": 0.2357478920855788, "learning_rate": 0.00018603302219393874, "loss": 0.5386, "step": 579 }, { "epoch": 0.35, "grad_norm": 0.21267780857117077, "learning_rate": 0.0001859850671834144, "loss": 0.5545, "step": 580 }, { "epoch": 0.35, "grad_norm": 0.25049614581715324, "learning_rate": 0.0001859370361910412, "loss": 0.5241, "step": 581 }, { "epoch": 0.35, "grad_norm": 0.1937807494598699, "learning_rate": 0.00018588892925926228, "loss": 0.5533, "step": 582 }, { "epoch": 0.35, "grad_norm": 0.21209972240968475, "learning_rate": 0.00018584074643058807, "loss": 0.538, "step": 583 }, { "epoch": 0.35, "grad_norm": 0.22281277082523665, "learning_rate": 0.00018579248774759586, "loss": 0.5456, "step": 584 }, { "epoch": 0.35, "grad_norm": 0.22156542955128883, "learning_rate": 0.00018574415325293018, "loss": 0.5622, "step": 585 }, { "epoch": 0.35, "grad_norm": 0.20068342929250654, "learning_rate": 0.00018569574298930237, "loss": 0.5372, "step": 586 }, { "epoch": 0.35, "grad_norm": 0.21693418845369525, "learning_rate": 0.00018564725699949083, "loss": 0.4874, "step": 587 }, { "epoch": 0.35, "grad_norm": 0.2060622909003744, "learning_rate": 0.0001855986953263409, "loss": 0.5331, "step": 588 }, { "epoch": 0.35, "grad_norm": 0.20007419545283933, "learning_rate": 0.00018555005801276463, "loss": 0.5131, "step": 589 }, { "epoch": 0.35, "grad_norm": 0.21905328017125653, "learning_rate": 0.00018550134510174115, "loss": 0.5572, "step": 590 }, { "epoch": 0.35, "grad_norm": 0.21213287568506015, "learning_rate": 0.0001854525566363162, "loss": 0.5359, "step": 591 }, { "epoch": 0.35, "grad_norm": 0.20066093050756748, "learning_rate": 0.00018540369265960242, "loss": 0.5334, "step": 592 }, { "epoch": 0.35, "grad_norm": 0.2068811720002483, "learning_rate": 0.00018535475321477906, "loss": 0.5558, "step": 593 }, { "epoch": 0.35, "grad_norm": 0.2025287668887073, "learning_rate": 0.00018530573834509215, "loss": 0.5098, "step": 594 }, { "epoch": 0.36, "grad_norm": 0.20807380346718593, "learning_rate": 0.0001852566480938543, "loss": 0.5211, "step": 595 }, { "epoch": 0.36, "grad_norm": 0.2049943719782544, "learning_rate": 0.00018520748250444474, "loss": 0.5379, "step": 596 }, { "epoch": 0.36, "grad_norm": 0.8558508208219735, "learning_rate": 0.00018515824162030934, "loss": 0.5403, "step": 597 }, { "epoch": 0.36, "grad_norm": 0.25414317775682305, "learning_rate": 0.00018510892548496047, "loss": 0.5804, "step": 598 }, { "epoch": 0.36, "grad_norm": 0.20806597400748386, "learning_rate": 0.00018505953414197696, "loss": 0.5419, "step": 599 }, { "epoch": 0.36, "grad_norm": 0.1950528976937739, "learning_rate": 0.00018501006763500414, "loss": 0.4956, "step": 600 }, { "epoch": 0.36, "grad_norm": 0.20652545713558523, "learning_rate": 0.00018496052600775376, "loss": 0.4942, "step": 601 }, { "epoch": 0.36, "grad_norm": 0.20955886781649663, "learning_rate": 0.0001849109093040039, "loss": 0.5177, "step": 602 }, { "epoch": 0.36, "grad_norm": 0.21093362015684414, "learning_rate": 0.00018486121756759906, "loss": 0.5672, "step": 603 }, { "epoch": 0.36, "grad_norm": 0.22033088091533184, "learning_rate": 0.00018481145084245002, "loss": 0.5691, "step": 604 }, { "epoch": 0.36, "grad_norm": 0.20322111965044637, "learning_rate": 0.00018476160917253373, "loss": 0.5425, "step": 605 }, { "epoch": 0.36, "grad_norm": 0.2028788101278272, "learning_rate": 0.0001847116926018935, "loss": 0.5176, "step": 606 }, { "epoch": 0.36, "grad_norm": 0.19551140156538951, "learning_rate": 0.0001846617011746388, "loss": 0.5115, "step": 607 }, { "epoch": 0.36, "grad_norm": 0.21944694996534547, "learning_rate": 0.00018461163493494517, "loss": 0.5496, "step": 608 }, { "epoch": 0.36, "grad_norm": 0.21506814147924705, "learning_rate": 0.0001845614939270543, "loss": 0.5823, "step": 609 }, { "epoch": 0.36, "grad_norm": 0.2220938137588105, "learning_rate": 0.00018451127819527402, "loss": 0.5731, "step": 610 }, { "epoch": 0.36, "grad_norm": 0.21590208362786933, "learning_rate": 0.00018446098778397807, "loss": 0.6063, "step": 611 }, { "epoch": 0.37, "grad_norm": 0.20084594317065918, "learning_rate": 0.00018441062273760628, "loss": 0.5286, "step": 612 }, { "epoch": 0.37, "grad_norm": 0.21847304705653886, "learning_rate": 0.00018436018310066435, "loss": 0.5721, "step": 613 }, { "epoch": 0.37, "grad_norm": 0.2467936487351411, "learning_rate": 0.000184309668917724, "loss": 0.571, "step": 614 }, { "epoch": 0.37, "grad_norm": 0.21666156526926003, "learning_rate": 0.0001842590802334227, "loss": 0.5244, "step": 615 }, { "epoch": 0.37, "grad_norm": 0.21336859433357677, "learning_rate": 0.00018420841709246383, "loss": 0.5724, "step": 616 }, { "epoch": 0.37, "grad_norm": 0.1933070755110986, "learning_rate": 0.0001841576795396166, "loss": 0.5347, "step": 617 }, { "epoch": 0.37, "grad_norm": 0.2332186369470874, "learning_rate": 0.00018410686761971586, "loss": 0.5474, "step": 618 }, { "epoch": 0.37, "grad_norm": 0.1996293438855639, "learning_rate": 0.00018405598137766224, "loss": 0.5421, "step": 619 }, { "epoch": 0.37, "grad_norm": 0.2012759807756364, "learning_rate": 0.00018400502085842208, "loss": 0.519, "step": 620 }, { "epoch": 0.37, "grad_norm": 0.24355300568180752, "learning_rate": 0.00018395398610702733, "loss": 0.597, "step": 621 }, { "epoch": 0.37, "grad_norm": 0.2136711983483761, "learning_rate": 0.00018390287716857546, "loss": 0.5398, "step": 622 }, { "epoch": 0.37, "grad_norm": 0.22275088525970024, "learning_rate": 0.00018385169408822964, "loss": 0.5597, "step": 623 }, { "epoch": 0.37, "grad_norm": 0.20011931485707388, "learning_rate": 0.0001838004369112184, "loss": 0.4901, "step": 624 }, { "epoch": 0.37, "grad_norm": 0.19544716159187206, "learning_rate": 0.00018374910568283594, "loss": 0.4726, "step": 625 }, { "epoch": 0.37, "grad_norm": 0.2176067620544374, "learning_rate": 0.00018369770044844168, "loss": 0.5369, "step": 626 }, { "epoch": 0.37, "grad_norm": 0.2005629047810257, "learning_rate": 0.00018364622125346055, "loss": 0.4914, "step": 627 }, { "epoch": 0.37, "grad_norm": 0.21497281608823432, "learning_rate": 0.0001835946681433829, "loss": 0.5559, "step": 628 }, { "epoch": 0.38, "grad_norm": 0.20354723273049724, "learning_rate": 0.00018354304116376425, "loss": 0.5083, "step": 629 }, { "epoch": 0.38, "grad_norm": 0.23536026550959782, "learning_rate": 0.0001834913403602255, "loss": 0.5449, "step": 630 }, { "epoch": 0.38, "grad_norm": 0.20887211237530257, "learning_rate": 0.00018343956577845276, "loss": 0.5131, "step": 631 }, { "epoch": 0.38, "grad_norm": 0.21728763678777088, "learning_rate": 0.00018338771746419726, "loss": 0.5484, "step": 632 }, { "epoch": 0.38, "grad_norm": 0.21910570476522437, "learning_rate": 0.00018333579546327556, "loss": 0.5452, "step": 633 }, { "epoch": 0.38, "grad_norm": 0.21247350127543838, "learning_rate": 0.00018328379982156915, "loss": 0.5232, "step": 634 }, { "epoch": 0.38, "grad_norm": 0.21706686115456897, "learning_rate": 0.00018323173058502472, "loss": 0.5353, "step": 635 }, { "epoch": 0.38, "grad_norm": 0.19529494853666482, "learning_rate": 0.00018317958779965387, "loss": 0.4611, "step": 636 }, { "epoch": 0.38, "grad_norm": 0.2194890381897013, "learning_rate": 0.00018312737151153334, "loss": 0.4884, "step": 637 }, { "epoch": 0.38, "grad_norm": 0.24336065627870296, "learning_rate": 0.00018307508176680472, "loss": 0.5708, "step": 638 }, { "epoch": 0.38, "grad_norm": 0.22638828434923797, "learning_rate": 0.00018302271861167456, "loss": 0.5795, "step": 639 }, { "epoch": 0.38, "grad_norm": 0.20501380703607638, "learning_rate": 0.0001829702820924142, "loss": 0.5645, "step": 640 }, { "epoch": 0.38, "grad_norm": 0.22705979847255006, "learning_rate": 0.00018291777225535994, "loss": 0.4974, "step": 641 }, { "epoch": 0.38, "grad_norm": 0.22629645320684777, "learning_rate": 0.00018286518914691272, "loss": 0.5587, "step": 642 }, { "epoch": 0.38, "grad_norm": 0.21772563640763765, "learning_rate": 0.00018281253281353838, "loss": 0.5219, "step": 643 }, { "epoch": 0.38, "grad_norm": 0.20447194133414195, "learning_rate": 0.00018275980330176737, "loss": 0.5425, "step": 644 }, { "epoch": 0.38, "grad_norm": 0.24126870503035064, "learning_rate": 0.00018270700065819477, "loss": 0.5119, "step": 645 }, { "epoch": 0.39, "grad_norm": 0.23269297218381896, "learning_rate": 0.00018265412492948042, "loss": 0.5507, "step": 646 }, { "epoch": 0.39, "grad_norm": 0.23416570398912578, "learning_rate": 0.0001826011761623486, "loss": 0.5947, "step": 647 }, { "epoch": 0.39, "grad_norm": 0.2186560086983282, "learning_rate": 0.0001825481544035882, "loss": 0.5204, "step": 648 }, { "epoch": 0.39, "grad_norm": 0.20624707271501935, "learning_rate": 0.00018249505970005262, "loss": 0.4785, "step": 649 }, { "epoch": 0.39, "grad_norm": 0.23418189558532218, "learning_rate": 0.00018244189209865974, "loss": 0.4976, "step": 650 }, { "epoch": 0.39, "grad_norm": 0.21372290734059424, "learning_rate": 0.00018238865164639173, "loss": 0.5237, "step": 651 }, { "epoch": 0.39, "grad_norm": 0.1986689651795865, "learning_rate": 0.0001823353383902953, "loss": 0.5354, "step": 652 }, { "epoch": 0.39, "grad_norm": 0.21154599437074698, "learning_rate": 0.0001822819523774814, "loss": 0.5292, "step": 653 }, { "epoch": 0.39, "grad_norm": 0.21348268586605149, "learning_rate": 0.00018222849365512523, "loss": 0.5249, "step": 654 }, { "epoch": 0.39, "grad_norm": 0.22296243039072478, "learning_rate": 0.0001821749622704664, "loss": 0.5458, "step": 655 }, { "epoch": 0.39, "grad_norm": 0.22596567506529938, "learning_rate": 0.00018212135827080857, "loss": 0.5085, "step": 656 }, { "epoch": 0.39, "grad_norm": 0.19012132806019622, "learning_rate": 0.00018206768170351962, "loss": 0.4977, "step": 657 }, { "epoch": 0.39, "grad_norm": 0.2125366600531234, "learning_rate": 0.0001820139326160316, "loss": 0.5051, "step": 658 }, { "epoch": 0.39, "grad_norm": 0.23677835317412968, "learning_rate": 0.00018196011105584058, "loss": 0.575, "step": 659 }, { "epoch": 0.39, "grad_norm": 0.2262210065848097, "learning_rate": 0.00018190621707050671, "loss": 0.5744, "step": 660 }, { "epoch": 0.39, "grad_norm": 0.21618545867420894, "learning_rate": 0.0001818522507076541, "loss": 0.5715, "step": 661 }, { "epoch": 0.39, "grad_norm": 0.2050215711297079, "learning_rate": 0.00018179821201497092, "loss": 0.5201, "step": 662 }, { "epoch": 0.4, "grad_norm": 0.20218467055707082, "learning_rate": 0.0001817441010402091, "loss": 0.5058, "step": 663 }, { "epoch": 0.4, "grad_norm": 0.20940987275867923, "learning_rate": 0.00018168991783118452, "loss": 0.5095, "step": 664 }, { "epoch": 0.4, "grad_norm": 0.21341822518403558, "learning_rate": 0.00018163566243577697, "loss": 0.5599, "step": 665 }, { "epoch": 0.4, "grad_norm": 0.20028205017927186, "learning_rate": 0.0001815813349019299, "loss": 0.5318, "step": 666 }, { "epoch": 0.4, "grad_norm": 0.20184912350066175, "learning_rate": 0.00018152693527765057, "loss": 0.5643, "step": 667 }, { "epoch": 0.4, "grad_norm": 0.20882160405967118, "learning_rate": 0.0001814724636110099, "loss": 0.542, "step": 668 }, { "epoch": 0.4, "grad_norm": 0.20252144356881077, "learning_rate": 0.00018141791995014255, "loss": 0.4496, "step": 669 }, { "epoch": 0.4, "grad_norm": 0.1956328371434174, "learning_rate": 0.00018136330434324674, "loss": 0.56, "step": 670 }, { "epoch": 0.4, "grad_norm": 0.20691128111503362, "learning_rate": 0.00018130861683858426, "loss": 0.5726, "step": 671 }, { "epoch": 0.4, "grad_norm": 0.2258004454621585, "learning_rate": 0.00018125385748448048, "loss": 0.583, "step": 672 }, { "epoch": 0.4, "grad_norm": 0.22330059205477634, "learning_rate": 0.00018119902632932416, "loss": 0.5288, "step": 673 }, { "epoch": 0.4, "grad_norm": 0.20473079466150892, "learning_rate": 0.0001811441234215677, "loss": 0.5085, "step": 674 }, { "epoch": 0.4, "grad_norm": 0.19439333859223318, "learning_rate": 0.0001810891488097267, "loss": 0.5147, "step": 675 }, { "epoch": 0.4, "grad_norm": 0.2037181989313857, "learning_rate": 0.00018103410254238021, "loss": 0.5228, "step": 676 }, { "epoch": 0.4, "grad_norm": 0.21580635559566858, "learning_rate": 0.0001809789846681706, "loss": 0.5034, "step": 677 }, { "epoch": 0.4, "grad_norm": 0.21490060304667385, "learning_rate": 0.00018092379523580357, "loss": 0.5347, "step": 678 }, { "epoch": 0.41, "grad_norm": 0.20927738857723482, "learning_rate": 0.00018086853429404793, "loss": 0.5039, "step": 679 }, { "epoch": 0.41, "grad_norm": 0.21391199422702836, "learning_rate": 0.00018081320189173577, "loss": 0.5404, "step": 680 }, { "epoch": 0.41, "grad_norm": 0.22355130583819918, "learning_rate": 0.0001807577980777623, "loss": 0.5147, "step": 681 }, { "epoch": 0.41, "grad_norm": 0.21899190720848985, "learning_rate": 0.00018070232290108584, "loss": 0.5195, "step": 682 }, { "epoch": 0.41, "grad_norm": 0.20636096645560792, "learning_rate": 0.00018064677641072775, "loss": 0.5158, "step": 683 }, { "epoch": 0.41, "grad_norm": 0.20462410706105155, "learning_rate": 0.00018059115865577249, "loss": 0.5194, "step": 684 }, { "epoch": 0.41, "grad_norm": 0.21978634315593423, "learning_rate": 0.00018053546968536735, "loss": 0.4986, "step": 685 }, { "epoch": 0.41, "grad_norm": 0.2203882917140438, "learning_rate": 0.00018047970954872264, "loss": 0.5855, "step": 686 }, { "epoch": 0.41, "grad_norm": 0.20144829000454462, "learning_rate": 0.0001804238782951116, "loss": 0.5212, "step": 687 }, { "epoch": 0.41, "grad_norm": 0.21142991796239274, "learning_rate": 0.00018036797597387023, "loss": 0.495, "step": 688 }, { "epoch": 0.41, "grad_norm": 0.21275432668758548, "learning_rate": 0.00018031200263439736, "loss": 0.5694, "step": 689 }, { "epoch": 0.41, "grad_norm": 0.2035189446424034, "learning_rate": 0.00018025595832615459, "loss": 0.55, "step": 690 }, { "epoch": 0.41, "grad_norm": 0.20030837247360464, "learning_rate": 0.00018019984309866619, "loss": 0.4748, "step": 691 }, { "epoch": 0.41, "grad_norm": 0.20366715425572, "learning_rate": 0.00018014365700151912, "loss": 0.5792, "step": 692 }, { "epoch": 0.41, "grad_norm": 0.2082468197583491, "learning_rate": 0.000180087400084363, "loss": 0.4973, "step": 693 }, { "epoch": 0.41, "grad_norm": 0.21820027454676755, "learning_rate": 0.00018003107239691004, "loss": 0.5512, "step": 694 }, { "epoch": 0.41, "grad_norm": 0.2085678250499903, "learning_rate": 0.00017997467398893488, "loss": 0.5148, "step": 695 }, { "epoch": 0.42, "grad_norm": 0.20422653056781329, "learning_rate": 0.00017991820491027472, "loss": 0.4968, "step": 696 }, { "epoch": 0.42, "grad_norm": 0.1875899162050169, "learning_rate": 0.0001798616652108293, "loss": 0.5061, "step": 697 }, { "epoch": 0.42, "grad_norm": 0.20869663705218836, "learning_rate": 0.00017980505494056062, "loss": 0.5182, "step": 698 }, { "epoch": 0.42, "grad_norm": 0.19250179476147736, "learning_rate": 0.00017974837414949307, "loss": 0.5184, "step": 699 }, { "epoch": 0.42, "grad_norm": 0.21732108838463451, "learning_rate": 0.00017969162288771347, "loss": 0.5524, "step": 700 }, { "epoch": 0.42, "grad_norm": 0.20200315361578813, "learning_rate": 0.0001796348012053707, "loss": 0.5386, "step": 701 }, { "epoch": 0.42, "grad_norm": 0.20242537832049035, "learning_rate": 0.00017957790915267615, "loss": 0.5656, "step": 702 }, { "epoch": 0.42, "grad_norm": 0.1889172192023988, "learning_rate": 0.0001795209467799031, "loss": 0.5115, "step": 703 }, { "epoch": 0.42, "grad_norm": 0.19623435201373893, "learning_rate": 0.0001794639141373872, "loss": 0.497, "step": 704 }, { "epoch": 0.42, "grad_norm": 0.22372809637554478, "learning_rate": 0.00017940681127552604, "loss": 0.5579, "step": 705 }, { "epoch": 0.42, "grad_norm": 0.1968536923376666, "learning_rate": 0.0001793496382447794, "loss": 0.4891, "step": 706 }, { "epoch": 0.42, "grad_norm": 0.1990723573146364, "learning_rate": 0.00017929239509566894, "loss": 0.5921, "step": 707 }, { "epoch": 0.42, "grad_norm": 0.20388703819339077, "learning_rate": 0.00017923508187877834, "loss": 0.5414, "step": 708 }, { "epoch": 0.42, "grad_norm": 0.23657852979478725, "learning_rate": 0.00017917769864475314, "loss": 0.5672, "step": 709 }, { "epoch": 0.42, "grad_norm": 0.22888252332289927, "learning_rate": 0.00017912024544430088, "loss": 0.5459, "step": 710 }, { "epoch": 0.42, "grad_norm": 0.19383907969249117, "learning_rate": 0.0001790627223281908, "loss": 0.5509, "step": 711 }, { "epoch": 0.42, "grad_norm": 0.2154263629956836, "learning_rate": 0.00017900512934725397, "loss": 0.5629, "step": 712 }, { "epoch": 0.43, "grad_norm": 0.19802419635693494, "learning_rate": 0.0001789474665523832, "loss": 0.5128, "step": 713 }, { "epoch": 0.43, "grad_norm": 0.19783321602266912, "learning_rate": 0.00017888973399453296, "loss": 0.5064, "step": 714 }, { "epoch": 0.43, "grad_norm": 0.19864882371614528, "learning_rate": 0.00017883193172471944, "loss": 0.5458, "step": 715 }, { "epoch": 0.43, "grad_norm": 0.23609512585527, "learning_rate": 0.00017877405979402038, "loss": 0.5069, "step": 716 }, { "epoch": 0.43, "grad_norm": 0.19894144678524353, "learning_rate": 0.00017871611825357502, "loss": 0.5812, "step": 717 }, { "epoch": 0.43, "grad_norm": 0.19598819977852033, "learning_rate": 0.00017865810715458427, "loss": 0.5223, "step": 718 }, { "epoch": 0.43, "grad_norm": 0.23274847505011953, "learning_rate": 0.00017860002654831032, "loss": 0.5703, "step": 719 }, { "epoch": 0.43, "grad_norm": 0.19794477486450376, "learning_rate": 0.00017854187648607694, "loss": 0.5538, "step": 720 }, { "epoch": 0.43, "grad_norm": 0.2091737019131215, "learning_rate": 0.00017848365701926913, "loss": 0.4962, "step": 721 }, { "epoch": 0.43, "grad_norm": 0.21890749511490995, "learning_rate": 0.00017842536819933337, "loss": 0.5074, "step": 722 }, { "epoch": 0.43, "grad_norm": 0.22746821737803938, "learning_rate": 0.0001783670100777773, "loss": 0.5849, "step": 723 }, { "epoch": 0.43, "grad_norm": 0.20967916540656184, "learning_rate": 0.0001783085827061699, "loss": 0.5246, "step": 724 }, { "epoch": 0.43, "grad_norm": 0.19798059353181535, "learning_rate": 0.00017825008613614127, "loss": 0.4667, "step": 725 }, { "epoch": 0.43, "grad_norm": 0.1992664047298655, "learning_rate": 0.00017819152041938265, "loss": 0.5247, "step": 726 }, { "epoch": 0.43, "grad_norm": 0.22025628624147217, "learning_rate": 0.00017813288560764647, "loss": 0.5291, "step": 727 }, { "epoch": 0.43, "grad_norm": 0.20405038516624363, "learning_rate": 0.00017807418175274612, "loss": 0.5235, "step": 728 }, { "epoch": 0.43, "grad_norm": 0.20626127985692586, "learning_rate": 0.00017801540890655609, "loss": 0.5103, "step": 729 }, { "epoch": 0.44, "grad_norm": 0.2187527308725265, "learning_rate": 0.00017795656712101172, "loss": 0.5515, "step": 730 }, { "epoch": 0.44, "grad_norm": 0.20386714530070776, "learning_rate": 0.00017789765644810935, "loss": 0.5109, "step": 731 }, { "epoch": 0.44, "grad_norm": 0.1990293686392052, "learning_rate": 0.00017783867693990624, "loss": 0.5208, "step": 732 }, { "epoch": 0.44, "grad_norm": 0.19601721442767256, "learning_rate": 0.0001777796286485204, "loss": 0.5318, "step": 733 }, { "epoch": 0.44, "grad_norm": 0.20542580410660244, "learning_rate": 0.0001777205116261306, "loss": 0.5198, "step": 734 }, { "epoch": 0.44, "grad_norm": 0.20998518101289002, "learning_rate": 0.0001776613259249764, "loss": 0.5384, "step": 735 }, { "epoch": 0.44, "grad_norm": 0.20134476803418952, "learning_rate": 0.00017760207159735805, "loss": 0.5448, "step": 736 }, { "epoch": 0.44, "grad_norm": 0.22396912180134018, "learning_rate": 0.00017754274869563637, "loss": 0.59, "step": 737 }, { "epoch": 0.44, "grad_norm": 0.2044555533666512, "learning_rate": 0.00017748335727223294, "loss": 0.5152, "step": 738 }, { "epoch": 0.44, "grad_norm": 0.2106748606361736, "learning_rate": 0.00017742389737962966, "loss": 0.5233, "step": 739 }, { "epoch": 0.44, "grad_norm": 0.19348093577281505, "learning_rate": 0.0001773643690703691, "loss": 0.5181, "step": 740 }, { "epoch": 0.44, "grad_norm": 0.20393569458294794, "learning_rate": 0.00017730477239705428, "loss": 0.5671, "step": 741 }, { "epoch": 0.44, "grad_norm": 0.19728761757057783, "learning_rate": 0.00017724510741234858, "loss": 0.4919, "step": 742 }, { "epoch": 0.44, "grad_norm": 0.2025575313201386, "learning_rate": 0.0001771853741689757, "loss": 0.5452, "step": 743 }, { "epoch": 0.44, "grad_norm": 0.19153867099886435, "learning_rate": 0.0001771255727197198, "loss": 0.4951, "step": 744 }, { "epoch": 0.44, "grad_norm": 0.2220125331576081, "learning_rate": 0.00017706570311742516, "loss": 0.5521, "step": 745 }, { "epoch": 0.45, "grad_norm": 0.22704666961693065, "learning_rate": 0.0001770057654149964, "loss": 0.5184, "step": 746 }, { "epoch": 0.45, "grad_norm": 0.20871880228168335, "learning_rate": 0.00017694575966539823, "loss": 0.5205, "step": 747 }, { "epoch": 0.45, "grad_norm": 0.2105924919088961, "learning_rate": 0.00017688568592165552, "loss": 0.5448, "step": 748 }, { "epoch": 0.45, "grad_norm": 0.19780662201378688, "learning_rate": 0.00017682554423685329, "loss": 0.6037, "step": 749 }, { "epoch": 0.45, "grad_norm": 0.23105576261963792, "learning_rate": 0.0001767653346641365, "loss": 0.7225, "step": 750 }, { "epoch": 0.45, "grad_norm": 0.21997563912032173, "learning_rate": 0.00017670505725671013, "loss": 0.552, "step": 751 }, { "epoch": 0.45, "grad_norm": 0.2033859052649398, "learning_rate": 0.00017664471206783915, "loss": 0.5315, "step": 752 }, { "epoch": 0.45, "grad_norm": 0.19979214467824102, "learning_rate": 0.00017658429915084835, "loss": 0.5697, "step": 753 }, { "epoch": 0.45, "grad_norm": 0.20567412732571028, "learning_rate": 0.00017652381855912247, "loss": 0.5051, "step": 754 }, { "epoch": 0.45, "grad_norm": 0.20563597140752976, "learning_rate": 0.0001764632703461059, "loss": 0.5141, "step": 755 }, { "epoch": 0.45, "grad_norm": 0.1979658869623221, "learning_rate": 0.00017640265456530293, "loss": 0.5257, "step": 756 }, { "epoch": 0.45, "grad_norm": 0.2241077787834463, "learning_rate": 0.0001763419712702775, "loss": 0.5203, "step": 757 }, { "epoch": 0.45, "grad_norm": 0.2197932846972142, "learning_rate": 0.00017628122051465322, "loss": 0.5847, "step": 758 }, { "epoch": 0.45, "grad_norm": 0.1990944255813207, "learning_rate": 0.00017622040235211326, "loss": 0.4962, "step": 759 }, { "epoch": 0.45, "grad_norm": 0.22179309744704687, "learning_rate": 0.00017615951683640045, "loss": 0.5635, "step": 760 }, { "epoch": 0.45, "grad_norm": 0.20505896786344424, "learning_rate": 0.00017609856402131703, "loss": 0.4968, "step": 761 }, { "epoch": 0.45, "grad_norm": 0.21771157401053975, "learning_rate": 0.00017603754396072483, "loss": 0.4858, "step": 762 }, { "epoch": 0.46, "grad_norm": 0.23357401076131715, "learning_rate": 0.000175976456708545, "loss": 0.5766, "step": 763 }, { "epoch": 0.46, "grad_norm": 0.21488993425737504, "learning_rate": 0.0001759153023187581, "loss": 0.5419, "step": 764 }, { "epoch": 0.46, "grad_norm": 0.2035555999534868, "learning_rate": 0.00017585408084540405, "loss": 0.5272, "step": 765 }, { "epoch": 0.46, "grad_norm": 0.20066829451010718, "learning_rate": 0.00017579279234258198, "loss": 0.5013, "step": 766 }, { "epoch": 0.46, "grad_norm": 0.2052255359730049, "learning_rate": 0.00017573143686445034, "loss": 0.5383, "step": 767 }, { "epoch": 0.46, "grad_norm": 0.19180058672325329, "learning_rate": 0.00017567001446522665, "loss": 0.5108, "step": 768 }, { "epoch": 0.46, "grad_norm": 0.22862029943228582, "learning_rate": 0.0001756085251991877, "loss": 0.531, "step": 769 }, { "epoch": 0.46, "grad_norm": 0.2180888066741993, "learning_rate": 0.00017554696912066924, "loss": 0.5938, "step": 770 }, { "epoch": 0.46, "grad_norm": 0.19823263656993223, "learning_rate": 0.00017548534628406616, "loss": 0.5158, "step": 771 }, { "epoch": 0.46, "grad_norm": 0.18700255356016454, "learning_rate": 0.00017542365674383227, "loss": 0.517, "step": 772 }, { "epoch": 0.46, "grad_norm": 0.22948411236460914, "learning_rate": 0.00017536190055448037, "loss": 0.5464, "step": 773 }, { "epoch": 0.46, "grad_norm": 0.21370070213829387, "learning_rate": 0.00017530007777058213, "loss": 0.5158, "step": 774 }, { "epoch": 0.46, "grad_norm": 0.19174674116457566, "learning_rate": 0.0001752381884467681, "loss": 0.5035, "step": 775 }, { "epoch": 0.46, "grad_norm": 0.19069115110218368, "learning_rate": 0.00017517623263772758, "loss": 0.5341, "step": 776 }, { "epoch": 0.46, "grad_norm": 0.2401612943495333, "learning_rate": 0.00017511421039820863, "loss": 0.578, "step": 777 }, { "epoch": 0.46, "grad_norm": 0.20371659209716964, "learning_rate": 0.00017505212178301805, "loss": 0.5103, "step": 778 }, { "epoch": 0.46, "grad_norm": 0.2029847143168681, "learning_rate": 0.00017498996684702132, "loss": 0.537, "step": 779 }, { "epoch": 0.47, "grad_norm": 0.1904915669257304, "learning_rate": 0.00017492774564514235, "loss": 0.5129, "step": 780 }, { "epoch": 0.47, "grad_norm": 0.20640027525482552, "learning_rate": 0.00017486545823236385, "loss": 0.5585, "step": 781 }, { "epoch": 0.47, "grad_norm": 0.23582084058854208, "learning_rate": 0.00017480310466372686, "loss": 0.5648, "step": 782 }, { "epoch": 0.47, "grad_norm": 0.2219618762642625, "learning_rate": 0.00017474068499433098, "loss": 0.5365, "step": 783 }, { "epoch": 0.47, "grad_norm": 0.2021980496149104, "learning_rate": 0.00017467819927933416, "loss": 0.5232, "step": 784 }, { "epoch": 0.47, "grad_norm": 0.22350007413119136, "learning_rate": 0.00017461564757395272, "loss": 0.571, "step": 785 }, { "epoch": 0.47, "grad_norm": 0.1982267515659923, "learning_rate": 0.00017455302993346134, "loss": 0.5228, "step": 786 }, { "epoch": 0.47, "grad_norm": 0.1981641437638338, "learning_rate": 0.00017449034641319288, "loss": 0.5233, "step": 787 }, { "epoch": 0.47, "grad_norm": 0.21019781303279997, "learning_rate": 0.00017442759706853855, "loss": 0.5207, "step": 788 }, { "epoch": 0.47, "grad_norm": 0.2060312831458839, "learning_rate": 0.00017436478195494756, "loss": 0.5262, "step": 789 }, { "epoch": 0.47, "grad_norm": 0.21829001169718656, "learning_rate": 0.00017430190112792737, "loss": 0.563, "step": 790 }, { "epoch": 0.47, "grad_norm": 0.18511782073951058, "learning_rate": 0.00017423895464304342, "loss": 0.5017, "step": 791 }, { "epoch": 0.47, "grad_norm": 0.1883852134929889, "learning_rate": 0.00017417594255591927, "loss": 0.4598, "step": 792 }, { "epoch": 0.47, "grad_norm": 0.18093236424530848, "learning_rate": 0.00017411286492223632, "loss": 0.4834, "step": 793 }, { "epoch": 0.47, "grad_norm": 0.18428120434597678, "learning_rate": 0.000174049721797734, "loss": 0.5032, "step": 794 }, { "epoch": 0.47, "grad_norm": 0.20829275110131446, "learning_rate": 0.00017398651323820958, "loss": 0.5844, "step": 795 }, { "epoch": 0.47, "grad_norm": 0.20484798677763622, "learning_rate": 0.00017392323929951812, "loss": 0.5674, "step": 796 }, { "epoch": 0.48, "grad_norm": 0.24390628538267795, "learning_rate": 0.0001738599000375725, "loss": 0.5415, "step": 797 }, { "epoch": 0.48, "grad_norm": 0.7159631217821198, "learning_rate": 0.00017379649550834327, "loss": 0.5248, "step": 798 }, { "epoch": 0.48, "grad_norm": 0.2153929799459398, "learning_rate": 0.00017373302576785874, "loss": 0.5362, "step": 799 }, { "epoch": 0.48, "grad_norm": 0.18434502268826083, "learning_rate": 0.00017366949087220472, "loss": 0.5179, "step": 800 }, { "epoch": 0.48, "grad_norm": 0.1993458339623336, "learning_rate": 0.0001736058908775247, "loss": 0.5378, "step": 801 }, { "epoch": 0.48, "grad_norm": 0.20482117487035287, "learning_rate": 0.0001735422258400197, "loss": 0.5066, "step": 802 }, { "epoch": 0.48, "grad_norm": 0.22481951556352617, "learning_rate": 0.0001734784958159481, "loss": 0.5504, "step": 803 }, { "epoch": 0.48, "grad_norm": 0.20893222575857784, "learning_rate": 0.00017341470086162586, "loss": 0.5558, "step": 804 }, { "epoch": 0.48, "grad_norm": 0.21011978723049574, "learning_rate": 0.0001733508410334262, "loss": 0.5164, "step": 805 }, { "epoch": 0.48, "grad_norm": 0.19493427746334713, "learning_rate": 0.0001732869163877797, "loss": 0.4928, "step": 806 }, { "epoch": 0.48, "grad_norm": 0.21183476672026114, "learning_rate": 0.00017322292698117425, "loss": 0.539, "step": 807 }, { "epoch": 0.48, "grad_norm": 0.19833380077404217, "learning_rate": 0.00017315887287015492, "loss": 0.5271, "step": 808 }, { "epoch": 0.48, "grad_norm": 0.1914374518219283, "learning_rate": 0.000173094754111324, "loss": 0.5408, "step": 809 }, { "epoch": 0.48, "grad_norm": 0.2086814492670768, "learning_rate": 0.00017303057076134085, "loss": 0.5289, "step": 810 }, { "epoch": 0.48, "grad_norm": 0.20957903788826676, "learning_rate": 0.000172966322876922, "loss": 0.4998, "step": 811 }, { "epoch": 0.48, "grad_norm": 0.20998255172298386, "learning_rate": 0.00017290201051484085, "loss": 0.5481, "step": 812 }, { "epoch": 0.49, "grad_norm": 0.2071975609134585, "learning_rate": 0.00017283763373192798, "loss": 0.5183, "step": 813 }, { "epoch": 0.49, "grad_norm": 0.21738328519054306, "learning_rate": 0.00017277319258507073, "loss": 0.539, "step": 814 }, { "epoch": 0.49, "grad_norm": 0.20518040499899, "learning_rate": 0.0001727086871312134, "loss": 0.5109, "step": 815 }, { "epoch": 0.49, "grad_norm": 0.19341379491491822, "learning_rate": 0.00017264411742735707, "loss": 0.4882, "step": 816 }, { "epoch": 0.49, "grad_norm": 0.23128359760674316, "learning_rate": 0.00017257948353055963, "loss": 0.547, "step": 817 }, { "epoch": 0.49, "grad_norm": 0.1960131633047162, "learning_rate": 0.0001725147854979357, "loss": 0.5467, "step": 818 }, { "epoch": 0.49, "grad_norm": 0.21053602560644855, "learning_rate": 0.00017245002338665656, "loss": 0.5644, "step": 819 }, { "epoch": 0.49, "grad_norm": 0.19602457133539752, "learning_rate": 0.00017238519725395007, "loss": 0.5121, "step": 820 }, { "epoch": 0.49, "grad_norm": 0.1923459024283483, "learning_rate": 0.00017232030715710076, "loss": 0.5335, "step": 821 }, { "epoch": 0.49, "grad_norm": 0.19919783133579333, "learning_rate": 0.00017225535315344955, "loss": 0.5076, "step": 822 }, { "epoch": 0.49, "grad_norm": 0.23727428892467575, "learning_rate": 0.00017219033530039397, "loss": 0.5396, "step": 823 }, { "epoch": 0.49, "grad_norm": 0.18048505778792392, "learning_rate": 0.00017212525365538792, "loss": 0.467, "step": 824 }, { "epoch": 0.49, "grad_norm": 0.20071702267002645, "learning_rate": 0.00017206010827594163, "loss": 0.5217, "step": 825 }, { "epoch": 0.49, "grad_norm": 0.20315216612025339, "learning_rate": 0.0001719948992196217, "loss": 0.4975, "step": 826 }, { "epoch": 0.49, "grad_norm": 0.2142259292765235, "learning_rate": 0.00017192962654405096, "loss": 0.5148, "step": 827 }, { "epoch": 0.49, "grad_norm": 0.19450012283752555, "learning_rate": 0.00017186429030690848, "loss": 0.5297, "step": 828 }, { "epoch": 0.49, "grad_norm": 0.19853162923467543, "learning_rate": 0.00017179889056592954, "loss": 0.547, "step": 829 }, { "epoch": 0.5, "grad_norm": 0.24873174470750906, "learning_rate": 0.00017173342737890544, "loss": 0.563, "step": 830 }, { "epoch": 0.5, "grad_norm": 0.18593730182623175, "learning_rate": 0.00017166790080368357, "loss": 0.4647, "step": 831 }, { "epoch": 0.5, "grad_norm": 0.19387710879340586, "learning_rate": 0.00017160231089816748, "loss": 0.5313, "step": 832 }, { "epoch": 0.5, "grad_norm": 0.20818447206363588, "learning_rate": 0.00017153665772031643, "loss": 0.5333, "step": 833 }, { "epoch": 0.5, "grad_norm": 0.17584822143732362, "learning_rate": 0.0001714709413281458, "loss": 0.4467, "step": 834 }, { "epoch": 0.5, "grad_norm": 0.19622166504995672, "learning_rate": 0.00017140516177972676, "loss": 0.5129, "step": 835 }, { "epoch": 0.5, "grad_norm": 0.18822988249157332, "learning_rate": 0.00017133931913318625, "loss": 0.5186, "step": 836 }, { "epoch": 0.5, "grad_norm": 0.2021164051164271, "learning_rate": 0.00017127341344670696, "loss": 0.551, "step": 837 }, { "epoch": 0.5, "grad_norm": 0.19354327476685654, "learning_rate": 0.00017120744477852745, "loss": 0.5001, "step": 838 } ], "logging_steps": 1, "max_steps": 3352, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 838, "total_flos": 829560862015488.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }