{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9804594272076372, "eval_steps": 500, "global_step": 3352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.0434608134535555, "learning_rate": 2e-05, "loss": 0.7234, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.9514763639201144, "learning_rate": 4e-05, "loss": 0.5916, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.8903336235741379, "learning_rate": 6e-05, "loss": 0.5827, "step": 3 }, { "epoch": 0.0, "grad_norm": 0.764603046688215, "learning_rate": 8e-05, "loss": 0.5555, "step": 4 }, { "epoch": 0.0, "grad_norm": 0.8956967835024867, "learning_rate": 0.0001, "loss": 0.6073, "step": 5 }, { "epoch": 0.0, "grad_norm": 1.378526599131456, "learning_rate": 0.00012, "loss": 0.6204, "step": 6 }, { "epoch": 0.0, "grad_norm": 1.1442512069955262, "learning_rate": 0.00014, "loss": 0.5452, "step": 7 }, { "epoch": 0.0, "grad_norm": 0.9100913748328603, "learning_rate": 0.00016, "loss": 0.5858, "step": 8 }, { "epoch": 0.01, "grad_norm": 0.6859557442210851, "learning_rate": 0.00018, "loss": 0.5182, "step": 9 }, { "epoch": 0.01, "grad_norm": 0.8288606055568941, "learning_rate": 0.0002, "loss": 0.6083, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.0417686973414615, "learning_rate": 0.0001999999558168346, "loss": 0.65, "step": 11 }, { "epoch": 0.01, "grad_norm": 0.8756017006672853, "learning_rate": 0.00019999982326737747, "loss": 0.5837, "step": 12 }, { "epoch": 0.01, "grad_norm": 0.7284541589078422, "learning_rate": 0.0001999996023517457, "loss": 0.5738, "step": 13 }, { "epoch": 0.01, "grad_norm": 0.8602799120413903, "learning_rate": 0.0001999992930701345, "loss": 0.595, "step": 14 }, { "epoch": 0.01, "grad_norm": 0.7938840633060059, "learning_rate": 0.00019999889542281728, "loss": 0.5907, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.7022632853348306, "learning_rate": 0.00019999840941014525, "loss": 0.5513, "step": 16 }, { "epoch": 0.01, "grad_norm": 0.6531781263691616, "learning_rate": 0.00019999783503254803, "loss": 0.5475, "step": 17 }, { "epoch": 0.01, "grad_norm": 0.5808528104992969, "learning_rate": 0.0001999971722905331, "loss": 0.519, "step": 18 }, { "epoch": 0.01, "grad_norm": 0.5452643486331965, "learning_rate": 0.00019999642118468614, "loss": 0.5421, "step": 19 }, { "epoch": 0.01, "grad_norm": 0.5893567274117093, "learning_rate": 0.00019999558171567082, "loss": 0.6016, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.5039905113559068, "learning_rate": 0.000199994653884229, "loss": 0.6096, "step": 21 }, { "epoch": 0.01, "grad_norm": 0.5236847425188783, "learning_rate": 0.00019999363769118055, "loss": 0.5845, "step": 22 }, { "epoch": 0.01, "grad_norm": 0.4403777461558745, "learning_rate": 0.00019999253313742344, "loss": 0.5657, "step": 23 }, { "epoch": 0.01, "grad_norm": 0.411935443568472, "learning_rate": 0.00019999134022393375, "loss": 0.5619, "step": 24 }, { "epoch": 0.01, "grad_norm": 0.3669921362459581, "learning_rate": 0.0001999900589517656, "loss": 0.6115, "step": 25 }, { "epoch": 0.02, "grad_norm": 0.3613613842516578, "learning_rate": 0.0001999886893220512, "loss": 0.5286, "step": 26 }, { "epoch": 0.02, "grad_norm": 0.378560233146017, "learning_rate": 0.0001999872313360008, "loss": 0.5887, "step": 27 }, { "epoch": 0.02, "grad_norm": 0.3562687135057843, "learning_rate": 0.00019998568499490283, "loss": 0.5598, "step": 28 }, { "epoch": 0.02, "grad_norm": 0.34581851507208355, "learning_rate": 0.00019998405030012371, "loss": 0.5772, "step": 29 }, { "epoch": 0.02, "grad_norm": 0.3812400800902662, "learning_rate": 0.00019998232725310796, "loss": 0.6154, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.2876811822631032, "learning_rate": 0.00019998051585537818, "loss": 0.4949, "step": 31 }, { "epoch": 0.02, "grad_norm": 0.3291934945038139, "learning_rate": 0.00019997861610853503, "loss": 0.5388, "step": 32 }, { "epoch": 0.02, "grad_norm": 0.35220229516562385, "learning_rate": 0.00019997662801425725, "loss": 0.5801, "step": 33 }, { "epoch": 0.02, "grad_norm": 0.3908550087374589, "learning_rate": 0.00019997455157430165, "loss": 0.5783, "step": 34 }, { "epoch": 0.02, "grad_norm": 0.38564043955621646, "learning_rate": 0.00019997238679050308, "loss": 0.5628, "step": 35 }, { "epoch": 0.02, "grad_norm": 0.3248792879576579, "learning_rate": 0.00019997013366477453, "loss": 0.5896, "step": 36 }, { "epoch": 0.02, "grad_norm": 0.33243708146621687, "learning_rate": 0.00019996779219910696, "loss": 0.5618, "step": 37 }, { "epoch": 0.02, "grad_norm": 0.3589452470436555, "learning_rate": 0.00019996536239556942, "loss": 0.5387, "step": 38 }, { "epoch": 0.02, "grad_norm": 0.3635189263065437, "learning_rate": 0.0001999628442563091, "loss": 0.629, "step": 39 }, { "epoch": 0.02, "grad_norm": 0.36761442942017947, "learning_rate": 0.00019996023778355113, "loss": 0.6133, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.30331424568033827, "learning_rate": 0.00019995754297959882, "loss": 0.5377, "step": 41 }, { "epoch": 0.03, "grad_norm": 0.3157701200247212, "learning_rate": 0.0001999547598468334, "loss": 0.6249, "step": 42 }, { "epoch": 0.03, "grad_norm": 0.3112819252913729, "learning_rate": 0.00019995188838771425, "loss": 0.5424, "step": 43 }, { "epoch": 0.03, "grad_norm": 0.33384944369487113, "learning_rate": 0.0001999489286047788, "loss": 0.6014, "step": 44 }, { "epoch": 0.03, "grad_norm": 0.3330564524921197, "learning_rate": 0.00019994588050064243, "loss": 0.5469, "step": 45 }, { "epoch": 0.03, "grad_norm": 0.2889450580843479, "learning_rate": 0.00019994274407799872, "loss": 0.512, "step": 46 }, { "epoch": 0.03, "grad_norm": 0.3273617072745067, "learning_rate": 0.00019993951933961913, "loss": 0.5456, "step": 47 }, { "epoch": 0.03, "grad_norm": 0.3228418979730564, "learning_rate": 0.00019993620628835332, "loss": 0.5716, "step": 48 }, { "epoch": 0.03, "grad_norm": 0.3439278828333003, "learning_rate": 0.0001999328049271289, "loss": 0.5177, "step": 49 }, { "epoch": 0.03, "grad_norm": 0.3186271172935729, "learning_rate": 0.0001999293152589515, "loss": 0.5502, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.38357922086488366, "learning_rate": 0.0001999257372869048, "loss": 0.6178, "step": 51 }, { "epoch": 0.03, "grad_norm": 0.36013497860303273, "learning_rate": 0.00019992207101415053, "loss": 0.6278, "step": 52 }, { "epoch": 0.03, "grad_norm": 0.3146595251755829, "learning_rate": 0.00019991831644392848, "loss": 0.5348, "step": 53 }, { "epoch": 0.03, "grad_norm": 0.3986948949803995, "learning_rate": 0.00019991447357955639, "loss": 0.6331, "step": 54 }, { "epoch": 0.03, "grad_norm": 0.3194176204715625, "learning_rate": 0.00019991054242443008, "loss": 0.5817, "step": 55 }, { "epoch": 0.03, "grad_norm": 0.29564051537116465, "learning_rate": 0.00019990652298202335, "loss": 0.545, "step": 56 }, { "epoch": 0.03, "grad_norm": 0.2908246398716361, "learning_rate": 0.00019990241525588804, "loss": 0.5294, "step": 57 }, { "epoch": 0.03, "grad_norm": 0.3480952622658696, "learning_rate": 0.000199898219249654, "loss": 0.6282, "step": 58 }, { "epoch": 0.04, "grad_norm": 0.4278607132919695, "learning_rate": 0.00019989393496702907, "loss": 0.7008, "step": 59 }, { "epoch": 0.04, "grad_norm": 0.3088760407735635, "learning_rate": 0.00019988956241179912, "loss": 0.5747, "step": 60 }, { "epoch": 0.04, "grad_norm": 0.3549589308890128, "learning_rate": 0.00019988510158782804, "loss": 0.615, "step": 61 }, { "epoch": 0.04, "grad_norm": 0.28349959678885256, "learning_rate": 0.00019988055249905767, "loss": 0.577, "step": 62 }, { "epoch": 0.04, "grad_norm": 0.35001480138074803, "learning_rate": 0.00019987591514950787, "loss": 0.5551, "step": 63 }, { "epoch": 0.04, "grad_norm": 0.32895401860950285, "learning_rate": 0.00019987118954327654, "loss": 0.5617, "step": 64 }, { "epoch": 0.04, "grad_norm": 0.347007326906862, "learning_rate": 0.00019986637568453945, "loss": 0.5935, "step": 65 }, { "epoch": 0.04, "grad_norm": 0.30223641676037666, "learning_rate": 0.00019986147357755048, "loss": 0.5355, "step": 66 }, { "epoch": 0.04, "grad_norm": 0.302279046184156, "learning_rate": 0.00019985648322664145, "loss": 0.5571, "step": 67 }, { "epoch": 0.04, "grad_norm": 0.3181910281320864, "learning_rate": 0.00019985140463622215, "loss": 0.5198, "step": 68 }, { "epoch": 0.04, "grad_norm": 0.32334719229096776, "learning_rate": 0.0001998462378107803, "loss": 0.5063, "step": 69 }, { "epoch": 0.04, "grad_norm": 0.31038499461943353, "learning_rate": 0.0001998409827548817, "loss": 0.5805, "step": 70 }, { "epoch": 0.04, "grad_norm": 0.3032049786542595, "learning_rate": 0.00019983563947316996, "loss": 0.564, "step": 71 }, { "epoch": 0.04, "grad_norm": 0.3345643555445713, "learning_rate": 0.00019983020797036683, "loss": 0.5442, "step": 72 }, { "epoch": 0.04, "grad_norm": 0.32583720357675877, "learning_rate": 0.00019982468825127187, "loss": 0.5674, "step": 73 }, { "epoch": 0.04, "grad_norm": 0.30278224625704836, "learning_rate": 0.0001998190803207627, "loss": 0.569, "step": 74 }, { "epoch": 0.04, "grad_norm": 0.29996902392483177, "learning_rate": 0.0001998133841837948, "loss": 0.6142, "step": 75 }, { "epoch": 0.05, "grad_norm": 0.2947151895973628, "learning_rate": 0.00019980759984540168, "loss": 0.5084, "step": 76 }, { "epoch": 0.05, "grad_norm": 0.33309005605837944, "learning_rate": 0.0001998017273106947, "loss": 0.5807, "step": 77 }, { "epoch": 0.05, "grad_norm": 0.31281343399912853, "learning_rate": 0.00019979576658486325, "loss": 0.6299, "step": 78 }, { "epoch": 0.05, "grad_norm": 0.30980839190781245, "learning_rate": 0.00019978971767317457, "loss": 0.5521, "step": 79 }, { "epoch": 0.05, "grad_norm": 0.2988115356408324, "learning_rate": 0.00019978358058097388, "loss": 0.5645, "step": 80 }, { "epoch": 0.05, "grad_norm": 0.2864799366004751, "learning_rate": 0.0001997773553136843, "loss": 0.5604, "step": 81 }, { "epoch": 0.05, "grad_norm": 0.28284272149262185, "learning_rate": 0.00019977104187680688, "loss": 0.5964, "step": 82 }, { "epoch": 0.05, "grad_norm": 0.2807639116477172, "learning_rate": 0.00019976464027592053, "loss": 0.5441, "step": 83 }, { "epoch": 0.05, "grad_norm": 0.3169919212395633, "learning_rate": 0.00019975815051668217, "loss": 0.5672, "step": 84 }, { "epoch": 0.05, "grad_norm": 0.2827524094344841, "learning_rate": 0.0001997515726048265, "loss": 0.5631, "step": 85 }, { "epoch": 0.05, "grad_norm": 0.29538492598974014, "learning_rate": 0.00019974490654616625, "loss": 0.609, "step": 86 }, { "epoch": 0.05, "grad_norm": 0.29397994414783907, "learning_rate": 0.0001997381523465919, "loss": 0.5723, "step": 87 }, { "epoch": 0.05, "grad_norm": 0.2824005824065347, "learning_rate": 0.00019973131001207195, "loss": 0.5209, "step": 88 }, { "epoch": 0.05, "grad_norm": 0.2847018611508931, "learning_rate": 0.00019972437954865265, "loss": 0.5617, "step": 89 }, { "epoch": 0.05, "grad_norm": 0.2908040007926844, "learning_rate": 0.00019971736096245825, "loss": 0.5624, "step": 90 }, { "epoch": 0.05, "grad_norm": 0.27754759063410545, "learning_rate": 0.00019971025425969083, "loss": 0.5353, "step": 91 }, { "epoch": 0.05, "grad_norm": 0.2885964599083646, "learning_rate": 0.0001997030594466303, "loss": 0.5181, "step": 92 }, { "epoch": 0.06, "grad_norm": 0.29372725474160666, "learning_rate": 0.00019969577652963444, "loss": 0.5757, "step": 93 }, { "epoch": 0.06, "grad_norm": 0.32149915053639194, "learning_rate": 0.0001996884055151389, "loss": 0.552, "step": 94 }, { "epoch": 0.06, "grad_norm": 0.2816717380191918, "learning_rate": 0.00019968094640965717, "loss": 0.4968, "step": 95 }, { "epoch": 0.06, "grad_norm": 0.2719020140724135, "learning_rate": 0.00019967339921978062, "loss": 0.5503, "step": 96 }, { "epoch": 0.06, "grad_norm": 0.28166729851780475, "learning_rate": 0.00019966576395217837, "loss": 0.5546, "step": 97 }, { "epoch": 0.06, "grad_norm": 0.27817598279558775, "learning_rate": 0.0001996580406135975, "loss": 0.6145, "step": 98 }, { "epoch": 0.06, "grad_norm": 0.3471492732103861, "learning_rate": 0.00019965022921086275, "loss": 0.6464, "step": 99 }, { "epoch": 0.06, "grad_norm": 0.2838977359279957, "learning_rate": 0.00019964232975087687, "loss": 0.5576, "step": 100 }, { "epoch": 0.06, "grad_norm": 0.2989119777268752, "learning_rate": 0.00019963434224062025, "loss": 0.5747, "step": 101 }, { "epoch": 0.06, "grad_norm": 0.26079447242968457, "learning_rate": 0.0001996262666871512, "loss": 0.5144, "step": 102 }, { "epoch": 0.06, "grad_norm": 0.2904578974664885, "learning_rate": 0.00019961810309760577, "loss": 0.5623, "step": 103 }, { "epoch": 0.06, "grad_norm": 0.2682051539483259, "learning_rate": 0.00019960985147919778, "loss": 0.5722, "step": 104 }, { "epoch": 0.06, "grad_norm": 0.3004061592870477, "learning_rate": 0.00019960151183921897, "loss": 0.5526, "step": 105 }, { "epoch": 0.06, "grad_norm": 0.27675115608209533, "learning_rate": 0.00019959308418503877, "loss": 0.5859, "step": 106 }, { "epoch": 0.06, "grad_norm": 0.26526760651496173, "learning_rate": 0.00019958456852410433, "loss": 0.5395, "step": 107 }, { "epoch": 0.06, "grad_norm": 0.29513224606753785, "learning_rate": 0.0001995759648639406, "loss": 0.59, "step": 108 }, { "epoch": 0.07, "grad_norm": 0.27848396362609984, "learning_rate": 0.00019956727321215044, "loss": 0.6076, "step": 109 }, { "epoch": 0.07, "grad_norm": 0.29804931563512865, "learning_rate": 0.00019955849357641424, "loss": 0.5555, "step": 110 }, { "epoch": 0.07, "grad_norm": 0.2756038079003531, "learning_rate": 0.00019954962596449024, "loss": 0.5542, "step": 111 }, { "epoch": 0.07, "grad_norm": 0.28433942136507906, "learning_rate": 0.0001995406703842145, "loss": 0.5527, "step": 112 }, { "epoch": 0.07, "grad_norm": 0.646736414676863, "learning_rate": 0.0001995316268435007, "loss": 0.7024, "step": 113 }, { "epoch": 0.07, "grad_norm": 0.25030459489112267, "learning_rate": 0.00019952249535034025, "loss": 0.4928, "step": 114 }, { "epoch": 0.07, "grad_norm": 0.30977646038996587, "learning_rate": 0.00019951327591280236, "loss": 0.5883, "step": 115 }, { "epoch": 0.07, "grad_norm": 0.2854791945432696, "learning_rate": 0.0001995039685390339, "loss": 0.6318, "step": 116 }, { "epoch": 0.07, "grad_norm": 0.3264119003161445, "learning_rate": 0.00019949457323725946, "loss": 0.5654, "step": 117 }, { "epoch": 0.07, "grad_norm": 0.266512900378873, "learning_rate": 0.0001994850900157813, "loss": 0.5457, "step": 118 }, { "epoch": 0.07, "grad_norm": 0.30259231663288877, "learning_rate": 0.0001994755188829794, "loss": 0.5828, "step": 119 }, { "epoch": 0.07, "grad_norm": 0.33798187117798184, "learning_rate": 0.00019946585984731142, "loss": 0.5669, "step": 120 }, { "epoch": 0.07, "grad_norm": 0.2769767197030659, "learning_rate": 0.00019945611291731274, "loss": 0.5619, "step": 121 }, { "epoch": 0.07, "grad_norm": 0.2657775119925365, "learning_rate": 0.00019944627810159632, "loss": 0.59, "step": 122 }, { "epoch": 0.07, "grad_norm": 0.2918370192930011, "learning_rate": 0.00019943635540885279, "loss": 0.5816, "step": 123 }, { "epoch": 0.07, "grad_norm": 0.33417238245851544, "learning_rate": 0.00019942634484785052, "loss": 0.5921, "step": 124 }, { "epoch": 0.07, "grad_norm": 0.2482914745160091, "learning_rate": 0.00019941624642743548, "loss": 0.5113, "step": 125 }, { "epoch": 0.08, "grad_norm": 0.28272991564412037, "learning_rate": 0.0001994060601565313, "loss": 0.5543, "step": 126 }, { "epoch": 0.08, "grad_norm": 0.35389083143384653, "learning_rate": 0.00019939578604413912, "loss": 0.5921, "step": 127 }, { "epoch": 0.08, "grad_norm": 0.3426787529883331, "learning_rate": 0.00019938542409933787, "loss": 0.6073, "step": 128 }, { "epoch": 0.08, "grad_norm": 0.25621145606980844, "learning_rate": 0.000199374974331284, "loss": 0.5639, "step": 129 }, { "epoch": 0.08, "grad_norm": 0.2996022840977615, "learning_rate": 0.00019936443674921158, "loss": 0.5874, "step": 130 }, { "epoch": 0.08, "grad_norm": 0.3283726508140091, "learning_rate": 0.0001993538113624323, "loss": 0.6295, "step": 131 }, { "epoch": 0.08, "grad_norm": 0.2878523809947236, "learning_rate": 0.00019934309818033544, "loss": 0.5565, "step": 132 }, { "epoch": 0.08, "grad_norm": 0.2764010556696928, "learning_rate": 0.0001993322972123878, "loss": 0.554, "step": 133 }, { "epoch": 0.08, "grad_norm": 0.28827215996506156, "learning_rate": 0.0001993214084681338, "loss": 0.5788, "step": 134 }, { "epoch": 0.08, "grad_norm": 0.2945105689397871, "learning_rate": 0.00019931043195719548, "loss": 0.5197, "step": 135 }, { "epoch": 0.08, "grad_norm": 0.25439488455073433, "learning_rate": 0.00019929936768927232, "loss": 0.509, "step": 136 }, { "epoch": 0.08, "grad_norm": 0.2746742447613063, "learning_rate": 0.00019928821567414144, "loss": 0.5479, "step": 137 }, { "epoch": 0.08, "grad_norm": 0.3033960913538536, "learning_rate": 0.00019927697592165747, "loss": 0.5859, "step": 138 }, { "epoch": 0.08, "grad_norm": 0.28486244663010424, "learning_rate": 0.00019926564844175256, "loss": 0.5951, "step": 139 }, { "epoch": 0.08, "grad_norm": 0.3208016816012168, "learning_rate": 0.00019925423324443638, "loss": 0.5823, "step": 140 }, { "epoch": 0.08, "grad_norm": 0.3005362808247367, "learning_rate": 0.00019924273033979613, "loss": 0.5652, "step": 141 }, { "epoch": 0.08, "grad_norm": 0.26910338931514155, "learning_rate": 0.0001992311397379965, "loss": 0.5463, "step": 142 }, { "epoch": 0.09, "grad_norm": 0.28718562659091934, "learning_rate": 0.00019921946144927966, "loss": 0.5245, "step": 143 }, { "epoch": 0.09, "grad_norm": 0.2616953117781411, "learning_rate": 0.0001992076954839653, "loss": 0.5358, "step": 144 }, { "epoch": 0.09, "grad_norm": 0.3107257337368987, "learning_rate": 0.00019919584185245062, "loss": 0.5536, "step": 145 }, { "epoch": 0.09, "grad_norm": 0.28427245738065265, "learning_rate": 0.00019918390056521018, "loss": 0.6126, "step": 146 }, { "epoch": 0.09, "grad_norm": 0.25398310452304734, "learning_rate": 0.00019917187163279605, "loss": 0.5068, "step": 147 }, { "epoch": 0.09, "grad_norm": 0.2556256469730818, "learning_rate": 0.00019915975506583778, "loss": 0.5416, "step": 148 }, { "epoch": 0.09, "grad_norm": 0.2611702329742577, "learning_rate": 0.00019914755087504236, "loss": 0.5276, "step": 149 }, { "epoch": 0.09, "grad_norm": 0.2789045987166575, "learning_rate": 0.00019913525907119418, "loss": 0.5591, "step": 150 }, { "epoch": 0.09, "grad_norm": 0.26837658503581957, "learning_rate": 0.000199122879665155, "loss": 0.6581, "step": 151 }, { "epoch": 0.09, "grad_norm": 0.34601396912277804, "learning_rate": 0.0001991104126678641, "loss": 0.5394, "step": 152 }, { "epoch": 0.09, "grad_norm": 0.25684957257052443, "learning_rate": 0.00019909785809033806, "loss": 0.5392, "step": 153 }, { "epoch": 0.09, "grad_norm": 0.2906797315813485, "learning_rate": 0.00019908521594367098, "loss": 0.5185, "step": 154 }, { "epoch": 0.09, "grad_norm": 0.2852843546202924, "learning_rate": 0.0001990724862390342, "loss": 0.5436, "step": 155 }, { "epoch": 0.09, "grad_norm": 0.2875355300862882, "learning_rate": 0.0001990596689876765, "loss": 0.6009, "step": 156 }, { "epoch": 0.09, "grad_norm": 0.32052910212305513, "learning_rate": 0.00019904676420092404, "loss": 0.5831, "step": 157 }, { "epoch": 0.09, "grad_norm": 0.266884162852661, "learning_rate": 0.00019903377189018024, "loss": 0.5459, "step": 158 }, { "epoch": 0.09, "grad_norm": 0.2957365744895018, "learning_rate": 0.000199020692066926, "loss": 0.5211, "step": 159 }, { "epoch": 0.1, "grad_norm": 0.24951992931808137, "learning_rate": 0.00019900752474271945, "loss": 0.497, "step": 160 }, { "epoch": 0.1, "grad_norm": 0.30509964150122953, "learning_rate": 0.0001989942699291961, "loss": 0.5812, "step": 161 }, { "epoch": 0.1, "grad_norm": 0.2790293776337124, "learning_rate": 0.0001989809276380687, "loss": 0.5856, "step": 162 }, { "epoch": 0.1, "grad_norm": 0.24940387850774506, "learning_rate": 0.00019896749788112737, "loss": 0.5281, "step": 163 }, { "epoch": 0.1, "grad_norm": 0.2664890107453781, "learning_rate": 0.0001989539806702395, "loss": 0.524, "step": 164 }, { "epoch": 0.1, "grad_norm": 0.2896608423493073, "learning_rate": 0.0001989403760173497, "loss": 0.5171, "step": 165 }, { "epoch": 0.1, "grad_norm": 0.2544937836162412, "learning_rate": 0.00019892668393447997, "loss": 0.5546, "step": 166 }, { "epoch": 0.1, "grad_norm": 0.2626626371027326, "learning_rate": 0.00019891290443372944, "loss": 0.5498, "step": 167 }, { "epoch": 0.1, "grad_norm": 0.281410490858952, "learning_rate": 0.0001988990375272746, "loss": 0.5377, "step": 168 }, { "epoch": 0.1, "grad_norm": 0.3376943176164128, "learning_rate": 0.0001988850832273691, "loss": 0.5469, "step": 169 }, { "epoch": 0.1, "grad_norm": 0.2507691377758427, "learning_rate": 0.0001988710415463439, "loss": 0.549, "step": 170 }, { "epoch": 0.1, "grad_norm": 0.27178996752570117, "learning_rate": 0.00019885691249660702, "loss": 0.5636, "step": 171 }, { "epoch": 0.1, "grad_norm": 0.3359421766962587, "learning_rate": 0.00019884269609064386, "loss": 0.5957, "step": 172 }, { "epoch": 0.1, "grad_norm": 0.2638709645045905, "learning_rate": 0.0001988283923410169, "loss": 0.5793, "step": 173 }, { "epoch": 0.1, "grad_norm": 0.25585919726912226, "learning_rate": 0.00019881400126036582, "loss": 0.5817, "step": 174 }, { "epoch": 0.1, "grad_norm": 0.2905067973645414, "learning_rate": 0.00019879952286140754, "loss": 0.5585, "step": 175 }, { "epoch": 0.11, "grad_norm": 0.24197399766587002, "learning_rate": 0.0001987849571569361, "loss": 0.507, "step": 176 }, { "epoch": 0.11, "grad_norm": 0.28898034252731664, "learning_rate": 0.0001987703041598226, "loss": 0.5981, "step": 177 }, { "epoch": 0.11, "grad_norm": 0.26516349701479863, "learning_rate": 0.00019875556388301543, "loss": 0.56, "step": 178 }, { "epoch": 0.11, "grad_norm": 0.27235027968517367, "learning_rate": 0.00019874073633953997, "loss": 0.5872, "step": 179 }, { "epoch": 0.11, "grad_norm": 0.2692241567318253, "learning_rate": 0.00019872582154249884, "loss": 0.5397, "step": 180 }, { "epoch": 0.11, "grad_norm": 0.2560507155942398, "learning_rate": 0.00019871081950507163, "loss": 0.5431, "step": 181 }, { "epoch": 0.11, "grad_norm": 0.26691224099103567, "learning_rate": 0.00019869573024051517, "loss": 0.5608, "step": 182 }, { "epoch": 0.11, "grad_norm": 0.2961375506924155, "learning_rate": 0.00019868055376216323, "loss": 0.5784, "step": 183 }, { "epoch": 0.11, "grad_norm": 0.26055755072015874, "learning_rate": 0.00019866529008342673, "loss": 0.5369, "step": 184 }, { "epoch": 0.11, "grad_norm": 0.2525359310079611, "learning_rate": 0.00019864993921779361, "loss": 0.5438, "step": 185 }, { "epoch": 0.11, "grad_norm": 0.249327141855566, "learning_rate": 0.0001986345011788289, "loss": 0.5668, "step": 186 }, { "epoch": 0.11, "grad_norm": 0.2983950007732028, "learning_rate": 0.00019861897598017457, "loss": 0.5271, "step": 187 }, { "epoch": 0.11, "grad_norm": 0.25610455444964525, "learning_rate": 0.00019860336363554973, "loss": 0.6012, "step": 188 }, { "epoch": 0.11, "grad_norm": 0.24760444410184018, "learning_rate": 0.0001985876641587504, "loss": 0.5066, "step": 189 }, { "epoch": 0.11, "grad_norm": 0.2614264060863463, "learning_rate": 0.00019857187756364958, "loss": 0.5792, "step": 190 }, { "epoch": 0.11, "grad_norm": 0.27219045408444215, "learning_rate": 0.00019855600386419744, "loss": 0.543, "step": 191 }, { "epoch": 0.11, "grad_norm": 0.24606131498871828, "learning_rate": 0.00019854004307442088, "loss": 0.5676, "step": 192 }, { "epoch": 0.12, "grad_norm": 0.28394763236035964, "learning_rate": 0.0001985239952084239, "loss": 0.6032, "step": 193 }, { "epoch": 0.12, "grad_norm": 0.28350170917034406, "learning_rate": 0.0001985078602803874, "loss": 0.6264, "step": 194 }, { "epoch": 0.12, "grad_norm": 0.24011552907338696, "learning_rate": 0.00019849163830456922, "loss": 0.4793, "step": 195 }, { "epoch": 0.12, "grad_norm": 0.2561209086280576, "learning_rate": 0.00019847532929530415, "loss": 0.6198, "step": 196 }, { "epoch": 0.12, "grad_norm": 0.23712886628255178, "learning_rate": 0.00019845893326700384, "loss": 0.4989, "step": 197 }, { "epoch": 0.12, "grad_norm": 0.26720592489417233, "learning_rate": 0.00019844245023415685, "loss": 0.4934, "step": 198 }, { "epoch": 0.12, "grad_norm": 0.2753251397417421, "learning_rate": 0.0001984258802113287, "loss": 0.5544, "step": 199 }, { "epoch": 0.12, "grad_norm": 0.2557869713293877, "learning_rate": 0.0001984092232131616, "loss": 0.5643, "step": 200 }, { "epoch": 0.12, "grad_norm": 0.2669651919314609, "learning_rate": 0.0001983924792543748, "loss": 0.5879, "step": 201 }, { "epoch": 0.12, "grad_norm": 0.25579187132615644, "learning_rate": 0.00019837564834976432, "loss": 0.5742, "step": 202 }, { "epoch": 0.12, "grad_norm": 0.2550207949421237, "learning_rate": 0.000198358730514203, "loss": 0.574, "step": 203 }, { "epoch": 0.12, "grad_norm": 0.23565090455665763, "learning_rate": 0.0001983417257626405, "loss": 0.5299, "step": 204 }, { "epoch": 0.12, "grad_norm": 0.236980034600526, "learning_rate": 0.00019832463411010331, "loss": 0.5199, "step": 205 }, { "epoch": 0.12, "grad_norm": 0.2434029841189093, "learning_rate": 0.0001983074555716947, "loss": 0.5477, "step": 206 }, { "epoch": 0.12, "grad_norm": 0.24771511082227154, "learning_rate": 0.00019829019016259468, "loss": 0.5697, "step": 207 }, { "epoch": 0.12, "grad_norm": 0.23705880771864213, "learning_rate": 0.00019827283789806011, "loss": 0.521, "step": 208 }, { "epoch": 0.12, "grad_norm": 0.24167295291353477, "learning_rate": 0.0001982553987934245, "loss": 0.558, "step": 209 }, { "epoch": 0.13, "grad_norm": 0.2535406245356529, "learning_rate": 0.0001982378728640982, "loss": 0.5693, "step": 210 }, { "epoch": 0.13, "grad_norm": 0.24865334136075806, "learning_rate": 0.00019822026012556818, "loss": 0.5499, "step": 211 }, { "epoch": 0.13, "grad_norm": 0.2544751551481819, "learning_rate": 0.0001982025605933982, "loss": 0.5449, "step": 212 }, { "epoch": 0.13, "grad_norm": 0.2599391794330939, "learning_rate": 0.0001981847742832287, "loss": 0.6222, "step": 213 }, { "epoch": 0.13, "grad_norm": 0.23171934920449544, "learning_rate": 0.00019816690121077674, "loss": 0.5448, "step": 214 }, { "epoch": 0.13, "grad_norm": 0.24380268930715565, "learning_rate": 0.00019814894139183614, "loss": 0.5773, "step": 215 }, { "epoch": 0.13, "grad_norm": 0.2518755786374484, "learning_rate": 0.00019813089484227732, "loss": 0.5479, "step": 216 }, { "epoch": 0.13, "grad_norm": 0.23133984467720642, "learning_rate": 0.00019811276157804733, "loss": 0.471, "step": 217 }, { "epoch": 0.13, "grad_norm": 0.250968947574734, "learning_rate": 0.00019809454161516993, "loss": 0.5738, "step": 218 }, { "epoch": 0.13, "grad_norm": 0.25976015596485974, "learning_rate": 0.00019807623496974537, "loss": 0.5592, "step": 219 }, { "epoch": 0.13, "grad_norm": 0.24400199531999783, "learning_rate": 0.0001980578416579506, "loss": 0.5266, "step": 220 }, { "epoch": 0.13, "grad_norm": 0.24001572180370875, "learning_rate": 0.00019803936169603912, "loss": 0.5843, "step": 221 }, { "epoch": 0.13, "grad_norm": 0.22867208326764507, "learning_rate": 0.00019802079510034096, "loss": 0.518, "step": 222 }, { "epoch": 0.13, "grad_norm": 0.2381724022911579, "learning_rate": 0.00019800214188726276, "loss": 0.5175, "step": 223 }, { "epoch": 0.13, "grad_norm": 0.2700455397530704, "learning_rate": 0.00019798340207328766, "loss": 0.5804, "step": 224 }, { "epoch": 0.13, "grad_norm": 0.24320219539003604, "learning_rate": 0.00019796457567497537, "loss": 0.5304, "step": 225 }, { "epoch": 0.13, "grad_norm": 0.2370472610839002, "learning_rate": 0.0001979456627089621, "loss": 0.5671, "step": 226 }, { "epoch": 0.14, "grad_norm": 0.26756212643991917, "learning_rate": 0.0001979266631919605, "loss": 0.5528, "step": 227 }, { "epoch": 0.14, "grad_norm": 0.24929490389202372, "learning_rate": 0.00019790757714075979, "loss": 0.5407, "step": 228 }, { "epoch": 0.14, "grad_norm": 0.23090595152280974, "learning_rate": 0.00019788840457222556, "loss": 0.5258, "step": 229 }, { "epoch": 0.14, "grad_norm": 0.2979564627406142, "learning_rate": 0.0001978691455033, "loss": 0.5367, "step": 230 }, { "epoch": 0.14, "grad_norm": 0.24228845587479894, "learning_rate": 0.0001978497999510015, "loss": 0.5344, "step": 231 }, { "epoch": 0.14, "grad_norm": 0.25363482164729867, "learning_rate": 0.00019783036793242516, "loss": 0.5669, "step": 232 }, { "epoch": 0.14, "grad_norm": 0.23622712417060854, "learning_rate": 0.00019781084946474226, "loss": 0.5797, "step": 233 }, { "epoch": 0.14, "grad_norm": 0.21594682302559634, "learning_rate": 0.00019779124456520056, "loss": 0.5011, "step": 234 }, { "epoch": 0.14, "grad_norm": 0.24211833950801223, "learning_rate": 0.0001977715532511242, "loss": 0.5164, "step": 235 }, { "epoch": 0.14, "grad_norm": 0.2693820157715391, "learning_rate": 0.0001977517755399137, "loss": 0.5806, "step": 236 }, { "epoch": 0.14, "grad_norm": 0.24734981937576542, "learning_rate": 0.00019773191144904586, "loss": 0.5233, "step": 237 }, { "epoch": 0.14, "grad_norm": 0.2741663737268136, "learning_rate": 0.00019771196099607386, "loss": 0.5402, "step": 238 }, { "epoch": 0.14, "grad_norm": 0.2521489075033339, "learning_rate": 0.00019769192419862716, "loss": 0.5862, "step": 239 }, { "epoch": 0.14, "grad_norm": 0.2633255671236209, "learning_rate": 0.0001976718010744116, "loss": 0.548, "step": 240 }, { "epoch": 0.14, "grad_norm": 0.2691399721238696, "learning_rate": 0.00019765159164120916, "loss": 0.5648, "step": 241 }, { "epoch": 0.14, "grad_norm": 0.25501545746966797, "learning_rate": 0.00019763129591687827, "loss": 0.5602, "step": 242 }, { "epoch": 0.14, "grad_norm": 0.3049976908839563, "learning_rate": 0.00019761091391935347, "loss": 0.5508, "step": 243 }, { "epoch": 0.15, "grad_norm": 0.24467331400916031, "learning_rate": 0.00019759044566664558, "loss": 0.5229, "step": 244 }, { "epoch": 0.15, "grad_norm": 0.27011009612786374, "learning_rate": 0.00019756989117684164, "loss": 0.5448, "step": 245 }, { "epoch": 0.15, "grad_norm": 0.24427143044387528, "learning_rate": 0.00019754925046810493, "loss": 0.5435, "step": 246 }, { "epoch": 0.15, "grad_norm": 0.22753961311031143, "learning_rate": 0.00019752852355867486, "loss": 0.5369, "step": 247 }, { "epoch": 0.15, "grad_norm": 0.23865046003559778, "learning_rate": 0.00019750771046686704, "loss": 0.5354, "step": 248 }, { "epoch": 0.15, "grad_norm": 0.2736283930569903, "learning_rate": 0.00019748681121107325, "loss": 0.5588, "step": 249 }, { "epoch": 0.15, "grad_norm": 0.24727127426749082, "learning_rate": 0.00019746582580976136, "loss": 0.5753, "step": 250 }, { "epoch": 0.15, "grad_norm": 0.2828829340227291, "learning_rate": 0.00019744475428147546, "loss": 0.6793, "step": 251 }, { "epoch": 0.15, "grad_norm": 0.21818184663896711, "learning_rate": 0.00019742359664483563, "loss": 0.5248, "step": 252 }, { "epoch": 0.15, "grad_norm": 0.2320708306833192, "learning_rate": 0.00019740235291853812, "loss": 0.5461, "step": 253 }, { "epoch": 0.15, "grad_norm": 0.25703347930088793, "learning_rate": 0.00019738102312135523, "loss": 0.5713, "step": 254 }, { "epoch": 0.15, "grad_norm": 0.24399588128874033, "learning_rate": 0.0001973596072721353, "loss": 0.5178, "step": 255 }, { "epoch": 0.15, "grad_norm": 0.2229881452119291, "learning_rate": 0.00019733810538980281, "loss": 0.5144, "step": 256 }, { "epoch": 0.15, "grad_norm": 0.23889465035265364, "learning_rate": 0.0001973165174933581, "loss": 0.5727, "step": 257 }, { "epoch": 0.15, "grad_norm": 0.25790569964877214, "learning_rate": 0.0001972948436018776, "loss": 0.5659, "step": 258 }, { "epoch": 0.15, "grad_norm": 0.22511338701135042, "learning_rate": 0.00019727308373451377, "loss": 0.5292, "step": 259 }, { "epoch": 0.16, "grad_norm": 0.23111498863739158, "learning_rate": 0.000197251237910495, "loss": 0.5267, "step": 260 }, { "epoch": 0.16, "grad_norm": 0.23740021982137896, "learning_rate": 0.00019722930614912563, "loss": 0.5499, "step": 261 }, { "epoch": 0.16, "grad_norm": 0.24020258332985106, "learning_rate": 0.00019720728846978598, "loss": 0.5604, "step": 262 }, { "epoch": 0.16, "grad_norm": 0.23947573439011133, "learning_rate": 0.00019718518489193225, "loss": 0.5638, "step": 263 }, { "epoch": 0.16, "grad_norm": 0.23526187217481284, "learning_rate": 0.00019716299543509654, "loss": 0.5436, "step": 264 }, { "epoch": 0.16, "grad_norm": 0.2505831003191156, "learning_rate": 0.00019714072011888686, "loss": 0.5039, "step": 265 }, { "epoch": 0.16, "grad_norm": 0.21592487431784965, "learning_rate": 0.00019711835896298713, "loss": 0.484, "step": 266 }, { "epoch": 0.16, "grad_norm": 0.26528309878122613, "learning_rate": 0.00019709591198715707, "loss": 0.539, "step": 267 }, { "epoch": 0.16, "grad_norm": 0.21635789850987178, "learning_rate": 0.00019707337921123221, "loss": 0.5553, "step": 268 }, { "epoch": 0.16, "grad_norm": 0.23671151623321054, "learning_rate": 0.00019705076065512398, "loss": 0.4968, "step": 269 }, { "epoch": 0.16, "grad_norm": 0.25400871793456326, "learning_rate": 0.00019702805633881957, "loss": 0.5982, "step": 270 }, { "epoch": 0.16, "grad_norm": 0.2622810971314154, "learning_rate": 0.0001970052662823819, "loss": 0.5879, "step": 271 }, { "epoch": 0.16, "grad_norm": 0.22931814830456296, "learning_rate": 0.00019698239050594977, "loss": 0.5611, "step": 272 }, { "epoch": 0.16, "grad_norm": 0.213695866193263, "learning_rate": 0.0001969594290297376, "loss": 0.5386, "step": 273 }, { "epoch": 0.16, "grad_norm": 0.2431252328609808, "learning_rate": 0.00019693638187403563, "loss": 0.6039, "step": 274 }, { "epoch": 0.16, "grad_norm": 0.23108667253454973, "learning_rate": 0.00019691324905920984, "loss": 0.5579, "step": 275 }, { "epoch": 0.16, "grad_norm": 0.22718831415064272, "learning_rate": 0.0001968900306057018, "loss": 0.5196, "step": 276 }, { "epoch": 0.17, "grad_norm": 0.23632362967796033, "learning_rate": 0.0001968667265340288, "loss": 0.5336, "step": 277 }, { "epoch": 0.17, "grad_norm": 0.2421878973201691, "learning_rate": 0.00019684333686478383, "loss": 0.5928, "step": 278 }, { "epoch": 0.17, "grad_norm": 0.225775487602821, "learning_rate": 0.00019681986161863542, "loss": 0.552, "step": 279 }, { "epoch": 0.17, "grad_norm": 0.23037375759338816, "learning_rate": 0.00019679630081632782, "loss": 0.4983, "step": 280 }, { "epoch": 0.17, "grad_norm": 0.24684136612832333, "learning_rate": 0.00019677265447868086, "loss": 0.5655, "step": 281 }, { "epoch": 0.17, "grad_norm": 0.2412756534364674, "learning_rate": 0.0001967489226265899, "loss": 0.5063, "step": 282 }, { "epoch": 0.17, "grad_norm": 0.22005765622474396, "learning_rate": 0.00019672510528102597, "loss": 0.5188, "step": 283 }, { "epoch": 0.17, "grad_norm": 0.25071531725514384, "learning_rate": 0.0001967012024630355, "loss": 0.5938, "step": 284 }, { "epoch": 0.17, "grad_norm": 0.22139592405512468, "learning_rate": 0.00019667721419374065, "loss": 0.4917, "step": 285 }, { "epoch": 0.17, "grad_norm": 0.23067244251762076, "learning_rate": 0.00019665314049433888, "loss": 0.5584, "step": 286 }, { "epoch": 0.17, "grad_norm": 0.23829875535152545, "learning_rate": 0.00019662898138610323, "loss": 0.5264, "step": 287 }, { "epoch": 0.17, "grad_norm": 0.2641034663020514, "learning_rate": 0.00019660473689038228, "loss": 0.5805, "step": 288 }, { "epoch": 0.17, "grad_norm": 0.22321690487140503, "learning_rate": 0.00019658040702859997, "loss": 0.5529, "step": 289 }, { "epoch": 0.17, "grad_norm": 0.2502632163555589, "learning_rate": 0.00019655599182225565, "loss": 0.5347, "step": 290 }, { "epoch": 0.17, "grad_norm": 0.2392608020883604, "learning_rate": 0.00019653149129292426, "loss": 0.5263, "step": 291 }, { "epoch": 0.17, "grad_norm": 0.2539237490519494, "learning_rate": 0.00019650690546225592, "loss": 0.5156, "step": 292 }, { "epoch": 0.17, "grad_norm": 0.21964099103511592, "learning_rate": 0.00019648223435197627, "loss": 0.5101, "step": 293 }, { "epoch": 0.18, "grad_norm": 0.24992985700157416, "learning_rate": 0.00019645747798388628, "loss": 0.5621, "step": 294 }, { "epoch": 0.18, "grad_norm": 0.2559439615345381, "learning_rate": 0.0001964326363798622, "loss": 0.5753, "step": 295 }, { "epoch": 0.18, "grad_norm": 0.2504368010690795, "learning_rate": 0.00019640770956185567, "loss": 0.5558, "step": 296 }, { "epoch": 0.18, "grad_norm": 0.21022187251218089, "learning_rate": 0.0001963826975518936, "loss": 0.5322, "step": 297 }, { "epoch": 0.18, "grad_norm": 0.2422143856532352, "learning_rate": 0.00019635760037207817, "loss": 0.538, "step": 298 }, { "epoch": 0.18, "grad_norm": 0.23174554470584352, "learning_rate": 0.00019633241804458687, "loss": 0.5545, "step": 299 }, { "epoch": 0.18, "grad_norm": 0.267070426953347, "learning_rate": 0.00019630715059167238, "loss": 0.5632, "step": 300 }, { "epoch": 0.18, "grad_norm": 0.22256523603127878, "learning_rate": 0.0001962817980356626, "loss": 0.545, "step": 301 }, { "epoch": 0.18, "grad_norm": 0.24403300497950306, "learning_rate": 0.0001962563603989607, "loss": 0.5448, "step": 302 }, { "epoch": 0.18, "grad_norm": 0.2193103621019292, "learning_rate": 0.00019623083770404492, "loss": 0.5064, "step": 303 }, { "epoch": 0.18, "grad_norm": 0.23299992325798072, "learning_rate": 0.0001962052299734688, "loss": 0.5192, "step": 304 }, { "epoch": 0.18, "grad_norm": 0.2371054150083945, "learning_rate": 0.00019617953722986096, "loss": 0.5157, "step": 305 }, { "epoch": 0.18, "grad_norm": 0.2436064901189273, "learning_rate": 0.00019615375949592504, "loss": 0.5672, "step": 306 }, { "epoch": 0.18, "grad_norm": 0.25098365347678436, "learning_rate": 0.00019612789679443997, "loss": 0.5548, "step": 307 }, { "epoch": 0.18, "grad_norm": 0.2319425382974216, "learning_rate": 0.00019610194914825962, "loss": 0.5293, "step": 308 }, { "epoch": 0.18, "grad_norm": 0.24156576209403272, "learning_rate": 0.000196075916580313, "loss": 0.5672, "step": 309 }, { "epoch": 0.18, "grad_norm": 0.2337383575844323, "learning_rate": 0.0001960497991136041, "loss": 0.5509, "step": 310 }, { "epoch": 0.19, "grad_norm": 0.23799692988502053, "learning_rate": 0.00019602359677121199, "loss": 0.5604, "step": 311 }, { "epoch": 0.19, "grad_norm": 0.2296728275122706, "learning_rate": 0.0001959973095762907, "loss": 0.5371, "step": 312 }, { "epoch": 0.19, "grad_norm": 0.22381626870035518, "learning_rate": 0.00019597093755206936, "loss": 0.5465, "step": 313 }, { "epoch": 0.19, "grad_norm": 0.23335681761234933, "learning_rate": 0.00019594448072185182, "loss": 0.5386, "step": 314 }, { "epoch": 0.19, "grad_norm": 0.22582265649304345, "learning_rate": 0.00019591793910901707, "loss": 0.543, "step": 315 }, { "epoch": 0.19, "grad_norm": 0.2439330072441743, "learning_rate": 0.00019589131273701894, "loss": 0.5177, "step": 316 }, { "epoch": 0.19, "grad_norm": 0.2138593422237162, "learning_rate": 0.00019586460162938622, "loss": 0.5157, "step": 317 }, { "epoch": 0.19, "grad_norm": 0.24003613679646058, "learning_rate": 0.00019583780580972253, "loss": 0.5611, "step": 318 }, { "epoch": 0.19, "grad_norm": 0.2552582800734971, "learning_rate": 0.00019581092530170633, "loss": 0.5922, "step": 319 }, { "epoch": 0.19, "grad_norm": 0.21898423827197905, "learning_rate": 0.00019578396012909092, "loss": 0.5272, "step": 320 }, { "epoch": 0.19, "grad_norm": 0.22013525107478477, "learning_rate": 0.00019575691031570446, "loss": 0.5184, "step": 321 }, { "epoch": 0.19, "grad_norm": 0.21113180640163418, "learning_rate": 0.00019572977588544986, "loss": 0.5134, "step": 322 }, { "epoch": 0.19, "grad_norm": 0.22335860079943387, "learning_rate": 0.00019570255686230485, "loss": 0.5227, "step": 323 }, { "epoch": 0.19, "grad_norm": 0.23006684721287293, "learning_rate": 0.00019567525327032187, "loss": 0.5885, "step": 324 }, { "epoch": 0.19, "grad_norm": 0.21933564641390155, "learning_rate": 0.0001956478651336281, "loss": 0.5598, "step": 325 }, { "epoch": 0.19, "grad_norm": 0.21770749652400337, "learning_rate": 0.00019562039247642546, "loss": 0.5082, "step": 326 }, { "epoch": 0.2, "grad_norm": 0.22800344133653658, "learning_rate": 0.00019559283532299043, "loss": 0.5539, "step": 327 }, { "epoch": 0.2, "grad_norm": 0.2385574924163353, "learning_rate": 0.00019556519369767438, "loss": 0.5497, "step": 328 }, { "epoch": 0.2, "grad_norm": 0.23099538598172079, "learning_rate": 0.0001955374676249031, "loss": 0.5138, "step": 329 }, { "epoch": 0.2, "grad_norm": 0.21517217725478144, "learning_rate": 0.0001955096571291772, "loss": 0.5051, "step": 330 }, { "epoch": 0.2, "grad_norm": 0.21535771106277588, "learning_rate": 0.0001954817622350717, "loss": 0.524, "step": 331 }, { "epoch": 0.2, "grad_norm": 0.20361747402971658, "learning_rate": 0.00019545378296723635, "loss": 0.4989, "step": 332 }, { "epoch": 0.2, "grad_norm": 0.24644921068325687, "learning_rate": 0.0001954257193503954, "loss": 0.5927, "step": 333 }, { "epoch": 0.2, "grad_norm": 0.24765268362172385, "learning_rate": 0.0001953975714093476, "loss": 0.5451, "step": 334 }, { "epoch": 0.2, "grad_norm": 0.20846277824915477, "learning_rate": 0.00019536933916896633, "loss": 0.5259, "step": 335 }, { "epoch": 0.2, "grad_norm": 0.2457371199220107, "learning_rate": 0.00019534102265419932, "loss": 0.5784, "step": 336 }, { "epoch": 0.2, "grad_norm": 0.23029745387228598, "learning_rate": 0.00019531262189006882, "loss": 0.5918, "step": 337 }, { "epoch": 0.2, "grad_norm": 0.2387820151516941, "learning_rate": 0.0001952841369016716, "loss": 0.5576, "step": 338 }, { "epoch": 0.2, "grad_norm": 0.226451643448924, "learning_rate": 0.00019525556771417875, "loss": 0.5241, "step": 339 }, { "epoch": 0.2, "grad_norm": 0.22086691724075064, "learning_rate": 0.00019522691435283585, "loss": 0.5392, "step": 340 }, { "epoch": 0.2, "grad_norm": 0.2259720671796772, "learning_rate": 0.00019519817684296285, "loss": 0.516, "step": 341 }, { "epoch": 0.2, "grad_norm": 0.2244741513315317, "learning_rate": 0.00019516935520995393, "loss": 0.569, "step": 342 }, { "epoch": 0.2, "grad_norm": 0.23890602213836412, "learning_rate": 0.0001951404494792778, "loss": 0.524, "step": 343 }, { "epoch": 0.21, "grad_norm": 0.22136745892767679, "learning_rate": 0.00019511145967647737, "loss": 0.5472, "step": 344 }, { "epoch": 0.21, "grad_norm": 0.22275740066078306, "learning_rate": 0.00019508238582716984, "loss": 0.5553, "step": 345 }, { "epoch": 0.21, "grad_norm": 0.21225155652808625, "learning_rate": 0.00019505322795704676, "loss": 0.5302, "step": 346 }, { "epoch": 0.21, "grad_norm": 0.22704101844750724, "learning_rate": 0.0001950239860918738, "loss": 0.5485, "step": 347 }, { "epoch": 0.21, "grad_norm": 0.2135110250199134, "learning_rate": 0.00019499466025749097, "loss": 0.5343, "step": 348 }, { "epoch": 0.21, "grad_norm": 0.22772242632973722, "learning_rate": 0.00019496525047981242, "loss": 0.5159, "step": 349 }, { "epoch": 0.21, "grad_norm": 0.4444297049160113, "learning_rate": 0.00019493575678482649, "loss": 0.5121, "step": 350 }, { "epoch": 0.21, "grad_norm": 0.226632712040011, "learning_rate": 0.0001949061791985957, "loss": 0.5304, "step": 351 }, { "epoch": 0.21, "grad_norm": 0.22132303156586286, "learning_rate": 0.00019487651774725663, "loss": 0.4817, "step": 352 }, { "epoch": 0.21, "grad_norm": 0.23206581340772667, "learning_rate": 0.00019484677245702004, "loss": 0.5258, "step": 353 }, { "epoch": 0.21, "grad_norm": 0.2374903834541946, "learning_rate": 0.0001948169433541708, "loss": 0.5318, "step": 354 }, { "epoch": 0.21, "grad_norm": 0.22896458770920267, "learning_rate": 0.00019478703046506773, "loss": 0.4806, "step": 355 }, { "epoch": 0.21, "grad_norm": 0.21040301706147688, "learning_rate": 0.00019475703381614375, "loss": 0.5144, "step": 356 }, { "epoch": 0.21, "grad_norm": 0.21179618454444762, "learning_rate": 0.00019472695343390585, "loss": 0.524, "step": 357 }, { "epoch": 0.21, "grad_norm": 0.20436614333218908, "learning_rate": 0.00019469678934493488, "loss": 0.501, "step": 358 }, { "epoch": 0.21, "grad_norm": 0.2478256130980173, "learning_rate": 0.0001946665415758858, "loss": 0.5386, "step": 359 }, { "epoch": 0.21, "grad_norm": 0.226116084636948, "learning_rate": 0.00019463621015348748, "loss": 0.5101, "step": 360 }, { "epoch": 0.22, "grad_norm": 0.21838947264457534, "learning_rate": 0.00019460579510454263, "loss": 0.5296, "step": 361 }, { "epoch": 0.22, "grad_norm": 0.2152879498375444, "learning_rate": 0.00019457529645592792, "loss": 0.512, "step": 362 }, { "epoch": 0.22, "grad_norm": 0.22514971802642378, "learning_rate": 0.00019454471423459389, "loss": 0.5593, "step": 363 }, { "epoch": 0.22, "grad_norm": 0.23402386101532432, "learning_rate": 0.00019451404846756494, "loss": 0.555, "step": 364 }, { "epoch": 0.22, "grad_norm": 0.2244514913016572, "learning_rate": 0.00019448329918193927, "loss": 0.5689, "step": 365 }, { "epoch": 0.22, "grad_norm": 0.22260707231596893, "learning_rate": 0.00019445246640488893, "loss": 0.6062, "step": 366 }, { "epoch": 0.22, "grad_norm": 0.21791090145253736, "learning_rate": 0.00019442155016365965, "loss": 0.531, "step": 367 }, { "epoch": 0.22, "grad_norm": 0.21895889257567258, "learning_rate": 0.00019439055048557101, "loss": 0.5538, "step": 368 }, { "epoch": 0.22, "grad_norm": 0.21306696799872818, "learning_rate": 0.00019435946739801633, "loss": 0.5673, "step": 369 }, { "epoch": 0.22, "grad_norm": 0.2294607768810707, "learning_rate": 0.00019432830092846253, "loss": 0.5855, "step": 370 }, { "epoch": 0.22, "grad_norm": 0.22758409665267085, "learning_rate": 0.0001942970511044503, "loss": 0.5783, "step": 371 }, { "epoch": 0.22, "grad_norm": 0.21334100614200935, "learning_rate": 0.00019426571795359398, "loss": 0.5056, "step": 372 }, { "epoch": 0.22, "grad_norm": 0.24187624093875965, "learning_rate": 0.0001942343015035815, "loss": 0.543, "step": 373 }, { "epoch": 0.22, "grad_norm": 0.2275714845035408, "learning_rate": 0.00019420280178217443, "loss": 0.5329, "step": 374 }, { "epoch": 0.22, "grad_norm": 0.23237641477505608, "learning_rate": 0.00019417121881720793, "loss": 0.5134, "step": 375 }, { "epoch": 0.22, "grad_norm": 0.25196886008416386, "learning_rate": 0.0001941395526365907, "loss": 0.6023, "step": 376 }, { "epoch": 0.22, "grad_norm": 0.22418514390796682, "learning_rate": 0.00019410780326830498, "loss": 0.5529, "step": 377 }, { "epoch": 0.23, "grad_norm": 0.21438856736265666, "learning_rate": 0.0001940759707404065, "loss": 0.5134, "step": 378 }, { "epoch": 0.23, "grad_norm": 0.2331754234870151, "learning_rate": 0.00019404405508102455, "loss": 0.5406, "step": 379 }, { "epoch": 0.23, "grad_norm": 0.24908239322819828, "learning_rate": 0.00019401205631836178, "loss": 0.5377, "step": 380 }, { "epoch": 0.23, "grad_norm": 0.21332745391417657, "learning_rate": 0.00019397997448069435, "loss": 0.5025, "step": 381 }, { "epoch": 0.23, "grad_norm": 0.20749658696001225, "learning_rate": 0.00019394780959637177, "loss": 0.5257, "step": 382 }, { "epoch": 0.23, "grad_norm": 0.2237716482529178, "learning_rate": 0.000193915561693817, "loss": 0.5, "step": 383 }, { "epoch": 0.23, "grad_norm": 0.25234282015654147, "learning_rate": 0.00019388323080152633, "loss": 0.5753, "step": 384 }, { "epoch": 0.23, "grad_norm": 0.2252939326339829, "learning_rate": 0.00019385081694806936, "loss": 0.5662, "step": 385 }, { "epoch": 0.23, "grad_norm": 0.21979629294660186, "learning_rate": 0.00019381832016208904, "loss": 0.5141, "step": 386 }, { "epoch": 0.23, "grad_norm": 0.24762535901866153, "learning_rate": 0.0001937857404723016, "loss": 0.6193, "step": 387 }, { "epoch": 0.23, "grad_norm": 0.25032044234085526, "learning_rate": 0.00019375307790749647, "loss": 0.5024, "step": 388 }, { "epoch": 0.23, "grad_norm": 0.22892425302508923, "learning_rate": 0.0001937203324965364, "loss": 0.5401, "step": 389 }, { "epoch": 0.23, "grad_norm": 0.2461599771002527, "learning_rate": 0.0001936875042683573, "loss": 0.5301, "step": 390 }, { "epoch": 0.23, "grad_norm": 0.22363255721865732, "learning_rate": 0.00019365459325196825, "loss": 0.5538, "step": 391 }, { "epoch": 0.23, "grad_norm": 0.22482667580972365, "learning_rate": 0.00019362159947645152, "loss": 0.4928, "step": 392 }, { "epoch": 0.23, "grad_norm": 0.22869596173751142, "learning_rate": 0.00019358852297096253, "loss": 0.5546, "step": 393 }, { "epoch": 0.24, "grad_norm": 0.2274546469780496, "learning_rate": 0.00019355536376472972, "loss": 0.5763, "step": 394 }, { "epoch": 0.24, "grad_norm": 0.21284874650406885, "learning_rate": 0.0001935221218870547, "loss": 0.5778, "step": 395 }, { "epoch": 0.24, "grad_norm": 0.23158847478661296, "learning_rate": 0.0001934887973673121, "loss": 0.5654, "step": 396 }, { "epoch": 0.24, "grad_norm": 0.24510006704514478, "learning_rate": 0.0001934553902349496, "loss": 0.5053, "step": 397 }, { "epoch": 0.24, "grad_norm": 0.20330878586204656, "learning_rate": 0.00019342190051948777, "loss": 0.5171, "step": 398 }, { "epoch": 0.24, "grad_norm": 0.2131804710318274, "learning_rate": 0.0001933883282505203, "loss": 0.5286, "step": 399 }, { "epoch": 0.24, "grad_norm": 0.23297933515492006, "learning_rate": 0.00019335467345771377, "loss": 0.5593, "step": 400 }, { "epoch": 0.24, "grad_norm": 0.24611434220143105, "learning_rate": 0.0001933209361708077, "loss": 0.604, "step": 401 }, { "epoch": 0.24, "grad_norm": 0.23281321736249425, "learning_rate": 0.00019328711641961445, "loss": 0.5579, "step": 402 }, { "epoch": 0.24, "grad_norm": 0.21399822113981087, "learning_rate": 0.00019325321423401933, "loss": 0.5661, "step": 403 }, { "epoch": 0.24, "grad_norm": 0.22113107520723113, "learning_rate": 0.00019321922964398046, "loss": 0.5789, "step": 404 }, { "epoch": 0.24, "grad_norm": 0.23262880002546846, "learning_rate": 0.00019318516267952874, "loss": 0.5447, "step": 405 }, { "epoch": 0.24, "grad_norm": 0.24962941770082592, "learning_rate": 0.00019315101337076792, "loss": 0.5512, "step": 406 }, { "epoch": 0.24, "grad_norm": 0.22210049422713798, "learning_rate": 0.0001931167817478745, "loss": 0.5427, "step": 407 }, { "epoch": 0.24, "grad_norm": 0.22647809883332484, "learning_rate": 0.0001930824678410977, "loss": 0.4888, "step": 408 }, { "epoch": 0.24, "grad_norm": 0.23660763255678552, "learning_rate": 0.00019304807168075944, "loss": 0.5755, "step": 409 }, { "epoch": 0.24, "grad_norm": 0.2354103448271752, "learning_rate": 0.00019301359329725436, "loss": 0.5265, "step": 410 }, { "epoch": 0.25, "grad_norm": 0.24322261128085423, "learning_rate": 0.00019297903272104977, "loss": 0.5291, "step": 411 }, { "epoch": 0.25, "grad_norm": 0.20525199182278092, "learning_rate": 0.00019294438998268554, "loss": 0.4996, "step": 412 }, { "epoch": 0.25, "grad_norm": 0.24678535182755174, "learning_rate": 0.00019290966511277422, "loss": 0.567, "step": 413 }, { "epoch": 0.25, "grad_norm": 0.22165331172413838, "learning_rate": 0.00019287485814200087, "loss": 0.5348, "step": 414 }, { "epoch": 0.25, "grad_norm": 0.24541020782476444, "learning_rate": 0.00019283996910112318, "loss": 0.5432, "step": 415 }, { "epoch": 0.25, "grad_norm": 0.2255959168063083, "learning_rate": 0.00019280499802097126, "loss": 0.5891, "step": 416 }, { "epoch": 0.25, "grad_norm": 0.21159018099714821, "learning_rate": 0.0001927699449324478, "loss": 0.5003, "step": 417 }, { "epoch": 0.25, "grad_norm": 0.21379995902020923, "learning_rate": 0.00019273480986652794, "loss": 0.5314, "step": 418 }, { "epoch": 0.25, "grad_norm": 0.2853169518220406, "learning_rate": 0.0001926995928542592, "loss": 0.6108, "step": 419 }, { "epoch": 0.25, "grad_norm": 0.22738285867292138, "learning_rate": 0.00019266429392676164, "loss": 0.5217, "step": 420 }, { "epoch": 0.25, "grad_norm": 0.23835369502554374, "learning_rate": 0.00019262891311522755, "loss": 0.5318, "step": 421 }, { "epoch": 0.25, "grad_norm": 0.20671557324330114, "learning_rate": 0.0001925934504509217, "loss": 0.5234, "step": 422 }, { "epoch": 0.25, "grad_norm": 0.205212164360302, "learning_rate": 0.00019255790596518112, "loss": 0.5023, "step": 423 }, { "epoch": 0.25, "grad_norm": 0.21664090577036152, "learning_rate": 0.00019252227968941522, "loss": 0.5452, "step": 424 }, { "epoch": 0.25, "grad_norm": 0.22146041084684798, "learning_rate": 0.00019248657165510556, "loss": 0.5474, "step": 425 }, { "epoch": 0.25, "grad_norm": 0.2338997589574809, "learning_rate": 0.00019245078189380604, "loss": 0.5516, "step": 426 }, { "epoch": 0.25, "grad_norm": 0.2313978280927526, "learning_rate": 0.0001924149104371428, "loss": 0.5831, "step": 427 }, { "epoch": 0.26, "grad_norm": 0.2098577112814155, "learning_rate": 0.00019237895731681408, "loss": 0.5452, "step": 428 }, { "epoch": 0.26, "grad_norm": 0.26497439164374026, "learning_rate": 0.0001923429225645904, "loss": 0.5666, "step": 429 }, { "epoch": 0.26, "grad_norm": 0.21859970576834997, "learning_rate": 0.00019230680621231425, "loss": 0.5069, "step": 430 }, { "epoch": 0.26, "grad_norm": 0.20509380886351694, "learning_rate": 0.0001922706082919004, "loss": 0.4573, "step": 431 }, { "epoch": 0.26, "grad_norm": 0.2182328366507935, "learning_rate": 0.0001922343288353356, "loss": 0.6133, "step": 432 }, { "epoch": 0.26, "grad_norm": 0.2822350271273954, "learning_rate": 0.00019219796787467867, "loss": 0.5709, "step": 433 }, { "epoch": 0.26, "grad_norm": 0.24487543268473794, "learning_rate": 0.00019216152544206049, "loss": 0.546, "step": 434 }, { "epoch": 0.26, "grad_norm": 0.24221176090281485, "learning_rate": 0.00019212500156968383, "loss": 0.5507, "step": 435 }, { "epoch": 0.26, "grad_norm": 0.22053929296251015, "learning_rate": 0.00019208839628982358, "loss": 0.5473, "step": 436 }, { "epoch": 0.26, "grad_norm": 0.22975415570737245, "learning_rate": 0.00019205170963482643, "loss": 0.5181, "step": 437 }, { "epoch": 0.26, "grad_norm": 0.22969105575505203, "learning_rate": 0.00019201494163711104, "loss": 0.5463, "step": 438 }, { "epoch": 0.26, "grad_norm": 0.23764087103158363, "learning_rate": 0.00019197809232916795, "loss": 0.55, "step": 439 }, { "epoch": 0.26, "grad_norm": 0.21997498488474826, "learning_rate": 0.00019194116174355954, "loss": 0.5421, "step": 440 }, { "epoch": 0.26, "grad_norm": 0.22225824990596896, "learning_rate": 0.00019190414991291998, "loss": 0.5439, "step": 441 }, { "epoch": 0.26, "grad_norm": 0.243391488050543, "learning_rate": 0.00019186705686995533, "loss": 0.6289, "step": 442 }, { "epoch": 0.26, "grad_norm": 0.222494273038652, "learning_rate": 0.0001918298826474433, "loss": 0.5088, "step": 443 }, { "epoch": 0.26, "grad_norm": 0.22114450997419682, "learning_rate": 0.0001917926272782334, "loss": 0.5624, "step": 444 }, { "epoch": 0.27, "grad_norm": 0.21964760504534894, "learning_rate": 0.00019175529079524687, "loss": 0.5289, "step": 445 }, { "epoch": 0.27, "grad_norm": 0.3042847973140014, "learning_rate": 0.00019171787323147654, "loss": 0.5328, "step": 446 }, { "epoch": 0.27, "grad_norm": 0.22425571202210934, "learning_rate": 0.00019168037461998695, "loss": 0.5699, "step": 447 }, { "epoch": 0.27, "grad_norm": 0.23406959191320909, "learning_rate": 0.00019164279499391427, "loss": 0.5147, "step": 448 }, { "epoch": 0.27, "grad_norm": 0.3604500123158513, "learning_rate": 0.00019160513438646617, "loss": 0.5697, "step": 449 }, { "epoch": 0.27, "grad_norm": 0.2501436029131694, "learning_rate": 0.00019156739283092205, "loss": 0.6015, "step": 450 }, { "epoch": 0.27, "grad_norm": 0.21928141490521824, "learning_rate": 0.00019152957036063265, "loss": 0.5111, "step": 451 }, { "epoch": 0.27, "grad_norm": 0.257908225365161, "learning_rate": 0.00019149166700902032, "loss": 0.5132, "step": 452 }, { "epoch": 0.27, "grad_norm": 0.2713678867101362, "learning_rate": 0.0001914536828095789, "loss": 0.5995, "step": 453 }, { "epoch": 0.27, "grad_norm": 0.2398794022246256, "learning_rate": 0.0001914156177958736, "loss": 0.4993, "step": 454 }, { "epoch": 0.27, "grad_norm": 0.2373981477389832, "learning_rate": 0.0001913774720015411, "loss": 0.5064, "step": 455 }, { "epoch": 0.27, "grad_norm": 0.2188011093608266, "learning_rate": 0.00019133924546028942, "loss": 0.5606, "step": 456 }, { "epoch": 0.27, "grad_norm": 0.24077263566935142, "learning_rate": 0.00019130093820589791, "loss": 0.5606, "step": 457 }, { "epoch": 0.27, "grad_norm": 0.23519919814487683, "learning_rate": 0.00019126255027221735, "loss": 0.5307, "step": 458 }, { "epoch": 0.27, "grad_norm": 0.21480730775028578, "learning_rate": 0.00019122408169316976, "loss": 0.526, "step": 459 }, { "epoch": 0.27, "grad_norm": 0.2161668548042441, "learning_rate": 0.00019118553250274832, "loss": 0.5657, "step": 460 }, { "epoch": 0.28, "grad_norm": 0.22318400428439122, "learning_rate": 0.00019114690273501765, "loss": 0.513, "step": 461 }, { "epoch": 0.28, "grad_norm": 0.22252447744680176, "learning_rate": 0.00019110819242411337, "loss": 0.5247, "step": 462 }, { "epoch": 0.28, "grad_norm": 0.21358818358042153, "learning_rate": 0.00019106940160424244, "loss": 0.556, "step": 463 }, { "epoch": 0.28, "grad_norm": 0.2121229259271081, "learning_rate": 0.0001910305303096828, "loss": 0.5138, "step": 464 }, { "epoch": 0.28, "grad_norm": 0.22636146624511622, "learning_rate": 0.0001909915785747836, "loss": 0.5111, "step": 465 }, { "epoch": 0.28, "grad_norm": 0.20571954917028099, "learning_rate": 0.00019095254643396512, "loss": 0.5125, "step": 466 }, { "epoch": 0.28, "grad_norm": 0.21968966730793454, "learning_rate": 0.0001909134339217186, "loss": 0.5358, "step": 467 }, { "epoch": 0.28, "grad_norm": 0.21910723327372644, "learning_rate": 0.00019087424107260627, "loss": 0.5382, "step": 468 }, { "epoch": 0.28, "grad_norm": 0.2153841373499183, "learning_rate": 0.00019083496792126153, "loss": 0.5375, "step": 469 }, { "epoch": 0.28, "grad_norm": 0.23479205084160673, "learning_rate": 0.00019079561450238854, "loss": 0.5984, "step": 470 }, { "epoch": 0.28, "grad_norm": 0.21595571362737268, "learning_rate": 0.00019075618085076247, "loss": 0.5417, "step": 471 }, { "epoch": 0.28, "grad_norm": 0.24550770571804625, "learning_rate": 0.00019071666700122946, "loss": 0.5306, "step": 472 }, { "epoch": 0.28, "grad_norm": 0.21802243564456578, "learning_rate": 0.00019067707298870638, "loss": 0.5157, "step": 473 }, { "epoch": 0.28, "grad_norm": 0.2068796190094572, "learning_rate": 0.00019063739884818103, "loss": 0.5254, "step": 474 }, { "epoch": 0.28, "grad_norm": 0.24034732867281272, "learning_rate": 0.000190597644614712, "loss": 0.6204, "step": 475 }, { "epoch": 0.28, "grad_norm": 0.2260836607650634, "learning_rate": 0.00019055781032342864, "loss": 0.5492, "step": 476 }, { "epoch": 0.28, "grad_norm": 0.2476351525598878, "learning_rate": 0.00019051789600953102, "loss": 0.5157, "step": 477 }, { "epoch": 0.29, "grad_norm": 0.2280151093681579, "learning_rate": 0.00019047790170829003, "loss": 0.4984, "step": 478 }, { "epoch": 0.29, "grad_norm": 0.2217333524292061, "learning_rate": 0.00019043782745504711, "loss": 0.5149, "step": 479 }, { "epoch": 0.29, "grad_norm": 0.2356369467654302, "learning_rate": 0.00019039767328521442, "loss": 0.5724, "step": 480 }, { "epoch": 0.29, "grad_norm": 0.21541809863677616, "learning_rate": 0.0001903574392342747, "loss": 0.5138, "step": 481 }, { "epoch": 0.29, "grad_norm": 0.21722431891543054, "learning_rate": 0.00019031712533778137, "loss": 0.5536, "step": 482 }, { "epoch": 0.29, "grad_norm": 0.2370708268417489, "learning_rate": 0.00019027673163135827, "loss": 0.5038, "step": 483 }, { "epoch": 0.29, "grad_norm": 0.22809310323516838, "learning_rate": 0.00019023625815069989, "loss": 0.5713, "step": 484 }, { "epoch": 0.29, "grad_norm": 0.22374988575329294, "learning_rate": 0.00019019570493157114, "loss": 0.5549, "step": 485 }, { "epoch": 0.29, "grad_norm": 0.20510711707245072, "learning_rate": 0.0001901550720098074, "loss": 0.46, "step": 486 }, { "epoch": 0.29, "grad_norm": 0.2621551195786783, "learning_rate": 0.00019011435942131448, "loss": 0.5546, "step": 487 }, { "epoch": 0.29, "grad_norm": 0.20503054358781417, "learning_rate": 0.00019007356720206865, "loss": 0.5547, "step": 488 }, { "epoch": 0.29, "grad_norm": 0.23586140447856616, "learning_rate": 0.00019003269538811647, "loss": 0.6075, "step": 489 }, { "epoch": 0.29, "grad_norm": 0.2828040872125889, "learning_rate": 0.00018999174401557488, "loss": 0.602, "step": 490 }, { "epoch": 0.29, "grad_norm": 0.2023429982220119, "learning_rate": 0.00018995071312063105, "loss": 0.4975, "step": 491 }, { "epoch": 0.29, "grad_norm": 0.2054777673202953, "learning_rate": 0.00018990960273954254, "loss": 0.5295, "step": 492 }, { "epoch": 0.29, "grad_norm": 0.1982185225446849, "learning_rate": 0.00018986841290863704, "loss": 0.5461, "step": 493 }, { "epoch": 0.29, "grad_norm": 0.23248022218099268, "learning_rate": 0.0001898271436643125, "loss": 0.5924, "step": 494 }, { "epoch": 0.3, "grad_norm": 0.2235279893303581, "learning_rate": 0.00018978579504303706, "loss": 0.5598, "step": 495 }, { "epoch": 0.3, "grad_norm": 0.21675084465821123, "learning_rate": 0.000189744367081349, "loss": 0.5012, "step": 496 }, { "epoch": 0.3, "grad_norm": 0.2041881848681654, "learning_rate": 0.00018970285981585662, "loss": 0.526, "step": 497 }, { "epoch": 0.3, "grad_norm": 0.23258761727278376, "learning_rate": 0.00018966127328323842, "loss": 0.553, "step": 498 }, { "epoch": 0.3, "grad_norm": 0.23066266735191, "learning_rate": 0.00018961960752024288, "loss": 0.5506, "step": 499 }, { "epoch": 0.3, "grad_norm": 0.20634958879584178, "learning_rate": 0.0001895778625636885, "loss": 0.5006, "step": 500 }, { "epoch": 0.3, "grad_norm": 0.21082421656186934, "learning_rate": 0.00018953603845046378, "loss": 0.5279, "step": 501 }, { "epoch": 0.3, "grad_norm": 0.2057560041730304, "learning_rate": 0.00018949413521752713, "loss": 0.5598, "step": 502 }, { "epoch": 0.3, "grad_norm": 0.2096114347066206, "learning_rate": 0.00018945215290190693, "loss": 0.5113, "step": 503 }, { "epoch": 0.3, "grad_norm": 0.23218477255443984, "learning_rate": 0.00018941009154070136, "loss": 0.5169, "step": 504 }, { "epoch": 0.3, "grad_norm": 0.20857717653678057, "learning_rate": 0.00018936795117107855, "loss": 0.5149, "step": 505 }, { "epoch": 0.3, "grad_norm": 0.24006448825761761, "learning_rate": 0.0001893257318302764, "loss": 0.5228, "step": 506 }, { "epoch": 0.3, "grad_norm": 0.2146671098435255, "learning_rate": 0.00018928343355560258, "loss": 0.5257, "step": 507 }, { "epoch": 0.3, "grad_norm": 0.20608859556559073, "learning_rate": 0.00018924105638443452, "loss": 0.527, "step": 508 }, { "epoch": 0.3, "grad_norm": 0.2336814919363686, "learning_rate": 0.0001891986003542194, "loss": 0.5461, "step": 509 }, { "epoch": 0.3, "grad_norm": 0.2409130946928026, "learning_rate": 0.00018915606550247397, "loss": 0.5493, "step": 510 }, { "epoch": 0.3, "grad_norm": 0.21371348825911873, "learning_rate": 0.0001891134518667848, "loss": 0.572, "step": 511 }, { "epoch": 0.31, "grad_norm": 0.2014364828041311, "learning_rate": 0.000189070759484808, "loss": 0.5109, "step": 512 }, { "epoch": 0.31, "grad_norm": 0.2290945612613713, "learning_rate": 0.0001890279883942692, "loss": 0.5493, "step": 513 }, { "epoch": 0.31, "grad_norm": 0.22127732127756986, "learning_rate": 0.0001889851386329637, "loss": 0.5387, "step": 514 }, { "epoch": 0.31, "grad_norm": 0.20564079598559082, "learning_rate": 0.00018894221023875622, "loss": 0.5192, "step": 515 }, { "epoch": 0.31, "grad_norm": 0.213993086214796, "learning_rate": 0.00018889920324958106, "loss": 0.5044, "step": 516 }, { "epoch": 0.31, "grad_norm": 0.21506249939577854, "learning_rate": 0.00018885611770344185, "loss": 0.4969, "step": 517 }, { "epoch": 0.31, "grad_norm": 0.22792164808811663, "learning_rate": 0.00018881295363841174, "loss": 0.5564, "step": 518 }, { "epoch": 0.31, "grad_norm": 0.1978731923118128, "learning_rate": 0.00018876971109263324, "loss": 0.4898, "step": 519 }, { "epoch": 0.31, "grad_norm": 0.22394451521984352, "learning_rate": 0.00018872639010431822, "loss": 0.5586, "step": 520 }, { "epoch": 0.31, "grad_norm": 0.20009625678598073, "learning_rate": 0.0001886829907117478, "loss": 0.5399, "step": 521 }, { "epoch": 0.31, "grad_norm": 0.20448355507434923, "learning_rate": 0.00018863951295327244, "loss": 0.5263, "step": 522 }, { "epoch": 0.31, "grad_norm": 0.1967777231547204, "learning_rate": 0.00018859595686731187, "loss": 0.4904, "step": 523 }, { "epoch": 0.31, "grad_norm": 0.2052388343929957, "learning_rate": 0.00018855232249235498, "loss": 0.4951, "step": 524 }, { "epoch": 0.31, "grad_norm": 0.1970956590240829, "learning_rate": 0.00018850860986695985, "loss": 0.5112, "step": 525 }, { "epoch": 0.31, "grad_norm": 0.2102143499682878, "learning_rate": 0.00018846481902975377, "loss": 0.5234, "step": 526 }, { "epoch": 0.31, "grad_norm": 0.23384214794287286, "learning_rate": 0.00018842095001943306, "loss": 0.5387, "step": 527 }, { "epoch": 0.32, "grad_norm": 0.20133953340775343, "learning_rate": 0.00018837700287476316, "loss": 0.4995, "step": 528 }, { "epoch": 0.32, "grad_norm": 0.2238467486071384, "learning_rate": 0.00018833297763457858, "loss": 0.5709, "step": 529 }, { "epoch": 0.32, "grad_norm": 0.26170161234282546, "learning_rate": 0.00018828887433778278, "loss": 0.6314, "step": 530 }, { "epoch": 0.32, "grad_norm": 0.2317819906199683, "learning_rate": 0.00018824469302334822, "loss": 0.5333, "step": 531 }, { "epoch": 0.32, "grad_norm": 0.21538390925414544, "learning_rate": 0.0001882004337303163, "loss": 0.5603, "step": 532 }, { "epoch": 0.32, "grad_norm": 0.23053571801246284, "learning_rate": 0.0001881560964977974, "loss": 0.593, "step": 533 }, { "epoch": 0.32, "grad_norm": 0.21173642276584706, "learning_rate": 0.0001881116813649706, "loss": 0.5539, "step": 534 }, { "epoch": 0.32, "grad_norm": 0.24587290888576793, "learning_rate": 0.00018806718837108402, "loss": 0.5408, "step": 535 }, { "epoch": 0.32, "grad_norm": 0.22324082101473863, "learning_rate": 0.00018802261755545443, "loss": 0.5857, "step": 536 }, { "epoch": 0.32, "grad_norm": 0.21827653101692504, "learning_rate": 0.0001879779689574674, "loss": 0.5451, "step": 537 }, { "epoch": 0.32, "grad_norm": 0.2146222856243753, "learning_rate": 0.00018793324261657737, "loss": 0.5007, "step": 538 }, { "epoch": 0.32, "grad_norm": 0.20994383183759666, "learning_rate": 0.00018788843857230726, "loss": 0.5039, "step": 539 }, { "epoch": 0.32, "grad_norm": 0.23384168426304514, "learning_rate": 0.00018784355686424876, "loss": 0.5329, "step": 540 }, { "epoch": 0.32, "grad_norm": 0.20284382518697272, "learning_rate": 0.00018779859753206225, "loss": 0.5383, "step": 541 }, { "epoch": 0.32, "grad_norm": 0.22307014132513725, "learning_rate": 0.00018775356061547662, "loss": 0.5766, "step": 542 }, { "epoch": 0.32, "grad_norm": 0.21675879523474215, "learning_rate": 0.00018770844615428932, "loss": 0.4994, "step": 543 }, { "epoch": 0.32, "grad_norm": 0.2200785983728407, "learning_rate": 0.00018766325418836637, "loss": 0.5615, "step": 544 }, { "epoch": 0.33, "grad_norm": 0.20895654400479502, "learning_rate": 0.00018761798475764224, "loss": 0.4993, "step": 545 }, { "epoch": 0.33, "grad_norm": 0.22152937631276676, "learning_rate": 0.00018757263790211988, "loss": 0.5275, "step": 546 }, { "epoch": 0.33, "grad_norm": 0.209333487906431, "learning_rate": 0.0001875272136618706, "loss": 0.4911, "step": 547 }, { "epoch": 0.33, "grad_norm": 0.2123519912763275, "learning_rate": 0.00018748171207703417, "loss": 0.5662, "step": 548 }, { "epoch": 0.33, "grad_norm": 0.2147346642469028, "learning_rate": 0.00018743613318781868, "loss": 0.5651, "step": 549 }, { "epoch": 0.33, "grad_norm": 0.2017789732342509, "learning_rate": 0.00018739047703450048, "loss": 0.5573, "step": 550 }, { "epoch": 0.33, "grad_norm": 0.2084087089737107, "learning_rate": 0.00018734474365742428, "loss": 0.562, "step": 551 }, { "epoch": 0.33, "grad_norm": 0.22130968599178, "learning_rate": 0.00018729893309700295, "loss": 0.5729, "step": 552 }, { "epoch": 0.33, "grad_norm": 0.22736172090948445, "learning_rate": 0.0001872530453937176, "loss": 0.5548, "step": 553 }, { "epoch": 0.33, "grad_norm": 0.21738577850339916, "learning_rate": 0.0001872070805881176, "loss": 0.5191, "step": 554 }, { "epoch": 0.33, "grad_norm": 0.20994273135857797, "learning_rate": 0.00018716103872082026, "loss": 0.5153, "step": 555 }, { "epoch": 0.33, "grad_norm": 0.25944295362906805, "learning_rate": 0.00018711491983251113, "loss": 0.5471, "step": 556 }, { "epoch": 0.33, "grad_norm": 0.2138519097360962, "learning_rate": 0.00018706872396394376, "loss": 0.4875, "step": 557 }, { "epoch": 0.33, "grad_norm": 0.23586915663527888, "learning_rate": 0.00018702245115593974, "loss": 0.5224, "step": 558 }, { "epoch": 0.33, "grad_norm": 0.20477148046499385, "learning_rate": 0.0001869761014493887, "loss": 0.5466, "step": 559 }, { "epoch": 0.33, "grad_norm": 0.21783175505387284, "learning_rate": 0.00018692967488524812, "loss": 0.5557, "step": 560 }, { "epoch": 0.33, "grad_norm": 0.20442177589984145, "learning_rate": 0.0001868831715045435, "loss": 0.507, "step": 561 }, { "epoch": 0.34, "grad_norm": 0.21291324212369495, "learning_rate": 0.00018683659134836813, "loss": 0.5779, "step": 562 }, { "epoch": 0.34, "grad_norm": 0.22670486875141618, "learning_rate": 0.00018678993445788323, "loss": 0.5831, "step": 563 }, { "epoch": 0.34, "grad_norm": 0.2431493116309222, "learning_rate": 0.00018674320087431768, "loss": 0.5389, "step": 564 }, { "epoch": 0.34, "grad_norm": 0.22102091260855142, "learning_rate": 0.00018669639063896836, "loss": 0.5569, "step": 565 }, { "epoch": 0.34, "grad_norm": 0.20001951850669827, "learning_rate": 0.0001866495037931997, "loss": 0.486, "step": 566 }, { "epoch": 0.34, "grad_norm": 0.22781103196427857, "learning_rate": 0.00018660254037844388, "loss": 0.4973, "step": 567 }, { "epoch": 0.34, "grad_norm": 0.21129685691062433, "learning_rate": 0.00018655550043620073, "loss": 0.5459, "step": 568 }, { "epoch": 0.34, "grad_norm": 0.20363805081315986, "learning_rate": 0.0001865083840080378, "loss": 0.4997, "step": 569 }, { "epoch": 0.34, "grad_norm": 0.22269838654252982, "learning_rate": 0.00018646119113559006, "loss": 0.5406, "step": 570 }, { "epoch": 0.34, "grad_norm": 0.20307002281681275, "learning_rate": 0.00018641392186056016, "loss": 0.4861, "step": 571 }, { "epoch": 0.34, "grad_norm": 0.20146261628709675, "learning_rate": 0.0001863665762247182, "loss": 0.561, "step": 572 }, { "epoch": 0.34, "grad_norm": 0.21049257054009352, "learning_rate": 0.00018631915426990184, "loss": 0.5257, "step": 573 }, { "epoch": 0.34, "grad_norm": 0.2245482792823418, "learning_rate": 0.00018627165603801605, "loss": 0.5441, "step": 574 }, { "epoch": 0.34, "grad_norm": 0.2106578436256788, "learning_rate": 0.0001862240815710333, "loss": 0.5125, "step": 575 }, { "epoch": 0.34, "grad_norm": 0.2091435884054145, "learning_rate": 0.0001861764309109934, "loss": 0.523, "step": 576 }, { "epoch": 0.34, "grad_norm": 0.21256854318600532, "learning_rate": 0.00018612870410000354, "loss": 0.4851, "step": 577 }, { "epoch": 0.34, "grad_norm": 0.24387962798982954, "learning_rate": 0.00018608090118023808, "loss": 0.5423, "step": 578 }, { "epoch": 0.35, "grad_norm": 0.2357478920855788, "learning_rate": 0.00018603302219393874, "loss": 0.5386, "step": 579 }, { "epoch": 0.35, "grad_norm": 0.21267780857117077, "learning_rate": 0.0001859850671834144, "loss": 0.5545, "step": 580 }, { "epoch": 0.35, "grad_norm": 0.25049614581715324, "learning_rate": 0.0001859370361910412, "loss": 0.5241, "step": 581 }, { "epoch": 0.35, "grad_norm": 0.1937807494598699, "learning_rate": 0.00018588892925926228, "loss": 0.5533, "step": 582 }, { "epoch": 0.35, "grad_norm": 0.21209972240968475, "learning_rate": 0.00018584074643058807, "loss": 0.538, "step": 583 }, { "epoch": 0.35, "grad_norm": 0.22281277082523665, "learning_rate": 0.00018579248774759586, "loss": 0.5456, "step": 584 }, { "epoch": 0.35, "grad_norm": 0.22156542955128883, "learning_rate": 0.00018574415325293018, "loss": 0.5622, "step": 585 }, { "epoch": 0.35, "grad_norm": 0.20068342929250654, "learning_rate": 0.00018569574298930237, "loss": 0.5372, "step": 586 }, { "epoch": 0.35, "grad_norm": 0.21693418845369525, "learning_rate": 0.00018564725699949083, "loss": 0.4874, "step": 587 }, { "epoch": 0.35, "grad_norm": 0.2060622909003744, "learning_rate": 0.0001855986953263409, "loss": 0.5331, "step": 588 }, { "epoch": 0.35, "grad_norm": 0.20007419545283933, "learning_rate": 0.00018555005801276463, "loss": 0.5131, "step": 589 }, { "epoch": 0.35, "grad_norm": 0.21905328017125653, "learning_rate": 0.00018550134510174115, "loss": 0.5572, "step": 590 }, { "epoch": 0.35, "grad_norm": 0.21213287568506015, "learning_rate": 0.0001854525566363162, "loss": 0.5359, "step": 591 }, { "epoch": 0.35, "grad_norm": 0.20066093050756748, "learning_rate": 0.00018540369265960242, "loss": 0.5334, "step": 592 }, { "epoch": 0.35, "grad_norm": 0.2068811720002483, "learning_rate": 0.00018535475321477906, "loss": 0.5558, "step": 593 }, { "epoch": 0.35, "grad_norm": 0.2025287668887073, "learning_rate": 0.00018530573834509215, "loss": 0.5098, "step": 594 }, { "epoch": 0.36, "grad_norm": 0.20807380346718593, "learning_rate": 0.0001852566480938543, "loss": 0.5211, "step": 595 }, { "epoch": 0.36, "grad_norm": 0.2049943719782544, "learning_rate": 0.00018520748250444474, "loss": 0.5379, "step": 596 }, { "epoch": 0.36, "grad_norm": 0.8558508208219735, "learning_rate": 0.00018515824162030934, "loss": 0.5403, "step": 597 }, { "epoch": 0.36, "grad_norm": 0.25414317775682305, "learning_rate": 0.00018510892548496047, "loss": 0.5804, "step": 598 }, { "epoch": 0.36, "grad_norm": 0.20806597400748386, "learning_rate": 0.00018505953414197696, "loss": 0.5419, "step": 599 }, { "epoch": 0.36, "grad_norm": 0.1950528976937739, "learning_rate": 0.00018501006763500414, "loss": 0.4956, "step": 600 }, { "epoch": 0.36, "grad_norm": 0.20652545713558523, "learning_rate": 0.00018496052600775376, "loss": 0.4942, "step": 601 }, { "epoch": 0.36, "grad_norm": 0.20955886781649663, "learning_rate": 0.0001849109093040039, "loss": 0.5177, "step": 602 }, { "epoch": 0.36, "grad_norm": 0.21093362015684414, "learning_rate": 0.00018486121756759906, "loss": 0.5672, "step": 603 }, { "epoch": 0.36, "grad_norm": 0.22033088091533184, "learning_rate": 0.00018481145084245002, "loss": 0.5691, "step": 604 }, { "epoch": 0.36, "grad_norm": 0.20322111965044637, "learning_rate": 0.00018476160917253373, "loss": 0.5425, "step": 605 }, { "epoch": 0.36, "grad_norm": 0.2028788101278272, "learning_rate": 0.0001847116926018935, "loss": 0.5176, "step": 606 }, { "epoch": 0.36, "grad_norm": 0.19551140156538951, "learning_rate": 0.0001846617011746388, "loss": 0.5115, "step": 607 }, { "epoch": 0.36, "grad_norm": 0.21944694996534547, "learning_rate": 0.00018461163493494517, "loss": 0.5496, "step": 608 }, { "epoch": 0.36, "grad_norm": 0.21506814147924705, "learning_rate": 0.0001845614939270543, "loss": 0.5823, "step": 609 }, { "epoch": 0.36, "grad_norm": 0.2220938137588105, "learning_rate": 0.00018451127819527402, "loss": 0.5731, "step": 610 }, { "epoch": 0.36, "grad_norm": 0.21590208362786933, "learning_rate": 0.00018446098778397807, "loss": 0.6063, "step": 611 }, { "epoch": 0.37, "grad_norm": 0.20084594317065918, "learning_rate": 0.00018441062273760628, "loss": 0.5286, "step": 612 }, { "epoch": 0.37, "grad_norm": 0.21847304705653886, "learning_rate": 0.00018436018310066435, "loss": 0.5721, "step": 613 }, { "epoch": 0.37, "grad_norm": 0.2467936487351411, "learning_rate": 0.000184309668917724, "loss": 0.571, "step": 614 }, { "epoch": 0.37, "grad_norm": 0.21666156526926003, "learning_rate": 0.0001842590802334227, "loss": 0.5244, "step": 615 }, { "epoch": 0.37, "grad_norm": 0.21336859433357677, "learning_rate": 0.00018420841709246383, "loss": 0.5724, "step": 616 }, { "epoch": 0.37, "grad_norm": 0.1933070755110986, "learning_rate": 0.0001841576795396166, "loss": 0.5347, "step": 617 }, { "epoch": 0.37, "grad_norm": 0.2332186369470874, "learning_rate": 0.00018410686761971586, "loss": 0.5474, "step": 618 }, { "epoch": 0.37, "grad_norm": 0.1996293438855639, "learning_rate": 0.00018405598137766224, "loss": 0.5421, "step": 619 }, { "epoch": 0.37, "grad_norm": 0.2012759807756364, "learning_rate": 0.00018400502085842208, "loss": 0.519, "step": 620 }, { "epoch": 0.37, "grad_norm": 0.24355300568180752, "learning_rate": 0.00018395398610702733, "loss": 0.597, "step": 621 }, { "epoch": 0.37, "grad_norm": 0.2136711983483761, "learning_rate": 0.00018390287716857546, "loss": 0.5398, "step": 622 }, { "epoch": 0.37, "grad_norm": 0.22275088525970024, "learning_rate": 0.00018385169408822964, "loss": 0.5597, "step": 623 }, { "epoch": 0.37, "grad_norm": 0.20011931485707388, "learning_rate": 0.0001838004369112184, "loss": 0.4901, "step": 624 }, { "epoch": 0.37, "grad_norm": 0.19544716159187206, "learning_rate": 0.00018374910568283594, "loss": 0.4726, "step": 625 }, { "epoch": 0.37, "grad_norm": 0.2176067620544374, "learning_rate": 0.00018369770044844168, "loss": 0.5369, "step": 626 }, { "epoch": 0.37, "grad_norm": 0.2005629047810257, "learning_rate": 0.00018364622125346055, "loss": 0.4914, "step": 627 }, { "epoch": 0.37, "grad_norm": 0.21497281608823432, "learning_rate": 0.0001835946681433829, "loss": 0.5559, "step": 628 }, { "epoch": 0.38, "grad_norm": 0.20354723273049724, "learning_rate": 0.00018354304116376425, "loss": 0.5083, "step": 629 }, { "epoch": 0.38, "grad_norm": 0.23536026550959782, "learning_rate": 0.0001834913403602255, "loss": 0.5449, "step": 630 }, { "epoch": 0.38, "grad_norm": 0.20887211237530257, "learning_rate": 0.00018343956577845276, "loss": 0.5131, "step": 631 }, { "epoch": 0.38, "grad_norm": 0.21728763678777088, "learning_rate": 0.00018338771746419726, "loss": 0.5484, "step": 632 }, { "epoch": 0.38, "grad_norm": 0.21910570476522437, "learning_rate": 0.00018333579546327556, "loss": 0.5452, "step": 633 }, { "epoch": 0.38, "grad_norm": 0.21247350127543838, "learning_rate": 0.00018328379982156915, "loss": 0.5232, "step": 634 }, { "epoch": 0.38, "grad_norm": 0.21706686115456897, "learning_rate": 0.00018323173058502472, "loss": 0.5353, "step": 635 }, { "epoch": 0.38, "grad_norm": 0.19529494853666482, "learning_rate": 0.00018317958779965387, "loss": 0.4611, "step": 636 }, { "epoch": 0.38, "grad_norm": 0.2194890381897013, "learning_rate": 0.00018312737151153334, "loss": 0.4884, "step": 637 }, { "epoch": 0.38, "grad_norm": 0.24336065627870296, "learning_rate": 0.00018307508176680472, "loss": 0.5708, "step": 638 }, { "epoch": 0.38, "grad_norm": 0.22638828434923797, "learning_rate": 0.00018302271861167456, "loss": 0.5795, "step": 639 }, { "epoch": 0.38, "grad_norm": 0.20501380703607638, "learning_rate": 0.0001829702820924142, "loss": 0.5645, "step": 640 }, { "epoch": 0.38, "grad_norm": 0.22705979847255006, "learning_rate": 0.00018291777225535994, "loss": 0.4974, "step": 641 }, { "epoch": 0.38, "grad_norm": 0.22629645320684777, "learning_rate": 0.00018286518914691272, "loss": 0.5587, "step": 642 }, { "epoch": 0.38, "grad_norm": 0.21772563640763765, "learning_rate": 0.00018281253281353838, "loss": 0.5219, "step": 643 }, { "epoch": 0.38, "grad_norm": 0.20447194133414195, "learning_rate": 0.00018275980330176737, "loss": 0.5425, "step": 644 }, { "epoch": 0.38, "grad_norm": 0.24126870503035064, "learning_rate": 0.00018270700065819477, "loss": 0.5119, "step": 645 }, { "epoch": 0.39, "grad_norm": 0.23269297218381896, "learning_rate": 0.00018265412492948042, "loss": 0.5507, "step": 646 }, { "epoch": 0.39, "grad_norm": 0.23416570398912578, "learning_rate": 0.0001826011761623486, "loss": 0.5947, "step": 647 }, { "epoch": 0.39, "grad_norm": 0.2186560086983282, "learning_rate": 0.0001825481544035882, "loss": 0.5204, "step": 648 }, { "epoch": 0.39, "grad_norm": 0.20624707271501935, "learning_rate": 0.00018249505970005262, "loss": 0.4785, "step": 649 }, { "epoch": 0.39, "grad_norm": 0.23418189558532218, "learning_rate": 0.00018244189209865974, "loss": 0.4976, "step": 650 }, { "epoch": 0.39, "grad_norm": 0.21372290734059424, "learning_rate": 0.00018238865164639173, "loss": 0.5237, "step": 651 }, { "epoch": 0.39, "grad_norm": 0.1986689651795865, "learning_rate": 0.0001823353383902953, "loss": 0.5354, "step": 652 }, { "epoch": 0.39, "grad_norm": 0.21154599437074698, "learning_rate": 0.0001822819523774814, "loss": 0.5292, "step": 653 }, { "epoch": 0.39, "grad_norm": 0.21348268586605149, "learning_rate": 0.00018222849365512523, "loss": 0.5249, "step": 654 }, { "epoch": 0.39, "grad_norm": 0.22296243039072478, "learning_rate": 0.0001821749622704664, "loss": 0.5458, "step": 655 }, { "epoch": 0.39, "grad_norm": 0.22596567506529938, "learning_rate": 0.00018212135827080857, "loss": 0.5085, "step": 656 }, { "epoch": 0.39, "grad_norm": 0.19012132806019622, "learning_rate": 0.00018206768170351962, "loss": 0.4977, "step": 657 }, { "epoch": 0.39, "grad_norm": 0.2125366600531234, "learning_rate": 0.0001820139326160316, "loss": 0.5051, "step": 658 }, { "epoch": 0.39, "grad_norm": 0.23677835317412968, "learning_rate": 0.00018196011105584058, "loss": 0.575, "step": 659 }, { "epoch": 0.39, "grad_norm": 0.2262210065848097, "learning_rate": 0.00018190621707050671, "loss": 0.5744, "step": 660 }, { "epoch": 0.39, "grad_norm": 0.21618545867420894, "learning_rate": 0.0001818522507076541, "loss": 0.5715, "step": 661 }, { "epoch": 0.39, "grad_norm": 0.2050215711297079, "learning_rate": 0.00018179821201497092, "loss": 0.5201, "step": 662 }, { "epoch": 0.4, "grad_norm": 0.20218467055707082, "learning_rate": 0.0001817441010402091, "loss": 0.5058, "step": 663 }, { "epoch": 0.4, "grad_norm": 0.20940987275867923, "learning_rate": 0.00018168991783118452, "loss": 0.5095, "step": 664 }, { "epoch": 0.4, "grad_norm": 0.21341822518403558, "learning_rate": 0.00018163566243577697, "loss": 0.5599, "step": 665 }, { "epoch": 0.4, "grad_norm": 0.20028205017927186, "learning_rate": 0.0001815813349019299, "loss": 0.5318, "step": 666 }, { "epoch": 0.4, "grad_norm": 0.20184912350066175, "learning_rate": 0.00018152693527765057, "loss": 0.5643, "step": 667 }, { "epoch": 0.4, "grad_norm": 0.20882160405967118, "learning_rate": 0.0001814724636110099, "loss": 0.542, "step": 668 }, { "epoch": 0.4, "grad_norm": 0.20252144356881077, "learning_rate": 0.00018141791995014255, "loss": 0.4496, "step": 669 }, { "epoch": 0.4, "grad_norm": 0.1956328371434174, "learning_rate": 0.00018136330434324674, "loss": 0.56, "step": 670 }, { "epoch": 0.4, "grad_norm": 0.20691128111503362, "learning_rate": 0.00018130861683858426, "loss": 0.5726, "step": 671 }, { "epoch": 0.4, "grad_norm": 0.2258004454621585, "learning_rate": 0.00018125385748448048, "loss": 0.583, "step": 672 }, { "epoch": 0.4, "grad_norm": 0.22330059205477634, "learning_rate": 0.00018119902632932416, "loss": 0.5288, "step": 673 }, { "epoch": 0.4, "grad_norm": 0.20473079466150892, "learning_rate": 0.0001811441234215677, "loss": 0.5085, "step": 674 }, { "epoch": 0.4, "grad_norm": 0.19439333859223318, "learning_rate": 0.0001810891488097267, "loss": 0.5147, "step": 675 }, { "epoch": 0.4, "grad_norm": 0.2037181989313857, "learning_rate": 0.00018103410254238021, "loss": 0.5228, "step": 676 }, { "epoch": 0.4, "grad_norm": 0.21580635559566858, "learning_rate": 0.0001809789846681706, "loss": 0.5034, "step": 677 }, { "epoch": 0.4, "grad_norm": 0.21490060304667385, "learning_rate": 0.00018092379523580357, "loss": 0.5347, "step": 678 }, { "epoch": 0.41, "grad_norm": 0.20927738857723482, "learning_rate": 0.00018086853429404793, "loss": 0.5039, "step": 679 }, { "epoch": 0.41, "grad_norm": 0.21391199422702836, "learning_rate": 0.00018081320189173577, "loss": 0.5404, "step": 680 }, { "epoch": 0.41, "grad_norm": 0.22355130583819918, "learning_rate": 0.0001807577980777623, "loss": 0.5147, "step": 681 }, { "epoch": 0.41, "grad_norm": 0.21899190720848985, "learning_rate": 0.00018070232290108584, "loss": 0.5195, "step": 682 }, { "epoch": 0.41, "grad_norm": 0.20636096645560792, "learning_rate": 0.00018064677641072775, "loss": 0.5158, "step": 683 }, { "epoch": 0.41, "grad_norm": 0.20462410706105155, "learning_rate": 0.00018059115865577249, "loss": 0.5194, "step": 684 }, { "epoch": 0.41, "grad_norm": 0.21978634315593423, "learning_rate": 0.00018053546968536735, "loss": 0.4986, "step": 685 }, { "epoch": 0.41, "grad_norm": 0.2203882917140438, "learning_rate": 0.00018047970954872264, "loss": 0.5855, "step": 686 }, { "epoch": 0.41, "grad_norm": 0.20144829000454462, "learning_rate": 0.0001804238782951116, "loss": 0.5212, "step": 687 }, { "epoch": 0.41, "grad_norm": 0.21142991796239274, "learning_rate": 0.00018036797597387023, "loss": 0.495, "step": 688 }, { "epoch": 0.41, "grad_norm": 0.21275432668758548, "learning_rate": 0.00018031200263439736, "loss": 0.5694, "step": 689 }, { "epoch": 0.41, "grad_norm": 0.2035189446424034, "learning_rate": 0.00018025595832615459, "loss": 0.55, "step": 690 }, { "epoch": 0.41, "grad_norm": 0.20030837247360464, "learning_rate": 0.00018019984309866619, "loss": 0.4748, "step": 691 }, { "epoch": 0.41, "grad_norm": 0.20366715425572, "learning_rate": 0.00018014365700151912, "loss": 0.5792, "step": 692 }, { "epoch": 0.41, "grad_norm": 0.2082468197583491, "learning_rate": 0.000180087400084363, "loss": 0.4973, "step": 693 }, { "epoch": 0.41, "grad_norm": 0.21820027454676755, "learning_rate": 0.00018003107239691004, "loss": 0.5512, "step": 694 }, { "epoch": 0.41, "grad_norm": 0.2085678250499903, "learning_rate": 0.00017997467398893488, "loss": 0.5148, "step": 695 }, { "epoch": 0.42, "grad_norm": 0.20422653056781329, "learning_rate": 0.00017991820491027472, "loss": 0.4968, "step": 696 }, { "epoch": 0.42, "grad_norm": 0.1875899162050169, "learning_rate": 0.0001798616652108293, "loss": 0.5061, "step": 697 }, { "epoch": 0.42, "grad_norm": 0.20869663705218836, "learning_rate": 0.00017980505494056062, "loss": 0.5182, "step": 698 }, { "epoch": 0.42, "grad_norm": 0.19250179476147736, "learning_rate": 0.00017974837414949307, "loss": 0.5184, "step": 699 }, { "epoch": 0.42, "grad_norm": 0.21732108838463451, "learning_rate": 0.00017969162288771347, "loss": 0.5524, "step": 700 }, { "epoch": 0.42, "grad_norm": 0.20200315361578813, "learning_rate": 0.0001796348012053707, "loss": 0.5386, "step": 701 }, { "epoch": 0.42, "grad_norm": 0.20242537832049035, "learning_rate": 0.00017957790915267615, "loss": 0.5656, "step": 702 }, { "epoch": 0.42, "grad_norm": 0.1889172192023988, "learning_rate": 0.0001795209467799031, "loss": 0.5115, "step": 703 }, { "epoch": 0.42, "grad_norm": 0.19623435201373893, "learning_rate": 0.0001794639141373872, "loss": 0.497, "step": 704 }, { "epoch": 0.42, "grad_norm": 0.22372809637554478, "learning_rate": 0.00017940681127552604, "loss": 0.5579, "step": 705 }, { "epoch": 0.42, "grad_norm": 0.1968536923376666, "learning_rate": 0.0001793496382447794, "loss": 0.4891, "step": 706 }, { "epoch": 0.42, "grad_norm": 0.1990723573146364, "learning_rate": 0.00017929239509566894, "loss": 0.5921, "step": 707 }, { "epoch": 0.42, "grad_norm": 0.20388703819339077, "learning_rate": 0.00017923508187877834, "loss": 0.5414, "step": 708 }, { "epoch": 0.42, "grad_norm": 0.23657852979478725, "learning_rate": 0.00017917769864475314, "loss": 0.5672, "step": 709 }, { "epoch": 0.42, "grad_norm": 0.22888252332289927, "learning_rate": 0.00017912024544430088, "loss": 0.5459, "step": 710 }, { "epoch": 0.42, "grad_norm": 0.19383907969249117, "learning_rate": 0.0001790627223281908, "loss": 0.5509, "step": 711 }, { "epoch": 0.42, "grad_norm": 0.2154263629956836, "learning_rate": 0.00017900512934725397, "loss": 0.5629, "step": 712 }, { "epoch": 0.43, "grad_norm": 0.19802419635693494, "learning_rate": 0.0001789474665523832, "loss": 0.5128, "step": 713 }, { "epoch": 0.43, "grad_norm": 0.19783321602266912, "learning_rate": 0.00017888973399453296, "loss": 0.5064, "step": 714 }, { "epoch": 0.43, "grad_norm": 0.19864882371614528, "learning_rate": 0.00017883193172471944, "loss": 0.5458, "step": 715 }, { "epoch": 0.43, "grad_norm": 0.23609512585527, "learning_rate": 0.00017877405979402038, "loss": 0.5069, "step": 716 }, { "epoch": 0.43, "grad_norm": 0.19894144678524353, "learning_rate": 0.00017871611825357502, "loss": 0.5812, "step": 717 }, { "epoch": 0.43, "grad_norm": 0.19598819977852033, "learning_rate": 0.00017865810715458427, "loss": 0.5223, "step": 718 }, { "epoch": 0.43, "grad_norm": 0.23274847505011953, "learning_rate": 0.00017860002654831032, "loss": 0.5703, "step": 719 }, { "epoch": 0.43, "grad_norm": 0.19794477486450376, "learning_rate": 0.00017854187648607694, "loss": 0.5538, "step": 720 }, { "epoch": 0.43, "grad_norm": 0.2091737019131215, "learning_rate": 0.00017848365701926913, "loss": 0.4962, "step": 721 }, { "epoch": 0.43, "grad_norm": 0.21890749511490995, "learning_rate": 0.00017842536819933337, "loss": 0.5074, "step": 722 }, { "epoch": 0.43, "grad_norm": 0.22746821737803938, "learning_rate": 0.0001783670100777773, "loss": 0.5849, "step": 723 }, { "epoch": 0.43, "grad_norm": 0.20967916540656184, "learning_rate": 0.0001783085827061699, "loss": 0.5246, "step": 724 }, { "epoch": 0.43, "grad_norm": 0.19798059353181535, "learning_rate": 0.00017825008613614127, "loss": 0.4667, "step": 725 }, { "epoch": 0.43, "grad_norm": 0.1992664047298655, "learning_rate": 0.00017819152041938265, "loss": 0.5247, "step": 726 }, { "epoch": 0.43, "grad_norm": 0.22025628624147217, "learning_rate": 0.00017813288560764647, "loss": 0.5291, "step": 727 }, { "epoch": 0.43, "grad_norm": 0.20405038516624363, "learning_rate": 0.00017807418175274612, "loss": 0.5235, "step": 728 }, { "epoch": 0.43, "grad_norm": 0.20626127985692586, "learning_rate": 0.00017801540890655609, "loss": 0.5103, "step": 729 }, { "epoch": 0.44, "grad_norm": 0.2187527308725265, "learning_rate": 0.00017795656712101172, "loss": 0.5515, "step": 730 }, { "epoch": 0.44, "grad_norm": 0.20386714530070776, "learning_rate": 0.00017789765644810935, "loss": 0.5109, "step": 731 }, { "epoch": 0.44, "grad_norm": 0.1990293686392052, "learning_rate": 0.00017783867693990624, "loss": 0.5208, "step": 732 }, { "epoch": 0.44, "grad_norm": 0.19601721442767256, "learning_rate": 0.0001777796286485204, "loss": 0.5318, "step": 733 }, { "epoch": 0.44, "grad_norm": 0.20542580410660244, "learning_rate": 0.0001777205116261306, "loss": 0.5198, "step": 734 }, { "epoch": 0.44, "grad_norm": 0.20998518101289002, "learning_rate": 0.0001776613259249764, "loss": 0.5384, "step": 735 }, { "epoch": 0.44, "grad_norm": 0.20134476803418952, "learning_rate": 0.00017760207159735805, "loss": 0.5448, "step": 736 }, { "epoch": 0.44, "grad_norm": 0.22396912180134018, "learning_rate": 0.00017754274869563637, "loss": 0.59, "step": 737 }, { "epoch": 0.44, "grad_norm": 0.2044555533666512, "learning_rate": 0.00017748335727223294, "loss": 0.5152, "step": 738 }, { "epoch": 0.44, "grad_norm": 0.2106748606361736, "learning_rate": 0.00017742389737962966, "loss": 0.5233, "step": 739 }, { "epoch": 0.44, "grad_norm": 0.19348093577281505, "learning_rate": 0.0001773643690703691, "loss": 0.5181, "step": 740 }, { "epoch": 0.44, "grad_norm": 0.20393569458294794, "learning_rate": 0.00017730477239705428, "loss": 0.5671, "step": 741 }, { "epoch": 0.44, "grad_norm": 0.19728761757057783, "learning_rate": 0.00017724510741234858, "loss": 0.4919, "step": 742 }, { "epoch": 0.44, "grad_norm": 0.2025575313201386, "learning_rate": 0.0001771853741689757, "loss": 0.5452, "step": 743 }, { "epoch": 0.44, "grad_norm": 0.19153867099886435, "learning_rate": 0.0001771255727197198, "loss": 0.4951, "step": 744 }, { "epoch": 0.44, "grad_norm": 0.2220125331576081, "learning_rate": 0.00017706570311742516, "loss": 0.5521, "step": 745 }, { "epoch": 0.45, "grad_norm": 0.22704666961693065, "learning_rate": 0.0001770057654149964, "loss": 0.5184, "step": 746 }, { "epoch": 0.45, "grad_norm": 0.20871880228168335, "learning_rate": 0.00017694575966539823, "loss": 0.5205, "step": 747 }, { "epoch": 0.45, "grad_norm": 0.2105924919088961, "learning_rate": 0.00017688568592165552, "loss": 0.5448, "step": 748 }, { "epoch": 0.45, "grad_norm": 0.19780662201378688, "learning_rate": 0.00017682554423685329, "loss": 0.6037, "step": 749 }, { "epoch": 0.45, "grad_norm": 0.23105576261963792, "learning_rate": 0.0001767653346641365, "loss": 0.7225, "step": 750 }, { "epoch": 0.45, "grad_norm": 0.21997563912032173, "learning_rate": 0.00017670505725671013, "loss": 0.552, "step": 751 }, { "epoch": 0.45, "grad_norm": 0.2033859052649398, "learning_rate": 0.00017664471206783915, "loss": 0.5315, "step": 752 }, { "epoch": 0.45, "grad_norm": 0.19979214467824102, "learning_rate": 0.00017658429915084835, "loss": 0.5697, "step": 753 }, { "epoch": 0.45, "grad_norm": 0.20567412732571028, "learning_rate": 0.00017652381855912247, "loss": 0.5051, "step": 754 }, { "epoch": 0.45, "grad_norm": 0.20563597140752976, "learning_rate": 0.0001764632703461059, "loss": 0.5141, "step": 755 }, { "epoch": 0.45, "grad_norm": 0.1979658869623221, "learning_rate": 0.00017640265456530293, "loss": 0.5257, "step": 756 }, { "epoch": 0.45, "grad_norm": 0.2241077787834463, "learning_rate": 0.0001763419712702775, "loss": 0.5203, "step": 757 }, { "epoch": 0.45, "grad_norm": 0.2197932846972142, "learning_rate": 0.00017628122051465322, "loss": 0.5847, "step": 758 }, { "epoch": 0.45, "grad_norm": 0.1990944255813207, "learning_rate": 0.00017622040235211326, "loss": 0.4962, "step": 759 }, { "epoch": 0.45, "grad_norm": 0.22179309744704687, "learning_rate": 0.00017615951683640045, "loss": 0.5635, "step": 760 }, { "epoch": 0.45, "grad_norm": 0.20505896786344424, "learning_rate": 0.00017609856402131703, "loss": 0.4968, "step": 761 }, { "epoch": 0.45, "grad_norm": 0.21771157401053975, "learning_rate": 0.00017603754396072483, "loss": 0.4858, "step": 762 }, { "epoch": 0.46, "grad_norm": 0.23357401076131715, "learning_rate": 0.000175976456708545, "loss": 0.5766, "step": 763 }, { "epoch": 0.46, "grad_norm": 0.21488993425737504, "learning_rate": 0.0001759153023187581, "loss": 0.5419, "step": 764 }, { "epoch": 0.46, "grad_norm": 0.2035555999534868, "learning_rate": 0.00017585408084540405, "loss": 0.5272, "step": 765 }, { "epoch": 0.46, "grad_norm": 0.20066829451010718, "learning_rate": 0.00017579279234258198, "loss": 0.5013, "step": 766 }, { "epoch": 0.46, "grad_norm": 0.2052255359730049, "learning_rate": 0.00017573143686445034, "loss": 0.5383, "step": 767 }, { "epoch": 0.46, "grad_norm": 0.19180058672325329, "learning_rate": 0.00017567001446522665, "loss": 0.5108, "step": 768 }, { "epoch": 0.46, "grad_norm": 0.22862029943228582, "learning_rate": 0.0001756085251991877, "loss": 0.531, "step": 769 }, { "epoch": 0.46, "grad_norm": 0.2180888066741993, "learning_rate": 0.00017554696912066924, "loss": 0.5938, "step": 770 }, { "epoch": 0.46, "grad_norm": 0.19823263656993223, "learning_rate": 0.00017548534628406616, "loss": 0.5158, "step": 771 }, { "epoch": 0.46, "grad_norm": 0.18700255356016454, "learning_rate": 0.00017542365674383227, "loss": 0.517, "step": 772 }, { "epoch": 0.46, "grad_norm": 0.22948411236460914, "learning_rate": 0.00017536190055448037, "loss": 0.5464, "step": 773 }, { "epoch": 0.46, "grad_norm": 0.21370070213829387, "learning_rate": 0.00017530007777058213, "loss": 0.5158, "step": 774 }, { "epoch": 0.46, "grad_norm": 0.19174674116457566, "learning_rate": 0.0001752381884467681, "loss": 0.5035, "step": 775 }, { "epoch": 0.46, "grad_norm": 0.19069115110218368, "learning_rate": 0.00017517623263772758, "loss": 0.5341, "step": 776 }, { "epoch": 0.46, "grad_norm": 0.2401612943495333, "learning_rate": 0.00017511421039820863, "loss": 0.578, "step": 777 }, { "epoch": 0.46, "grad_norm": 0.20371659209716964, "learning_rate": 0.00017505212178301805, "loss": 0.5103, "step": 778 }, { "epoch": 0.46, "grad_norm": 0.2029847143168681, "learning_rate": 0.00017498996684702132, "loss": 0.537, "step": 779 }, { "epoch": 0.47, "grad_norm": 0.1904915669257304, "learning_rate": 0.00017492774564514235, "loss": 0.5129, "step": 780 }, { "epoch": 0.47, "grad_norm": 0.20640027525482552, "learning_rate": 0.00017486545823236385, "loss": 0.5585, "step": 781 }, { "epoch": 0.47, "grad_norm": 0.23582084058854208, "learning_rate": 0.00017480310466372686, "loss": 0.5648, "step": 782 }, { "epoch": 0.47, "grad_norm": 0.2219618762642625, "learning_rate": 0.00017474068499433098, "loss": 0.5365, "step": 783 }, { "epoch": 0.47, "grad_norm": 0.2021980496149104, "learning_rate": 0.00017467819927933416, "loss": 0.5232, "step": 784 }, { "epoch": 0.47, "grad_norm": 0.22350007413119136, "learning_rate": 0.00017461564757395272, "loss": 0.571, "step": 785 }, { "epoch": 0.47, "grad_norm": 0.1982267515659923, "learning_rate": 0.00017455302993346134, "loss": 0.5228, "step": 786 }, { "epoch": 0.47, "grad_norm": 0.1981641437638338, "learning_rate": 0.00017449034641319288, "loss": 0.5233, "step": 787 }, { "epoch": 0.47, "grad_norm": 0.21019781303279997, "learning_rate": 0.00017442759706853855, "loss": 0.5207, "step": 788 }, { "epoch": 0.47, "grad_norm": 0.2060312831458839, "learning_rate": 0.00017436478195494756, "loss": 0.5262, "step": 789 }, { "epoch": 0.47, "grad_norm": 0.21829001169718656, "learning_rate": 0.00017430190112792737, "loss": 0.563, "step": 790 }, { "epoch": 0.47, "grad_norm": 0.18511782073951058, "learning_rate": 0.00017423895464304342, "loss": 0.5017, "step": 791 }, { "epoch": 0.47, "grad_norm": 0.1883852134929889, "learning_rate": 0.00017417594255591927, "loss": 0.4598, "step": 792 }, { "epoch": 0.47, "grad_norm": 0.18093236424530848, "learning_rate": 0.00017411286492223632, "loss": 0.4834, "step": 793 }, { "epoch": 0.47, "grad_norm": 0.18428120434597678, "learning_rate": 0.000174049721797734, "loss": 0.5032, "step": 794 }, { "epoch": 0.47, "grad_norm": 0.20829275110131446, "learning_rate": 0.00017398651323820958, "loss": 0.5844, "step": 795 }, { "epoch": 0.47, "grad_norm": 0.20484798677763622, "learning_rate": 0.00017392323929951812, "loss": 0.5674, "step": 796 }, { "epoch": 0.48, "grad_norm": 0.24390628538267795, "learning_rate": 0.0001738599000375725, "loss": 0.5415, "step": 797 }, { "epoch": 0.48, "grad_norm": 0.7159631217821198, "learning_rate": 0.00017379649550834327, "loss": 0.5248, "step": 798 }, { "epoch": 0.48, "grad_norm": 0.2153929799459398, "learning_rate": 0.00017373302576785874, "loss": 0.5362, "step": 799 }, { "epoch": 0.48, "grad_norm": 0.18434502268826083, "learning_rate": 0.00017366949087220472, "loss": 0.5179, "step": 800 }, { "epoch": 0.48, "grad_norm": 0.1993458339623336, "learning_rate": 0.0001736058908775247, "loss": 0.5378, "step": 801 }, { "epoch": 0.48, "grad_norm": 0.20482117487035287, "learning_rate": 0.0001735422258400197, "loss": 0.5066, "step": 802 }, { "epoch": 0.48, "grad_norm": 0.22481951556352617, "learning_rate": 0.0001734784958159481, "loss": 0.5504, "step": 803 }, { "epoch": 0.48, "grad_norm": 0.20893222575857784, "learning_rate": 0.00017341470086162586, "loss": 0.5558, "step": 804 }, { "epoch": 0.48, "grad_norm": 0.21011978723049574, "learning_rate": 0.0001733508410334262, "loss": 0.5164, "step": 805 }, { "epoch": 0.48, "grad_norm": 0.19493427746334713, "learning_rate": 0.0001732869163877797, "loss": 0.4928, "step": 806 }, { "epoch": 0.48, "grad_norm": 0.21183476672026114, "learning_rate": 0.00017322292698117425, "loss": 0.539, "step": 807 }, { "epoch": 0.48, "grad_norm": 0.19833380077404217, "learning_rate": 0.00017315887287015492, "loss": 0.5271, "step": 808 }, { "epoch": 0.48, "grad_norm": 0.1914374518219283, "learning_rate": 0.000173094754111324, "loss": 0.5408, "step": 809 }, { "epoch": 0.48, "grad_norm": 0.2086814492670768, "learning_rate": 0.00017303057076134085, "loss": 0.5289, "step": 810 }, { "epoch": 0.48, "grad_norm": 0.20957903788826676, "learning_rate": 0.000172966322876922, "loss": 0.4998, "step": 811 }, { "epoch": 0.48, "grad_norm": 0.20998255172298386, "learning_rate": 0.00017290201051484085, "loss": 0.5481, "step": 812 }, { "epoch": 0.49, "grad_norm": 0.2071975609134585, "learning_rate": 0.00017283763373192798, "loss": 0.5183, "step": 813 }, { "epoch": 0.49, "grad_norm": 0.21738328519054306, "learning_rate": 0.00017277319258507073, "loss": 0.539, "step": 814 }, { "epoch": 0.49, "grad_norm": 0.20518040499899, "learning_rate": 0.0001727086871312134, "loss": 0.5109, "step": 815 }, { "epoch": 0.49, "grad_norm": 0.19341379491491822, "learning_rate": 0.00017264411742735707, "loss": 0.4882, "step": 816 }, { "epoch": 0.49, "grad_norm": 0.23128359760674316, "learning_rate": 0.00017257948353055963, "loss": 0.547, "step": 817 }, { "epoch": 0.49, "grad_norm": 0.1960131633047162, "learning_rate": 0.0001725147854979357, "loss": 0.5467, "step": 818 }, { "epoch": 0.49, "grad_norm": 0.21053602560644855, "learning_rate": 0.00017245002338665656, "loss": 0.5644, "step": 819 }, { "epoch": 0.49, "grad_norm": 0.19602457133539752, "learning_rate": 0.00017238519725395007, "loss": 0.5121, "step": 820 }, { "epoch": 0.49, "grad_norm": 0.1923459024283483, "learning_rate": 0.00017232030715710076, "loss": 0.5335, "step": 821 }, { "epoch": 0.49, "grad_norm": 0.19919783133579333, "learning_rate": 0.00017225535315344955, "loss": 0.5076, "step": 822 }, { "epoch": 0.49, "grad_norm": 0.23727428892467575, "learning_rate": 0.00017219033530039397, "loss": 0.5396, "step": 823 }, { "epoch": 0.49, "grad_norm": 0.18048505778792392, "learning_rate": 0.00017212525365538792, "loss": 0.467, "step": 824 }, { "epoch": 0.49, "grad_norm": 0.20071702267002645, "learning_rate": 0.00017206010827594163, "loss": 0.5217, "step": 825 }, { "epoch": 0.49, "grad_norm": 0.20315216612025339, "learning_rate": 0.0001719948992196217, "loss": 0.4975, "step": 826 }, { "epoch": 0.49, "grad_norm": 0.2142259292765235, "learning_rate": 0.00017192962654405096, "loss": 0.5148, "step": 827 }, { "epoch": 0.49, "grad_norm": 0.19450012283752555, "learning_rate": 0.00017186429030690848, "loss": 0.5297, "step": 828 }, { "epoch": 0.49, "grad_norm": 0.19853162923467543, "learning_rate": 0.00017179889056592954, "loss": 0.547, "step": 829 }, { "epoch": 0.5, "grad_norm": 0.24873174470750906, "learning_rate": 0.00017173342737890544, "loss": 0.563, "step": 830 }, { "epoch": 0.5, "grad_norm": 0.18593730182623175, "learning_rate": 0.00017166790080368357, "loss": 0.4647, "step": 831 }, { "epoch": 0.5, "grad_norm": 0.19387710879340586, "learning_rate": 0.00017160231089816748, "loss": 0.5313, "step": 832 }, { "epoch": 0.5, "grad_norm": 0.20818447206363588, "learning_rate": 0.00017153665772031643, "loss": 0.5333, "step": 833 }, { "epoch": 0.5, "grad_norm": 0.17584822143732362, "learning_rate": 0.0001714709413281458, "loss": 0.4467, "step": 834 }, { "epoch": 0.5, "grad_norm": 0.19622166504995672, "learning_rate": 0.00017140516177972676, "loss": 0.5129, "step": 835 }, { "epoch": 0.5, "grad_norm": 0.18822988249157332, "learning_rate": 0.00017133931913318625, "loss": 0.5186, "step": 836 }, { "epoch": 0.5, "grad_norm": 0.2021164051164271, "learning_rate": 0.00017127341344670696, "loss": 0.551, "step": 837 }, { "epoch": 0.5, "grad_norm": 0.19354327476685654, "learning_rate": 0.00017120744477852745, "loss": 0.5001, "step": 838 }, { "epoch": 0.5, "grad_norm": 0.2202448214811264, "learning_rate": 0.00017114141318694167, "loss": 0.5516, "step": 839 }, { "epoch": 0.5, "grad_norm": 0.2152071248011757, "learning_rate": 0.00017107531873029942, "loss": 0.603, "step": 840 }, { "epoch": 0.5, "grad_norm": 0.21182530477312786, "learning_rate": 0.0001710091614670059, "loss": 0.522, "step": 841 }, { "epoch": 0.5, "grad_norm": 0.19970970678249778, "learning_rate": 0.00017094294145552188, "loss": 0.547, "step": 842 }, { "epoch": 0.5, "grad_norm": 0.19827173723610417, "learning_rate": 0.00017087665875436354, "loss": 0.5238, "step": 843 }, { "epoch": 0.5, "grad_norm": 0.20841052076060687, "learning_rate": 0.00017081031342210245, "loss": 0.5438, "step": 844 }, { "epoch": 0.5, "grad_norm": 0.20276205913776804, "learning_rate": 0.0001707439055173656, "loss": 0.541, "step": 845 }, { "epoch": 0.5, "grad_norm": 0.2235538813636529, "learning_rate": 0.00017067743509883515, "loss": 0.5496, "step": 846 }, { "epoch": 0.51, "grad_norm": 0.19092318633528002, "learning_rate": 0.00017061090222524863, "loss": 0.47, "step": 847 }, { "epoch": 0.51, "grad_norm": 0.23200775329397214, "learning_rate": 0.00017054430695539864, "loss": 0.5268, "step": 848 }, { "epoch": 0.51, "grad_norm": 0.22029427294529383, "learning_rate": 0.00017047764934813303, "loss": 0.499, "step": 849 }, { "epoch": 0.51, "grad_norm": 0.22601440602605696, "learning_rate": 0.00017041092946235467, "loss": 0.5593, "step": 850 }, { "epoch": 0.51, "grad_norm": 0.18461060416824568, "learning_rate": 0.00017034414735702145, "loss": 0.495, "step": 851 }, { "epoch": 0.51, "grad_norm": 0.1901253135554411, "learning_rate": 0.0001702773030911463, "loss": 0.5333, "step": 852 }, { "epoch": 0.51, "grad_norm": 0.21957939715433938, "learning_rate": 0.00017021039672379703, "loss": 0.5019, "step": 853 }, { "epoch": 0.51, "grad_norm": 0.18779803577301077, "learning_rate": 0.00017014342831409634, "loss": 0.5079, "step": 854 }, { "epoch": 0.51, "grad_norm": 0.22887531917427628, "learning_rate": 0.00017007639792122173, "loss": 0.537, "step": 855 }, { "epoch": 0.51, "grad_norm": 0.20978489469439648, "learning_rate": 0.00017000930560440554, "loss": 0.5488, "step": 856 }, { "epoch": 0.51, "grad_norm": 0.21114765352664955, "learning_rate": 0.0001699421514229348, "loss": 0.55, "step": 857 }, { "epoch": 0.51, "grad_norm": 0.1963314578424483, "learning_rate": 0.00016987493543615115, "loss": 0.487, "step": 858 }, { "epoch": 0.51, "grad_norm": 0.21946023399145406, "learning_rate": 0.0001698076577034509, "loss": 0.5174, "step": 859 }, { "epoch": 0.51, "grad_norm": 0.20119091495408187, "learning_rate": 0.00016974031828428495, "loss": 0.5127, "step": 860 }, { "epoch": 0.51, "grad_norm": 0.19098558699810822, "learning_rate": 0.00016967291723815863, "loss": 0.5776, "step": 861 }, { "epoch": 0.51, "grad_norm": 0.20834795709316445, "learning_rate": 0.00016960545462463183, "loss": 0.4853, "step": 862 }, { "epoch": 0.51, "grad_norm": 0.20553975175824055, "learning_rate": 0.0001695379305033187, "loss": 0.5244, "step": 863 }, { "epoch": 0.52, "grad_norm": 0.38517636125953164, "learning_rate": 0.00016947034493388786, "loss": 0.5314, "step": 864 }, { "epoch": 0.52, "grad_norm": 0.19821744800803398, "learning_rate": 0.00016940269797606228, "loss": 0.5104, "step": 865 }, { "epoch": 0.52, "grad_norm": 0.21102895742419708, "learning_rate": 0.00016933498968961898, "loss": 0.5355, "step": 866 }, { "epoch": 0.52, "grad_norm": 0.202288584940297, "learning_rate": 0.00016926722013438936, "loss": 0.5497, "step": 867 }, { "epoch": 0.52, "grad_norm": 0.20339292404553427, "learning_rate": 0.00016919938937025886, "loss": 0.5655, "step": 868 }, { "epoch": 0.52, "grad_norm": 0.23789079718932474, "learning_rate": 0.00016913149745716703, "loss": 0.5312, "step": 869 }, { "epoch": 0.52, "grad_norm": 0.1930501027423073, "learning_rate": 0.00016906354445510747, "loss": 0.5119, "step": 870 }, { "epoch": 0.52, "grad_norm": 0.20670905101801587, "learning_rate": 0.0001689955304241278, "loss": 0.5297, "step": 871 }, { "epoch": 0.52, "grad_norm": 0.18514362407781898, "learning_rate": 0.0001689274554243294, "loss": 0.4913, "step": 872 }, { "epoch": 0.52, "grad_norm": 0.22536281121632565, "learning_rate": 0.00016885931951586783, "loss": 0.5236, "step": 873 }, { "epoch": 0.52, "grad_norm": 0.22244899593514914, "learning_rate": 0.00016879112275895215, "loss": 0.5305, "step": 874 }, { "epoch": 0.52, "grad_norm": 0.19941629299670782, "learning_rate": 0.00016872286521384537, "loss": 0.5809, "step": 875 }, { "epoch": 0.52, "grad_norm": 0.20169129247248377, "learning_rate": 0.0001686545469408642, "loss": 0.5081, "step": 876 }, { "epoch": 0.52, "grad_norm": 0.21516300195343227, "learning_rate": 0.000168586168000379, "loss": 0.5084, "step": 877 }, { "epoch": 0.52, "grad_norm": 0.21422988118226038, "learning_rate": 0.00016851772845281367, "loss": 0.5246, "step": 878 }, { "epoch": 0.52, "grad_norm": 0.19390871621362155, "learning_rate": 0.00016844922835864575, "loss": 0.5528, "step": 879 }, { "epoch": 0.53, "grad_norm": 0.21294605765564745, "learning_rate": 0.00016838066777840629, "loss": 0.5971, "step": 880 }, { "epoch": 0.53, "grad_norm": 0.17876424837335442, "learning_rate": 0.00016831204677267975, "loss": 0.4906, "step": 881 }, { "epoch": 0.53, "grad_norm": 0.21551555099302896, "learning_rate": 0.00016824336540210402, "loss": 0.5501, "step": 882 }, { "epoch": 0.53, "grad_norm": 0.2046934200181805, "learning_rate": 0.0001681746237273702, "loss": 0.5372, "step": 883 }, { "epoch": 0.53, "grad_norm": 0.19709464178914807, "learning_rate": 0.00016810582180922293, "loss": 0.5479, "step": 884 }, { "epoch": 0.53, "grad_norm": 0.19373994384361878, "learning_rate": 0.00016803695970845985, "loss": 0.5038, "step": 885 }, { "epoch": 0.53, "grad_norm": 0.19009129266864652, "learning_rate": 0.0001679680374859319, "loss": 0.5047, "step": 886 }, { "epoch": 0.53, "grad_norm": 0.19438125438775203, "learning_rate": 0.0001678990552025431, "loss": 0.5241, "step": 887 }, { "epoch": 0.53, "grad_norm": 0.23198116415021436, "learning_rate": 0.00016783001291925055, "loss": 0.5384, "step": 888 }, { "epoch": 0.53, "grad_norm": 0.19609980449580128, "learning_rate": 0.00016776091069706442, "loss": 0.5096, "step": 889 }, { "epoch": 0.53, "grad_norm": 0.19406158834918122, "learning_rate": 0.00016769174859704783, "loss": 0.494, "step": 890 }, { "epoch": 0.53, "grad_norm": 0.21696546174997547, "learning_rate": 0.00016762252668031674, "loss": 0.5064, "step": 891 }, { "epoch": 0.53, "grad_norm": 0.18219835689033384, "learning_rate": 0.00016755324500804, "loss": 0.5172, "step": 892 }, { "epoch": 0.53, "grad_norm": 0.20040255213813227, "learning_rate": 0.00016748390364143938, "loss": 0.5691, "step": 893 }, { "epoch": 0.53, "grad_norm": 0.231096201209158, "learning_rate": 0.00016741450264178917, "loss": 0.5921, "step": 894 }, { "epoch": 0.53, "grad_norm": 0.1949281480708862, "learning_rate": 0.00016734504207041663, "loss": 0.5268, "step": 895 }, { "epoch": 0.53, "grad_norm": 0.20701715584416944, "learning_rate": 0.00016727552198870135, "loss": 0.5415, "step": 896 }, { "epoch": 0.54, "grad_norm": 0.19117861330981445, "learning_rate": 0.00016720594245807582, "loss": 0.5208, "step": 897 }, { "epoch": 0.54, "grad_norm": 0.203448739160119, "learning_rate": 0.00016713630354002484, "loss": 0.4925, "step": 898 }, { "epoch": 0.54, "grad_norm": 0.1961421908595443, "learning_rate": 0.00016706660529608583, "loss": 0.5173, "step": 899 }, { "epoch": 0.54, "grad_norm": 0.21571781486847058, "learning_rate": 0.0001669968477878485, "loss": 0.5288, "step": 900 }, { "epoch": 0.54, "grad_norm": 0.1951469824130785, "learning_rate": 0.00016692703107695507, "loss": 0.5454, "step": 901 }, { "epoch": 0.54, "grad_norm": 0.20181422096222762, "learning_rate": 0.00016685715522509994, "loss": 0.5266, "step": 902 }, { "epoch": 0.54, "grad_norm": 0.20297647148502806, "learning_rate": 0.0001667872202940299, "loss": 0.5621, "step": 903 }, { "epoch": 0.54, "grad_norm": 0.21597435470412332, "learning_rate": 0.0001667172263455438, "loss": 0.5458, "step": 904 }, { "epoch": 0.54, "grad_norm": 0.18789940455022036, "learning_rate": 0.00016664717344149277, "loss": 0.5198, "step": 905 }, { "epoch": 0.54, "grad_norm": 0.20826993025429438, "learning_rate": 0.00016657706164378, "loss": 0.5513, "step": 906 }, { "epoch": 0.54, "grad_norm": 0.20437016308518594, "learning_rate": 0.00016650689101436073, "loss": 0.549, "step": 907 }, { "epoch": 0.54, "grad_norm": 0.19686798551292672, "learning_rate": 0.00016643666161524217, "loss": 0.529, "step": 908 }, { "epoch": 0.54, "grad_norm": 0.19123982383563307, "learning_rate": 0.00016636637350848338, "loss": 0.4862, "step": 909 }, { "epoch": 0.54, "grad_norm": 0.1949343721096402, "learning_rate": 0.00016629602675619548, "loss": 0.5535, "step": 910 }, { "epoch": 0.54, "grad_norm": 0.19482047517370968, "learning_rate": 0.0001662256214205413, "loss": 0.5149, "step": 911 }, { "epoch": 0.54, "grad_norm": 0.19782922305356188, "learning_rate": 0.00016615515756373533, "loss": 0.5132, "step": 912 }, { "epoch": 0.54, "grad_norm": 0.19929182594748354, "learning_rate": 0.00016608463524804407, "loss": 0.4978, "step": 913 }, { "epoch": 0.55, "grad_norm": 0.20329183437505496, "learning_rate": 0.0001660140545357854, "loss": 0.554, "step": 914 }, { "epoch": 0.55, "grad_norm": 0.20759199356585792, "learning_rate": 0.00016594341548932894, "loss": 0.5478, "step": 915 }, { "epoch": 0.55, "grad_norm": 0.2141758006074044, "learning_rate": 0.0001658727181710958, "loss": 0.5097, "step": 916 }, { "epoch": 0.55, "grad_norm": 0.19823795581184833, "learning_rate": 0.0001658019626435586, "loss": 0.4997, "step": 917 }, { "epoch": 0.55, "grad_norm": 0.19545826935570532, "learning_rate": 0.00016573114896924147, "loss": 0.4973, "step": 918 }, { "epoch": 0.55, "grad_norm": 0.2011307790801751, "learning_rate": 0.0001656602772107198, "loss": 0.5683, "step": 919 }, { "epoch": 0.55, "grad_norm": 0.220529866848964, "learning_rate": 0.00016558934743062035, "loss": 0.574, "step": 920 }, { "epoch": 0.55, "grad_norm": 0.19374991809265163, "learning_rate": 0.00016551835969162118, "loss": 0.4881, "step": 921 }, { "epoch": 0.55, "grad_norm": 0.188760632867396, "learning_rate": 0.00016544731405645154, "loss": 0.5035, "step": 922 }, { "epoch": 0.55, "grad_norm": 0.21165086312587134, "learning_rate": 0.00016537621058789194, "loss": 0.5457, "step": 923 }, { "epoch": 0.55, "grad_norm": 0.22858469252110178, "learning_rate": 0.00016530504934877377, "loss": 0.5361, "step": 924 }, { "epoch": 0.55, "grad_norm": 0.22860822822332005, "learning_rate": 0.0001652338304019797, "loss": 0.5317, "step": 925 }, { "epoch": 0.55, "grad_norm": 0.18446055255941704, "learning_rate": 0.00016516255381044323, "loss": 0.5384, "step": 926 }, { "epoch": 0.55, "grad_norm": 0.19147093978335786, "learning_rate": 0.00016509121963714896, "loss": 0.5462, "step": 927 }, { "epoch": 0.55, "grad_norm": 0.2010805530061722, "learning_rate": 0.00016501982794513219, "loss": 0.5224, "step": 928 }, { "epoch": 0.55, "grad_norm": 0.2194018595901758, "learning_rate": 0.00016494837879747916, "loss": 0.5403, "step": 929 }, { "epoch": 0.55, "grad_norm": 0.18748965563201492, "learning_rate": 0.00016487687225732694, "loss": 0.5024, "step": 930 }, { "epoch": 0.56, "grad_norm": 0.18976038729649813, "learning_rate": 0.00016480530838786312, "loss": 0.5373, "step": 931 }, { "epoch": 0.56, "grad_norm": 0.20751165573259855, "learning_rate": 0.00016473368725232614, "loss": 0.5524, "step": 932 }, { "epoch": 0.56, "grad_norm": 0.23600220568039398, "learning_rate": 0.0001646620089140049, "loss": 0.5315, "step": 933 }, { "epoch": 0.56, "grad_norm": 0.17133056460736396, "learning_rate": 0.00016459027343623906, "loss": 0.4753, "step": 934 }, { "epoch": 0.56, "grad_norm": 0.1864042334593607, "learning_rate": 0.00016451848088241847, "loss": 0.5617, "step": 935 }, { "epoch": 0.56, "grad_norm": 0.20853995853524807, "learning_rate": 0.00016444663131598365, "loss": 0.5149, "step": 936 }, { "epoch": 0.56, "grad_norm": 0.20631366863701492, "learning_rate": 0.00016437472480042544, "loss": 0.5712, "step": 937 }, { "epoch": 0.56, "grad_norm": 0.18606219350180553, "learning_rate": 0.00016430276139928494, "loss": 0.5147, "step": 938 }, { "epoch": 0.56, "grad_norm": 0.18702291336595328, "learning_rate": 0.00016423074117615362, "loss": 0.4982, "step": 939 }, { "epoch": 0.56, "grad_norm": 0.19313224950151772, "learning_rate": 0.00016415866419467308, "loss": 0.5283, "step": 940 }, { "epoch": 0.56, "grad_norm": 0.19468213657571698, "learning_rate": 0.00016408653051853505, "loss": 0.5222, "step": 941 }, { "epoch": 0.56, "grad_norm": 0.1900318796388756, "learning_rate": 0.00016401434021148155, "loss": 0.5215, "step": 942 }, { "epoch": 0.56, "grad_norm": 0.20563975975379326, "learning_rate": 0.00016394209333730437, "loss": 0.5332, "step": 943 }, { "epoch": 0.56, "grad_norm": 0.19499178353885577, "learning_rate": 0.0001638697899598455, "loss": 0.5089, "step": 944 }, { "epoch": 0.56, "grad_norm": 0.193828603254631, "learning_rate": 0.00016379743014299675, "loss": 0.4814, "step": 945 }, { "epoch": 0.56, "grad_norm": 0.2077835011524252, "learning_rate": 0.00016372501395069984, "loss": 0.5874, "step": 946 }, { "epoch": 0.57, "grad_norm": 0.19612398295653072, "learning_rate": 0.0001636525414469463, "loss": 0.5526, "step": 947 }, { "epoch": 0.57, "grad_norm": 0.20309531438683182, "learning_rate": 0.00016358001269577743, "loss": 0.5197, "step": 948 }, { "epoch": 0.57, "grad_norm": 0.1915526531241072, "learning_rate": 0.00016350742776128423, "loss": 0.4595, "step": 949 }, { "epoch": 0.57, "grad_norm": 0.18881194099828352, "learning_rate": 0.00016343478670760732, "loss": 0.5367, "step": 950 }, { "epoch": 0.57, "grad_norm": 0.18139152767925906, "learning_rate": 0.00016336208959893698, "loss": 0.4768, "step": 951 }, { "epoch": 0.57, "grad_norm": 0.17769254483980393, "learning_rate": 0.00016328933649951293, "loss": 0.4676, "step": 952 }, { "epoch": 0.57, "grad_norm": 0.20965333812663062, "learning_rate": 0.00016321652747362445, "loss": 0.5555, "step": 953 }, { "epoch": 0.57, "grad_norm": 0.19321230034521153, "learning_rate": 0.00016314366258561016, "loss": 0.5295, "step": 954 }, { "epoch": 0.57, "grad_norm": 0.18597567849314112, "learning_rate": 0.00016307074189985814, "loss": 0.4884, "step": 955 }, { "epoch": 0.57, "grad_norm": 0.18434368773024823, "learning_rate": 0.0001629977654808057, "loss": 0.4824, "step": 956 }, { "epoch": 0.57, "grad_norm": 0.20328216484120545, "learning_rate": 0.0001629247333929394, "loss": 0.58, "step": 957 }, { "epoch": 0.57, "grad_norm": 0.21563942573107894, "learning_rate": 0.00016285164570079504, "loss": 0.5822, "step": 958 }, { "epoch": 0.57, "grad_norm": 0.2001040967836008, "learning_rate": 0.00016277850246895753, "loss": 0.5302, "step": 959 }, { "epoch": 0.57, "grad_norm": 0.1937847274208307, "learning_rate": 0.0001627053037620609, "loss": 0.4953, "step": 960 }, { "epoch": 0.57, "grad_norm": 0.19489799779815298, "learning_rate": 0.00016263204964478807, "loss": 0.5693, "step": 961 }, { "epoch": 0.57, "grad_norm": 0.1746038558868276, "learning_rate": 0.00016255874018187113, "loss": 0.4843, "step": 962 }, { "epoch": 0.57, "grad_norm": 0.19886109498653962, "learning_rate": 0.00016248537543809085, "loss": 0.5251, "step": 963 }, { "epoch": 0.58, "grad_norm": 0.23968663323524653, "learning_rate": 0.00016241195547827704, "loss": 0.5842, "step": 964 }, { "epoch": 0.58, "grad_norm": 0.17879861073069336, "learning_rate": 0.00016233848036730818, "loss": 0.5271, "step": 965 }, { "epoch": 0.58, "grad_norm": 0.18682851702979986, "learning_rate": 0.00016226495017011155, "loss": 0.5117, "step": 966 }, { "epoch": 0.58, "grad_norm": 0.17994919875574256, "learning_rate": 0.0001621913649516631, "loss": 0.5277, "step": 967 }, { "epoch": 0.58, "grad_norm": 0.18619266507890767, "learning_rate": 0.00016211772477698737, "loss": 0.5564, "step": 968 }, { "epoch": 0.58, "grad_norm": 0.18791500150949197, "learning_rate": 0.0001620440297111575, "loss": 0.5138, "step": 969 }, { "epoch": 0.58, "grad_norm": 0.20203542918576534, "learning_rate": 0.00016197027981929506, "loss": 0.5027, "step": 970 }, { "epoch": 0.58, "grad_norm": 0.2028531086326741, "learning_rate": 0.00016189647516657018, "loss": 0.5271, "step": 971 }, { "epoch": 0.58, "grad_norm": 0.20588712233829473, "learning_rate": 0.0001618226158182013, "loss": 0.5453, "step": 972 }, { "epoch": 0.58, "grad_norm": 0.17483034796275576, "learning_rate": 0.00016174870183945523, "loss": 0.5213, "step": 973 }, { "epoch": 0.58, "grad_norm": 0.1869321547342615, "learning_rate": 0.00016167473329564705, "loss": 0.5445, "step": 974 }, { "epoch": 0.58, "grad_norm": 0.20261074022832953, "learning_rate": 0.00016160071025213998, "loss": 0.5607, "step": 975 }, { "epoch": 0.58, "grad_norm": 0.20067938165729438, "learning_rate": 0.00016152663277434556, "loss": 0.5145, "step": 976 }, { "epoch": 0.58, "grad_norm": 0.1886528405831426, "learning_rate": 0.0001614525009277233, "loss": 0.5212, "step": 977 }, { "epoch": 0.58, "grad_norm": 0.20396953786191716, "learning_rate": 0.00016137831477778077, "loss": 0.5546, "step": 978 }, { "epoch": 0.58, "grad_norm": 0.22783734025618152, "learning_rate": 0.00016130407439007355, "loss": 0.5541, "step": 979 }, { "epoch": 0.58, "grad_norm": 0.2062351395059516, "learning_rate": 0.0001612297798302052, "loss": 0.5276, "step": 980 }, { "epoch": 0.59, "grad_norm": 0.1971656346477824, "learning_rate": 0.00016115543116382707, "loss": 0.5232, "step": 981 }, { "epoch": 0.59, "grad_norm": 0.19528017198031397, "learning_rate": 0.00016108102845663832, "loss": 0.5468, "step": 982 }, { "epoch": 0.59, "grad_norm": 0.213721647209808, "learning_rate": 0.00016100657177438592, "loss": 0.5617, "step": 983 }, { "epoch": 0.59, "grad_norm": 0.21222822398944205, "learning_rate": 0.0001609320611828645, "loss": 0.5014, "step": 984 }, { "epoch": 0.59, "grad_norm": 0.19722416264931983, "learning_rate": 0.0001608574967479163, "loss": 0.5811, "step": 985 }, { "epoch": 0.59, "grad_norm": 0.19590127100369606, "learning_rate": 0.00016078287853543125, "loss": 0.518, "step": 986 }, { "epoch": 0.59, "grad_norm": 0.19341001514547207, "learning_rate": 0.00016070820661134668, "loss": 0.4885, "step": 987 }, { "epoch": 0.59, "grad_norm": 0.20898635250980999, "learning_rate": 0.00016063348104164744, "loss": 0.4751, "step": 988 }, { "epoch": 0.59, "grad_norm": 0.21215717136848186, "learning_rate": 0.00016055870189236578, "loss": 0.5211, "step": 989 }, { "epoch": 0.59, "grad_norm": 0.24258818484100222, "learning_rate": 0.00016048386922958127, "loss": 0.4836, "step": 990 }, { "epoch": 0.59, "grad_norm": 0.1798892664861584, "learning_rate": 0.00016040898311942082, "loss": 0.4984, "step": 991 }, { "epoch": 0.59, "grad_norm": 0.18450252398162806, "learning_rate": 0.0001603340436280585, "loss": 0.5306, "step": 992 }, { "epoch": 0.59, "grad_norm": 0.18837615304247687, "learning_rate": 0.00016025905082171562, "loss": 0.4754, "step": 993 }, { "epoch": 0.59, "grad_norm": 0.2214407132100871, "learning_rate": 0.00016018400476666055, "loss": 0.6057, "step": 994 }, { "epoch": 0.59, "grad_norm": 0.19318387417313984, "learning_rate": 0.00016010890552920875, "loss": 0.5205, "step": 995 }, { "epoch": 0.59, "grad_norm": 0.18092243312526385, "learning_rate": 0.00016003375317572263, "loss": 0.5283, "step": 996 }, { "epoch": 0.59, "grad_norm": 0.18814705771101395, "learning_rate": 0.00015995854777261161, "loss": 0.5231, "step": 997 }, { "epoch": 0.6, "grad_norm": 0.2411971952093798, "learning_rate": 0.00015988328938633191, "loss": 0.5382, "step": 998 }, { "epoch": 0.6, "grad_norm": 0.19563910470947196, "learning_rate": 0.00015980797808338664, "loss": 0.5228, "step": 999 }, { "epoch": 0.6, "grad_norm": 0.1859383484794571, "learning_rate": 0.00015973261393032563, "loss": 0.5166, "step": 1000 }, { "epoch": 0.6, "grad_norm": 0.19354728242000344, "learning_rate": 0.0001596571969937454, "loss": 0.5317, "step": 1001 }, { "epoch": 0.6, "grad_norm": 0.19477100537075334, "learning_rate": 0.0001595817273402891, "loss": 0.5389, "step": 1002 }, { "epoch": 0.6, "grad_norm": 0.19658938430385506, "learning_rate": 0.00015950620503664658, "loss": 0.5576, "step": 1003 }, { "epoch": 0.6, "grad_norm": 0.18705811284094567, "learning_rate": 0.00015943063014955402, "loss": 0.5339, "step": 1004 }, { "epoch": 0.6, "grad_norm": 0.2238775349637154, "learning_rate": 0.00015935500274579426, "loss": 0.5348, "step": 1005 }, { "epoch": 0.6, "grad_norm": 0.20521877935386715, "learning_rate": 0.00015927932289219642, "loss": 0.5817, "step": 1006 }, { "epoch": 0.6, "grad_norm": 0.1748831609387364, "learning_rate": 0.00015920359065563604, "loss": 0.5003, "step": 1007 }, { "epoch": 0.6, "grad_norm": 0.19535482397472856, "learning_rate": 0.0001591278061030349, "loss": 0.5075, "step": 1008 }, { "epoch": 0.6, "grad_norm": 0.2091479313897058, "learning_rate": 0.00015905196930136097, "loss": 0.5555, "step": 1009 }, { "epoch": 0.6, "grad_norm": 0.19083157236229129, "learning_rate": 0.0001589760803176286, "loss": 0.5166, "step": 1010 }, { "epoch": 0.6, "grad_norm": 0.19553974503051083, "learning_rate": 0.00015890013921889795, "loss": 0.5978, "step": 1011 }, { "epoch": 0.6, "grad_norm": 0.18073049964870547, "learning_rate": 0.00015882414607227546, "loss": 0.5136, "step": 1012 }, { "epoch": 0.6, "grad_norm": 0.18595442399132076, "learning_rate": 0.00015874810094491343, "loss": 0.494, "step": 1013 }, { "epoch": 0.61, "grad_norm": 0.18565792372097342, "learning_rate": 0.00015867200390401023, "loss": 0.5211, "step": 1014 }, { "epoch": 0.61, "grad_norm": 0.19074958534189332, "learning_rate": 0.00015859585501681, "loss": 0.5371, "step": 1015 }, { "epoch": 0.61, "grad_norm": 0.22738878543819066, "learning_rate": 0.00015851965435060262, "loss": 0.5055, "step": 1016 }, { "epoch": 0.61, "grad_norm": 0.2316623260697804, "learning_rate": 0.00015844340197272393, "loss": 0.5467, "step": 1017 }, { "epoch": 0.61, "grad_norm": 0.18501859711102792, "learning_rate": 0.00015836709795055532, "loss": 0.5152, "step": 1018 }, { "epoch": 0.61, "grad_norm": 0.2000854019680551, "learning_rate": 0.0001582907423515239, "loss": 0.4975, "step": 1019 }, { "epoch": 0.61, "grad_norm": 0.2164338400016087, "learning_rate": 0.00015821433524310224, "loss": 0.5108, "step": 1020 }, { "epoch": 0.61, "grad_norm": 0.1794884681634538, "learning_rate": 0.00015813787669280855, "loss": 0.4852, "step": 1021 }, { "epoch": 0.61, "grad_norm": 0.18483080731616555, "learning_rate": 0.00015806136676820639, "loss": 0.5126, "step": 1022 }, { "epoch": 0.61, "grad_norm": 0.18580223692725362, "learning_rate": 0.00015798480553690482, "loss": 0.5068, "step": 1023 }, { "epoch": 0.61, "grad_norm": 0.2061029141413908, "learning_rate": 0.0001579081930665582, "loss": 0.5358, "step": 1024 }, { "epoch": 0.61, "grad_norm": 0.20179672861743628, "learning_rate": 0.00015783152942486613, "loss": 0.4904, "step": 1025 }, { "epoch": 0.61, "grad_norm": 0.1901456318733625, "learning_rate": 0.0001577548146795735, "loss": 0.5189, "step": 1026 }, { "epoch": 0.61, "grad_norm": 0.18700717565555658, "learning_rate": 0.00015767804889847025, "loss": 0.5053, "step": 1027 }, { "epoch": 0.61, "grad_norm": 0.20306080744009194, "learning_rate": 0.00015760123214939148, "loss": 0.5648, "step": 1028 }, { "epoch": 0.61, "grad_norm": 0.19427308531076118, "learning_rate": 0.00015752436450021742, "loss": 0.5317, "step": 1029 }, { "epoch": 0.61, "grad_norm": 0.17730364387799943, "learning_rate": 0.0001574474460188731, "loss": 0.4865, "step": 1030 }, { "epoch": 0.62, "grad_norm": 0.18919444977285255, "learning_rate": 0.00015737047677332863, "loss": 0.4482, "step": 1031 }, { "epoch": 0.62, "grad_norm": 0.2186748845065149, "learning_rate": 0.0001572934568315989, "loss": 0.5166, "step": 1032 }, { "epoch": 0.62, "grad_norm": 0.2157209010754937, "learning_rate": 0.00015721638626174354, "loss": 0.5, "step": 1033 }, { "epoch": 0.62, "grad_norm": 0.19854498695084713, "learning_rate": 0.00015713926513186702, "loss": 0.5833, "step": 1034 }, { "epoch": 0.62, "grad_norm": 0.19845689442164263, "learning_rate": 0.00015706209351011848, "loss": 0.5678, "step": 1035 }, { "epoch": 0.62, "grad_norm": 0.1816372371442818, "learning_rate": 0.00015698487146469163, "loss": 0.5403, "step": 1036 }, { "epoch": 0.62, "grad_norm": 0.1770141417061582, "learning_rate": 0.0001569075990638248, "loss": 0.4799, "step": 1037 }, { "epoch": 0.62, "grad_norm": 0.1828187317317013, "learning_rate": 0.00015683027637580066, "loss": 0.4879, "step": 1038 }, { "epoch": 0.62, "grad_norm": 0.20434187555872277, "learning_rate": 0.00015675290346894657, "loss": 0.5189, "step": 1039 }, { "epoch": 0.62, "grad_norm": 0.1906338108685848, "learning_rate": 0.00015667548041163406, "loss": 0.5162, "step": 1040 }, { "epoch": 0.62, "grad_norm": 0.19218887365666604, "learning_rate": 0.00015659800727227903, "loss": 0.5571, "step": 1041 }, { "epoch": 0.62, "grad_norm": 0.19183422180053736, "learning_rate": 0.00015652048411934167, "loss": 0.5144, "step": 1042 }, { "epoch": 0.62, "grad_norm": 0.19849906503502973, "learning_rate": 0.00015644291102132635, "loss": 0.5271, "step": 1043 }, { "epoch": 0.62, "grad_norm": 0.18585204906477085, "learning_rate": 0.0001563652880467816, "loss": 0.4906, "step": 1044 }, { "epoch": 0.62, "grad_norm": 0.20344554390359765, "learning_rate": 0.00015628761526429992, "loss": 0.5268, "step": 1045 }, { "epoch": 0.62, "grad_norm": 0.22108658658095226, "learning_rate": 0.00015620989274251797, "loss": 0.5253, "step": 1046 }, { "epoch": 0.62, "grad_norm": 0.1991875825405089, "learning_rate": 0.00015613212055011624, "loss": 0.5311, "step": 1047 }, { "epoch": 0.63, "grad_norm": 0.20172996858736694, "learning_rate": 0.0001560542987558192, "loss": 0.5008, "step": 1048 }, { "epoch": 0.63, "grad_norm": 0.1879727067503482, "learning_rate": 0.00015597642742839506, "loss": 0.539, "step": 1049 }, { "epoch": 0.63, "grad_norm": 0.2016771106491948, "learning_rate": 0.00015589850663665593, "loss": 0.5426, "step": 1050 }, { "epoch": 0.63, "grad_norm": 0.20217576692011496, "learning_rate": 0.0001558205364494575, "loss": 0.5315, "step": 1051 }, { "epoch": 0.63, "grad_norm": 0.19326432360974946, "learning_rate": 0.0001557425169356992, "loss": 0.4779, "step": 1052 }, { "epoch": 0.63, "grad_norm": 0.20958769456715673, "learning_rate": 0.000155664448164324, "loss": 0.5368, "step": 1053 }, { "epoch": 0.63, "grad_norm": 0.1944842076067339, "learning_rate": 0.00015558633020431835, "loss": 0.4948, "step": 1054 }, { "epoch": 0.63, "grad_norm": 0.19103650755521584, "learning_rate": 0.00015550816312471234, "loss": 0.4687, "step": 1055 }, { "epoch": 0.63, "grad_norm": 0.19130294148586216, "learning_rate": 0.00015542994699457925, "loss": 0.5241, "step": 1056 }, { "epoch": 0.63, "grad_norm": 0.19011766175405304, "learning_rate": 0.00015535168188303585, "loss": 0.5157, "step": 1057 }, { "epoch": 0.63, "grad_norm": 0.19308237510118917, "learning_rate": 0.00015527336785924213, "loss": 0.5158, "step": 1058 }, { "epoch": 0.63, "grad_norm": 0.18949297343958862, "learning_rate": 0.00015519500499240133, "loss": 0.5525, "step": 1059 }, { "epoch": 0.63, "grad_norm": 0.18554479264301466, "learning_rate": 0.00015511659335175985, "loss": 0.516, "step": 1060 }, { "epoch": 0.63, "grad_norm": 0.20363425340107272, "learning_rate": 0.00015503813300660717, "loss": 0.509, "step": 1061 }, { "epoch": 0.63, "grad_norm": 0.18613987636546375, "learning_rate": 0.0001549596240262758, "loss": 0.5097, "step": 1062 }, { "epoch": 0.63, "grad_norm": 0.20481511432281724, "learning_rate": 0.00015488106648014127, "loss": 0.5263, "step": 1063 }, { "epoch": 0.63, "grad_norm": 0.18895386399049763, "learning_rate": 0.00015480246043762198, "loss": 0.528, "step": 1064 }, { "epoch": 0.64, "grad_norm": 0.19514309309247793, "learning_rate": 0.00015472380596817922, "loss": 0.5444, "step": 1065 }, { "epoch": 0.64, "grad_norm": 0.2013716596665657, "learning_rate": 0.0001546451031413171, "loss": 0.535, "step": 1066 }, { "epoch": 0.64, "grad_norm": 0.2003122019096167, "learning_rate": 0.0001545663520265823, "loss": 0.5294, "step": 1067 }, { "epoch": 0.64, "grad_norm": 0.18908243290011195, "learning_rate": 0.00015448755269356442, "loss": 0.5366, "step": 1068 }, { "epoch": 0.64, "grad_norm": 0.1893813159340979, "learning_rate": 0.00015440870521189547, "loss": 0.5397, "step": 1069 }, { "epoch": 0.64, "grad_norm": 0.17856528748444775, "learning_rate": 0.00015432980965125008, "loss": 0.4996, "step": 1070 }, { "epoch": 0.64, "grad_norm": 0.20198848206655348, "learning_rate": 0.0001542508660813454, "loss": 0.5084, "step": 1071 }, { "epoch": 0.64, "grad_norm": 0.2027016976348092, "learning_rate": 0.00015417187457194092, "loss": 0.5835, "step": 1072 }, { "epoch": 0.64, "grad_norm": 0.19225289448462915, "learning_rate": 0.00015409283519283857, "loss": 0.5333, "step": 1073 }, { "epoch": 0.64, "grad_norm": 0.19832197684261557, "learning_rate": 0.00015401374801388254, "loss": 0.5183, "step": 1074 }, { "epoch": 0.64, "grad_norm": 0.18727329905053802, "learning_rate": 0.00015393461310495926, "loss": 0.5104, "step": 1075 }, { "epoch": 0.64, "grad_norm": 0.19371954596455335, "learning_rate": 0.0001538554305359974, "loss": 0.5482, "step": 1076 }, { "epoch": 0.64, "grad_norm": 0.17653026485031678, "learning_rate": 0.00015377620037696757, "loss": 0.4785, "step": 1077 }, { "epoch": 0.64, "grad_norm": 0.18746235874046693, "learning_rate": 0.00015369692269788266, "loss": 0.5432, "step": 1078 }, { "epoch": 0.64, "grad_norm": 0.20320517317727052, "learning_rate": 0.0001536175975687974, "loss": 0.5358, "step": 1079 }, { "epoch": 0.64, "grad_norm": 0.18239445216598227, "learning_rate": 0.00015353822505980854, "loss": 0.5133, "step": 1080 }, { "epoch": 0.64, "grad_norm": 0.2001726032915372, "learning_rate": 0.00015345880524105462, "loss": 0.5601, "step": 1081 }, { "epoch": 0.65, "grad_norm": 0.17441851789965093, "learning_rate": 0.00015337933818271597, "loss": 0.4992, "step": 1082 }, { "epoch": 0.65, "grad_norm": 0.19460177893040895, "learning_rate": 0.00015329982395501478, "loss": 0.5038, "step": 1083 }, { "epoch": 0.65, "grad_norm": 0.18954115748132863, "learning_rate": 0.00015322026262821488, "loss": 0.5691, "step": 1084 }, { "epoch": 0.65, "grad_norm": 0.18958791658091825, "learning_rate": 0.00015314065427262166, "loss": 0.513, "step": 1085 }, { "epoch": 0.65, "grad_norm": 0.18280595622163878, "learning_rate": 0.00015306099895858206, "loss": 0.5147, "step": 1086 }, { "epoch": 0.65, "grad_norm": 0.17678570607217403, "learning_rate": 0.00015298129675648462, "loss": 0.4952, "step": 1087 }, { "epoch": 0.65, "grad_norm": 0.17214270706367016, "learning_rate": 0.00015290154773675923, "loss": 0.4614, "step": 1088 }, { "epoch": 0.65, "grad_norm": 0.1874186691157256, "learning_rate": 0.00015282175196987721, "loss": 0.5354, "step": 1089 }, { "epoch": 0.65, "grad_norm": 0.18779920562332963, "learning_rate": 0.00015274190952635106, "loss": 0.4892, "step": 1090 }, { "epoch": 0.65, "grad_norm": 0.17670235761935024, "learning_rate": 0.00015266202047673467, "loss": 0.5265, "step": 1091 }, { "epoch": 0.65, "grad_norm": 0.1744750957023612, "learning_rate": 0.00015258208489162312, "loss": 0.547, "step": 1092 }, { "epoch": 0.65, "grad_norm": 0.17422295122430556, "learning_rate": 0.00015250210284165246, "loss": 0.5005, "step": 1093 }, { "epoch": 0.65, "grad_norm": 0.17453851271743742, "learning_rate": 0.00015242207439749992, "loss": 0.4727, "step": 1094 }, { "epoch": 0.65, "grad_norm": 0.2020628888829431, "learning_rate": 0.0001523419996298837, "loss": 0.5164, "step": 1095 }, { "epoch": 0.65, "grad_norm": 0.18036982996630943, "learning_rate": 0.00015226187860956295, "loss": 0.4998, "step": 1096 }, { "epoch": 0.65, "grad_norm": 0.19207762457147357, "learning_rate": 0.00015218171140733773, "loss": 0.4962, "step": 1097 }, { "epoch": 0.66, "grad_norm": 0.17728752409159226, "learning_rate": 0.00015210149809404875, "loss": 0.486, "step": 1098 }, { "epoch": 0.66, "grad_norm": 0.18658006486346662, "learning_rate": 0.00015202123874057761, "loss": 0.5701, "step": 1099 }, { "epoch": 0.66, "grad_norm": 0.1799589924022257, "learning_rate": 0.00015194093341784655, "loss": 0.508, "step": 1100 }, { "epoch": 0.66, "grad_norm": 0.18488881231943724, "learning_rate": 0.00015186058219681848, "loss": 0.5276, "step": 1101 }, { "epoch": 0.66, "grad_norm": 0.19172132491153843, "learning_rate": 0.00015178018514849678, "loss": 0.5103, "step": 1102 }, { "epoch": 0.66, "grad_norm": 0.18192159347628917, "learning_rate": 0.00015169974234392538, "loss": 0.5168, "step": 1103 }, { "epoch": 0.66, "grad_norm": 0.17390383147461494, "learning_rate": 0.00015161925385418867, "loss": 0.5117, "step": 1104 }, { "epoch": 0.66, "grad_norm": 0.18438067858194318, "learning_rate": 0.00015153871975041131, "loss": 0.5362, "step": 1105 }, { "epoch": 0.66, "grad_norm": 0.18794738239211156, "learning_rate": 0.00015145814010375841, "loss": 0.5284, "step": 1106 }, { "epoch": 0.66, "grad_norm": 0.18207793056557062, "learning_rate": 0.00015137751498543517, "loss": 0.5157, "step": 1107 }, { "epoch": 0.66, "grad_norm": 0.1820180090632629, "learning_rate": 0.00015129684446668713, "loss": 0.4754, "step": 1108 }, { "epoch": 0.66, "grad_norm": 0.17889600747067783, "learning_rate": 0.00015121612861879974, "loss": 0.4987, "step": 1109 }, { "epoch": 0.66, "grad_norm": 0.17971937316609232, "learning_rate": 0.00015113536751309878, "loss": 0.4911, "step": 1110 }, { "epoch": 0.66, "grad_norm": 0.18140328667930378, "learning_rate": 0.00015105456122094983, "loss": 0.5305, "step": 1111 }, { "epoch": 0.66, "grad_norm": 0.2025119429324591, "learning_rate": 0.00015097370981375838, "loss": 0.5386, "step": 1112 }, { "epoch": 0.66, "grad_norm": 0.19491035376315885, "learning_rate": 0.0001508928133629699, "loss": 0.4907, "step": 1113 }, { "epoch": 0.66, "grad_norm": 0.21078693446799357, "learning_rate": 0.00015081187194006962, "loss": 0.5365, "step": 1114 }, { "epoch": 0.67, "grad_norm": 0.18843997731657822, "learning_rate": 0.0001507308856165825, "loss": 0.5183, "step": 1115 }, { "epoch": 0.67, "grad_norm": 0.18301017548445006, "learning_rate": 0.00015064985446407321, "loss": 0.5365, "step": 1116 }, { "epoch": 0.67, "grad_norm": 0.20526106918611095, "learning_rate": 0.00015056877855414594, "loss": 0.5424, "step": 1117 }, { "epoch": 0.67, "grad_norm": 0.19687691349301994, "learning_rate": 0.00015048765795844457, "loss": 0.4767, "step": 1118 }, { "epoch": 0.67, "grad_norm": 0.24595956331043234, "learning_rate": 0.00015040649274865238, "loss": 0.5276, "step": 1119 }, { "epoch": 0.67, "grad_norm": 0.19028199168907478, "learning_rate": 0.000150325282996492, "loss": 0.5154, "step": 1120 }, { "epoch": 0.67, "grad_norm": 0.1869717933744596, "learning_rate": 0.00015024402877372562, "loss": 0.4994, "step": 1121 }, { "epoch": 0.67, "grad_norm": 0.19760879222012737, "learning_rate": 0.00015016273015215455, "loss": 0.5181, "step": 1122 }, { "epoch": 0.67, "grad_norm": 0.19770025526423807, "learning_rate": 0.00015008138720361942, "loss": 0.5224, "step": 1123 }, { "epoch": 0.67, "grad_norm": 0.21756294254565256, "learning_rate": 0.00015000000000000001, "loss": 0.5578, "step": 1124 }, { "epoch": 0.67, "grad_norm": 0.20716812622877365, "learning_rate": 0.0001499185686132152, "loss": 0.5812, "step": 1125 }, { "epoch": 0.67, "grad_norm": 0.20335171698711324, "learning_rate": 0.00014983709311522297, "loss": 0.5249, "step": 1126 }, { "epoch": 0.67, "grad_norm": 0.2176962106234944, "learning_rate": 0.0001497555735780201, "loss": 0.5856, "step": 1127 }, { "epoch": 0.67, "grad_norm": 0.1783334911039492, "learning_rate": 0.00014967401007364255, "loss": 0.4926, "step": 1128 }, { "epoch": 0.67, "grad_norm": 0.21009643426167063, "learning_rate": 0.0001495924026741649, "loss": 0.4825, "step": 1129 }, { "epoch": 0.67, "grad_norm": 0.182586366890212, "learning_rate": 0.0001495107514517007, "loss": 0.4694, "step": 1130 }, { "epoch": 0.67, "grad_norm": 0.19955812947692972, "learning_rate": 0.00014942905647840206, "loss": 0.5385, "step": 1131 }, { "epoch": 0.68, "grad_norm": 0.18833362414930832, "learning_rate": 0.0001493473178264599, "loss": 0.5331, "step": 1132 }, { "epoch": 0.68, "grad_norm": 0.19652826299925613, "learning_rate": 0.0001492655355681036, "loss": 0.5833, "step": 1133 }, { "epoch": 0.68, "grad_norm": 0.19744208734697974, "learning_rate": 0.00014918370977560122, "loss": 0.5147, "step": 1134 }, { "epoch": 0.68, "grad_norm": 0.19287794254421994, "learning_rate": 0.0001491018405212591, "loss": 0.4678, "step": 1135 }, { "epoch": 0.68, "grad_norm": 0.20401004694813352, "learning_rate": 0.00014901992787742219, "loss": 0.5388, "step": 1136 }, { "epoch": 0.68, "grad_norm": 0.1859539975227642, "learning_rate": 0.00014893797191647368, "loss": 0.5121, "step": 1137 }, { "epoch": 0.68, "grad_norm": 0.1777739643065349, "learning_rate": 0.00014885597271083499, "loss": 0.4917, "step": 1138 }, { "epoch": 0.68, "grad_norm": 0.24667049739226657, "learning_rate": 0.00014877393033296585, "loss": 0.5631, "step": 1139 }, { "epoch": 0.68, "grad_norm": 0.19660914123915638, "learning_rate": 0.00014869184485536408, "loss": 0.4955, "step": 1140 }, { "epoch": 0.68, "grad_norm": 0.1849639582602034, "learning_rate": 0.00014860971635056563, "loss": 0.5611, "step": 1141 }, { "epoch": 0.68, "grad_norm": 0.18618478737494773, "learning_rate": 0.00014852754489114444, "loss": 0.5668, "step": 1142 }, { "epoch": 0.68, "grad_norm": 0.18738098465743458, "learning_rate": 0.0001484453305497124, "loss": 0.491, "step": 1143 }, { "epoch": 0.68, "grad_norm": 0.2001668377337724, "learning_rate": 0.00014836307339891934, "loss": 0.5395, "step": 1144 }, { "epoch": 0.68, "grad_norm": 0.18840397893983923, "learning_rate": 0.00014828077351145282, "loss": 0.5466, "step": 1145 }, { "epoch": 0.68, "grad_norm": 0.18348054201307173, "learning_rate": 0.00014819843096003824, "loss": 0.551, "step": 1146 }, { "epoch": 0.68, "grad_norm": 0.17819691596218545, "learning_rate": 0.0001481160458174388, "loss": 0.4773, "step": 1147 }, { "epoch": 0.68, "grad_norm": 0.17697981019548276, "learning_rate": 0.0001480336181564551, "loss": 0.4586, "step": 1148 }, { "epoch": 0.69, "grad_norm": 0.2051551821180113, "learning_rate": 0.0001479511480499255, "loss": 0.5141, "step": 1149 }, { "epoch": 0.69, "grad_norm": 0.19457252340787579, "learning_rate": 0.00014786863557072582, "loss": 0.4769, "step": 1150 }, { "epoch": 0.69, "grad_norm": 0.20436510440947783, "learning_rate": 0.00014778608079176923, "loss": 0.5556, "step": 1151 }, { "epoch": 0.69, "grad_norm": 0.18295670635083391, "learning_rate": 0.00014770348378600646, "loss": 0.5063, "step": 1152 }, { "epoch": 0.69, "grad_norm": 0.18628367387903996, "learning_rate": 0.00014762084462642539, "loss": 0.5427, "step": 1153 }, { "epoch": 0.69, "grad_norm": 0.2161510404034079, "learning_rate": 0.00014753816338605123, "loss": 0.4854, "step": 1154 }, { "epoch": 0.69, "grad_norm": 0.17721958276959857, "learning_rate": 0.00014745544013794636, "loss": 0.5226, "step": 1155 }, { "epoch": 0.69, "grad_norm": 0.18343920037243416, "learning_rate": 0.0001473726749552103, "loss": 0.5187, "step": 1156 }, { "epoch": 0.69, "grad_norm": 0.19071533215976352, "learning_rate": 0.00014728986791097957, "loss": 0.5307, "step": 1157 }, { "epoch": 0.69, "grad_norm": 0.17869129856056046, "learning_rate": 0.00014720701907842772, "loss": 0.4744, "step": 1158 }, { "epoch": 0.69, "grad_norm": 0.18707001538233387, "learning_rate": 0.00014712412853076524, "loss": 0.4709, "step": 1159 }, { "epoch": 0.69, "grad_norm": 0.18697017631013504, "learning_rate": 0.00014704119634123948, "loss": 0.5077, "step": 1160 }, { "epoch": 0.69, "grad_norm": 0.18814502472746178, "learning_rate": 0.00014695822258313455, "loss": 0.5233, "step": 1161 }, { "epoch": 0.69, "grad_norm": 0.2368823091300486, "learning_rate": 0.00014687520732977128, "loss": 0.4966, "step": 1162 }, { "epoch": 0.69, "grad_norm": 0.21080889905859476, "learning_rate": 0.00014679215065450726, "loss": 0.5262, "step": 1163 }, { "epoch": 0.69, "grad_norm": 0.20202781405176504, "learning_rate": 0.0001467090526307366, "loss": 0.547, "step": 1164 }, { "epoch": 0.7, "grad_norm": 0.22096110211580308, "learning_rate": 0.00014662591333189, "loss": 0.5188, "step": 1165 }, { "epoch": 0.7, "grad_norm": 0.20568086982318412, "learning_rate": 0.0001465427328314346, "loss": 0.4716, "step": 1166 }, { "epoch": 0.7, "grad_norm": 0.2001156528680437, "learning_rate": 0.0001464595112028739, "loss": 0.4726, "step": 1167 }, { "epoch": 0.7, "grad_norm": 0.22271237142339692, "learning_rate": 0.0001463762485197479, "loss": 0.5685, "step": 1168 }, { "epoch": 0.7, "grad_norm": 0.20249978518037207, "learning_rate": 0.00014629294485563271, "loss": 0.4992, "step": 1169 }, { "epoch": 0.7, "grad_norm": 0.21051317014195287, "learning_rate": 0.00014620960028414074, "loss": 0.4941, "step": 1170 }, { "epoch": 0.7, "grad_norm": 0.22015178328592389, "learning_rate": 0.0001461262148789205, "loss": 0.5554, "step": 1171 }, { "epoch": 0.7, "grad_norm": 0.1718994350084717, "learning_rate": 0.00014604278871365662, "loss": 0.4964, "step": 1172 }, { "epoch": 0.7, "grad_norm": 0.17185816501519585, "learning_rate": 0.0001459593218620698, "loss": 0.5104, "step": 1173 }, { "epoch": 0.7, "grad_norm": 0.21401829521307225, "learning_rate": 0.0001458758143979166, "loss": 0.499, "step": 1174 }, { "epoch": 0.7, "grad_norm": 0.19776025857516183, "learning_rate": 0.00014579226639498946, "loss": 0.496, "step": 1175 }, { "epoch": 0.7, "grad_norm": 0.1961032370488782, "learning_rate": 0.00014570867792711674, "loss": 0.5107, "step": 1176 }, { "epoch": 0.7, "grad_norm": 0.18716882427415388, "learning_rate": 0.0001456250490681625, "loss": 0.4876, "step": 1177 }, { "epoch": 0.7, "grad_norm": 0.1956152915992498, "learning_rate": 0.00014554137989202643, "loss": 0.5732, "step": 1178 }, { "epoch": 0.7, "grad_norm": 0.19786724370350303, "learning_rate": 0.000145457670472644, "loss": 0.5551, "step": 1179 }, { "epoch": 0.7, "grad_norm": 0.18947724734010707, "learning_rate": 0.00014537392088398608, "loss": 0.4944, "step": 1180 }, { "epoch": 0.7, "grad_norm": 0.2097627269461171, "learning_rate": 0.00014529013120005916, "loss": 0.5383, "step": 1181 }, { "epoch": 0.71, "grad_norm": 0.17430571658619776, "learning_rate": 0.0001452063014949051, "loss": 0.4849, "step": 1182 }, { "epoch": 0.71, "grad_norm": 0.1709299956294579, "learning_rate": 0.0001451224318426011, "loss": 0.4627, "step": 1183 }, { "epoch": 0.71, "grad_norm": 0.17998153878412265, "learning_rate": 0.0001450385223172597, "loss": 0.5102, "step": 1184 }, { "epoch": 0.71, "grad_norm": 0.1973625392843972, "learning_rate": 0.0001449545729930287, "loss": 0.5508, "step": 1185 }, { "epoch": 0.71, "grad_norm": 0.1871194681672037, "learning_rate": 0.00014487058394409104, "loss": 0.5061, "step": 1186 }, { "epoch": 0.71, "grad_norm": 0.1869428596068349, "learning_rate": 0.00014478655524466475, "loss": 0.4707, "step": 1187 }, { "epoch": 0.71, "grad_norm": 0.1847860359839481, "learning_rate": 0.00014470248696900285, "loss": 0.5141, "step": 1188 }, { "epoch": 0.71, "grad_norm": 0.18708736509686738, "learning_rate": 0.00014461837919139348, "loss": 0.5192, "step": 1189 }, { "epoch": 0.71, "grad_norm": 0.22393402471111507, "learning_rate": 0.00014453423198615957, "loss": 0.5192, "step": 1190 }, { "epoch": 0.71, "grad_norm": 0.19155699706033086, "learning_rate": 0.00014445004542765888, "loss": 0.5054, "step": 1191 }, { "epoch": 0.71, "grad_norm": 0.18623804761748072, "learning_rate": 0.00014436581959028405, "loss": 0.5212, "step": 1192 }, { "epoch": 0.71, "grad_norm": 0.18614307922016232, "learning_rate": 0.00014428155454846225, "loss": 0.4667, "step": 1193 }, { "epoch": 0.71, "grad_norm": 0.20760229991014245, "learning_rate": 0.0001441972503766555, "loss": 0.4897, "step": 1194 }, { "epoch": 0.71, "grad_norm": 0.1713367358079391, "learning_rate": 0.00014411290714936033, "loss": 0.5075, "step": 1195 }, { "epoch": 0.71, "grad_norm": 0.18925060252185533, "learning_rate": 0.00014402852494110768, "loss": 0.5185, "step": 1196 }, { "epoch": 0.71, "grad_norm": 0.1845531822650512, "learning_rate": 0.00014394410382646304, "loss": 0.4799, "step": 1197 }, { "epoch": 0.71, "grad_norm": 0.17403525502879638, "learning_rate": 0.00014385964388002623, "loss": 0.4935, "step": 1198 }, { "epoch": 0.72, "grad_norm": 0.1991754972420626, "learning_rate": 0.00014377514517643144, "loss": 0.5419, "step": 1199 }, { "epoch": 0.72, "grad_norm": 0.17030413694508353, "learning_rate": 0.00014369060779034708, "loss": 0.4907, "step": 1200 }, { "epoch": 0.72, "grad_norm": 0.17984445026847887, "learning_rate": 0.00014360603179647567, "loss": 0.5024, "step": 1201 }, { "epoch": 0.72, "grad_norm": 0.1782253350761795, "learning_rate": 0.000143521417269554, "loss": 0.4629, "step": 1202 }, { "epoch": 0.72, "grad_norm": 0.18078126508901013, "learning_rate": 0.00014343676428435275, "loss": 0.4884, "step": 1203 }, { "epoch": 0.72, "grad_norm": 0.17808414211423282, "learning_rate": 0.0001433520729156767, "loss": 0.4884, "step": 1204 }, { "epoch": 0.72, "grad_norm": 0.19634028134840262, "learning_rate": 0.0001432673432383645, "loss": 0.5253, "step": 1205 }, { "epoch": 0.72, "grad_norm": 0.17516039333071384, "learning_rate": 0.00014318257532728866, "loss": 0.5234, "step": 1206 }, { "epoch": 0.72, "grad_norm": 0.18938507108619154, "learning_rate": 0.0001430977692573554, "loss": 0.4569, "step": 1207 }, { "epoch": 0.72, "grad_norm": 0.18529885163701465, "learning_rate": 0.00014301292510350485, "loss": 0.5588, "step": 1208 }, { "epoch": 0.72, "grad_norm": 0.1980418349421889, "learning_rate": 0.0001429280429407106, "loss": 0.5567, "step": 1209 }, { "epoch": 0.72, "grad_norm": 0.18102359715715138, "learning_rate": 0.00014284312284397994, "loss": 0.5346, "step": 1210 }, { "epoch": 0.72, "grad_norm": 0.1682936887439461, "learning_rate": 0.00014275816488835364, "loss": 0.4636, "step": 1211 }, { "epoch": 0.72, "grad_norm": 0.1800173543128752, "learning_rate": 0.00014267316914890583, "loss": 0.5273, "step": 1212 }, { "epoch": 0.72, "grad_norm": 0.18733177044546032, "learning_rate": 0.00014258813570074429, "loss": 0.5375, "step": 1213 }, { "epoch": 0.72, "grad_norm": 0.1748926227325175, "learning_rate": 0.00014250306461900984, "loss": 0.5167, "step": 1214 }, { "epoch": 0.72, "grad_norm": 0.18731958978952354, "learning_rate": 0.00014241795597887675, "loss": 0.4931, "step": 1215 }, { "epoch": 0.73, "grad_norm": 0.1826531388805938, "learning_rate": 0.00014233280985555234, "loss": 0.5561, "step": 1216 }, { "epoch": 0.73, "grad_norm": 0.17687603086969267, "learning_rate": 0.00014224762632427713, "loss": 0.4916, "step": 1217 }, { "epoch": 0.73, "grad_norm": 0.18307772952489884, "learning_rate": 0.0001421624054603247, "loss": 0.4885, "step": 1218 }, { "epoch": 0.73, "grad_norm": 0.17499886290677696, "learning_rate": 0.00014207714733900162, "loss": 0.472, "step": 1219 }, { "epoch": 0.73, "grad_norm": 0.21149209308036526, "learning_rate": 0.00014199185203564728, "loss": 0.5162, "step": 1220 }, { "epoch": 0.73, "grad_norm": 0.18757068290035836, "learning_rate": 0.00014190651962563407, "loss": 0.5057, "step": 1221 }, { "epoch": 0.73, "grad_norm": 0.1968053254662056, "learning_rate": 0.00014182115018436715, "loss": 0.4965, "step": 1222 }, { "epoch": 0.73, "grad_norm": 0.1832324100206531, "learning_rate": 0.0001417357437872843, "loss": 0.5297, "step": 1223 }, { "epoch": 0.73, "grad_norm": 0.1947511964403348, "learning_rate": 0.00014165030050985604, "loss": 0.5108, "step": 1224 }, { "epoch": 0.73, "grad_norm": 0.1887394891277499, "learning_rate": 0.00014156482042758544, "loss": 0.5378, "step": 1225 }, { "epoch": 0.73, "grad_norm": 0.1948370350677339, "learning_rate": 0.0001414793036160081, "loss": 0.5031, "step": 1226 }, { "epoch": 0.73, "grad_norm": 0.1876481087251801, "learning_rate": 0.00014139375015069215, "loss": 0.5245, "step": 1227 }, { "epoch": 0.73, "grad_norm": 0.17100568796101634, "learning_rate": 0.00014130816010723805, "loss": 0.474, "step": 1228 }, { "epoch": 0.73, "grad_norm": 0.18845060671638836, "learning_rate": 0.0001412225335612785, "loss": 0.4944, "step": 1229 }, { "epoch": 0.73, "grad_norm": 0.19905303803856061, "learning_rate": 0.00014113687058847857, "loss": 0.5389, "step": 1230 }, { "epoch": 0.73, "grad_norm": 0.18736361174530702, "learning_rate": 0.00014105117126453554, "loss": 0.5128, "step": 1231 }, { "epoch": 0.74, "grad_norm": 0.18113672252747182, "learning_rate": 0.00014096543566517871, "loss": 0.5077, "step": 1232 }, { "epoch": 0.74, "grad_norm": 0.18697696198984598, "learning_rate": 0.00014087966386616945, "loss": 0.5083, "step": 1233 }, { "epoch": 0.74, "grad_norm": 0.18321266285619817, "learning_rate": 0.00014079385594330121, "loss": 0.501, "step": 1234 }, { "epoch": 0.74, "grad_norm": 0.2085760786366843, "learning_rate": 0.00014070801197239928, "loss": 0.5337, "step": 1235 }, { "epoch": 0.74, "grad_norm": 0.20073471496417797, "learning_rate": 0.00014062213202932085, "loss": 0.5354, "step": 1236 }, { "epoch": 0.74, "grad_norm": 0.17820357534513603, "learning_rate": 0.00014053621618995488, "loss": 0.4728, "step": 1237 }, { "epoch": 0.74, "grad_norm": 0.18692997172911976, "learning_rate": 0.00014045026453022197, "loss": 0.5461, "step": 1238 }, { "epoch": 0.74, "grad_norm": 0.2216376697939887, "learning_rate": 0.00014036427712607453, "loss": 0.5099, "step": 1239 }, { "epoch": 0.74, "grad_norm": 0.19522613324619895, "learning_rate": 0.00014027825405349642, "loss": 0.5257, "step": 1240 }, { "epoch": 0.74, "grad_norm": 0.23644520056181112, "learning_rate": 0.0001401921953885031, "loss": 0.5637, "step": 1241 }, { "epoch": 0.74, "grad_norm": 0.1850220175837294, "learning_rate": 0.00014010610120714147, "loss": 0.5299, "step": 1242 }, { "epoch": 0.74, "grad_norm": 0.17697459178123112, "learning_rate": 0.00014001997158548973, "loss": 0.4723, "step": 1243 }, { "epoch": 0.74, "grad_norm": 0.20359077418663543, "learning_rate": 0.00013993380659965755, "loss": 0.5546, "step": 1244 }, { "epoch": 0.74, "grad_norm": 0.1875137496084646, "learning_rate": 0.00013984760632578577, "loss": 0.5747, "step": 1245 }, { "epoch": 0.74, "grad_norm": 0.21313947649725407, "learning_rate": 0.00013976137084004633, "loss": 0.5321, "step": 1246 }, { "epoch": 0.74, "grad_norm": 0.17217418667779574, "learning_rate": 0.0001396751002186424, "loss": 0.4719, "step": 1247 }, { "epoch": 0.74, "grad_norm": 0.18863205998380614, "learning_rate": 0.00013958879453780817, "loss": 0.5627, "step": 1248 }, { "epoch": 0.75, "grad_norm": 0.17087031621768445, "learning_rate": 0.00013950245387380882, "loss": 0.4626, "step": 1249 }, { "epoch": 0.75, "grad_norm": 0.17677171504664238, "learning_rate": 0.00013941607830294042, "loss": 0.4593, "step": 1250 }, { "epoch": 0.75, "grad_norm": 0.19402672015245978, "learning_rate": 0.00013932966790152987, "loss": 0.5231, "step": 1251 }, { "epoch": 0.75, "grad_norm": 0.2008980487940075, "learning_rate": 0.00013924322274593486, "loss": 0.5223, "step": 1252 }, { "epoch": 0.75, "grad_norm": 0.19654503986426297, "learning_rate": 0.00013915674291254383, "loss": 0.5489, "step": 1253 }, { "epoch": 0.75, "grad_norm": 0.18755885764625013, "learning_rate": 0.00013907022847777585, "loss": 0.5124, "step": 1254 }, { "epoch": 0.75, "grad_norm": 0.16715913531165327, "learning_rate": 0.00013898367951808052, "loss": 0.4741, "step": 1255 }, { "epoch": 0.75, "grad_norm": 0.18224361618494386, "learning_rate": 0.000138897096109938, "loss": 0.4852, "step": 1256 }, { "epoch": 0.75, "grad_norm": 0.17309322546124714, "learning_rate": 0.00013881047832985886, "loss": 0.4782, "step": 1257 }, { "epoch": 0.75, "grad_norm": 0.17208927952188605, "learning_rate": 0.00013872382625438405, "loss": 0.5383, "step": 1258 }, { "epoch": 0.75, "grad_norm": 0.18195195367386732, "learning_rate": 0.00013863713996008483, "loss": 0.4793, "step": 1259 }, { "epoch": 0.75, "grad_norm": 0.1869015589060158, "learning_rate": 0.00013855041952356273, "loss": 0.4918, "step": 1260 }, { "epoch": 0.75, "grad_norm": 0.22970064407630242, "learning_rate": 0.00013846366502144936, "loss": 0.5468, "step": 1261 }, { "epoch": 0.75, "grad_norm": 0.18223297061392882, "learning_rate": 0.00013837687653040653, "loss": 0.4836, "step": 1262 }, { "epoch": 0.75, "grad_norm": 0.1811453560251677, "learning_rate": 0.00013829005412712607, "loss": 0.4799, "step": 1263 }, { "epoch": 0.75, "grad_norm": 0.19531829020393465, "learning_rate": 0.00013820319788832968, "loss": 0.4886, "step": 1264 }, { "epoch": 0.75, "grad_norm": 0.1873613837354565, "learning_rate": 0.0001381163078907691, "loss": 0.5082, "step": 1265 }, { "epoch": 0.76, "grad_norm": 0.2725120336639838, "learning_rate": 0.0001380293842112258, "loss": 0.5434, "step": 1266 }, { "epoch": 0.76, "grad_norm": 0.1844679328183984, "learning_rate": 0.00013794242692651102, "loss": 0.51, "step": 1267 }, { "epoch": 0.76, "grad_norm": 0.188306631118848, "learning_rate": 0.00013785543611346578, "loss": 0.4763, "step": 1268 }, { "epoch": 0.76, "grad_norm": 0.19531189056502454, "learning_rate": 0.00013776841184896064, "loss": 0.5165, "step": 1269 }, { "epoch": 0.76, "grad_norm": 0.20264440596591932, "learning_rate": 0.00013768135420989577, "loss": 0.5387, "step": 1270 }, { "epoch": 0.76, "grad_norm": 0.2084557864213027, "learning_rate": 0.00013759426327320074, "loss": 0.5763, "step": 1271 }, { "epoch": 0.76, "grad_norm": 0.20500727562381132, "learning_rate": 0.0001375071391158347, "loss": 0.5333, "step": 1272 }, { "epoch": 0.76, "grad_norm": 0.18316050604278142, "learning_rate": 0.00013741998181478603, "loss": 0.4955, "step": 1273 }, { "epoch": 0.76, "grad_norm": 0.20590646162784712, "learning_rate": 0.00013733279144707245, "loss": 0.519, "step": 1274 }, { "epoch": 0.76, "grad_norm": 0.17998508710498043, "learning_rate": 0.00013724556808974086, "loss": 0.4866, "step": 1275 }, { "epoch": 0.76, "grad_norm": 0.20092486892395545, "learning_rate": 0.0001371583118198674, "loss": 0.5918, "step": 1276 }, { "epoch": 0.76, "grad_norm": 0.1910150922839534, "learning_rate": 0.0001370710227145572, "loss": 0.5362, "step": 1277 }, { "epoch": 0.76, "grad_norm": 0.18781648781714727, "learning_rate": 0.00013698370085094442, "loss": 0.5094, "step": 1278 }, { "epoch": 0.76, "grad_norm": 0.16979981901477645, "learning_rate": 0.0001368963463061922, "loss": 0.4665, "step": 1279 }, { "epoch": 0.76, "grad_norm": 0.17477774487542888, "learning_rate": 0.0001368089591574926, "loss": 0.5096, "step": 1280 }, { "epoch": 0.76, "grad_norm": 0.18366095693364085, "learning_rate": 0.00013672153948206635, "loss": 0.4843, "step": 1281 }, { "epoch": 0.76, "grad_norm": 0.17419423552171015, "learning_rate": 0.00013663408735716307, "loss": 0.4886, "step": 1282 }, { "epoch": 0.77, "grad_norm": 0.18711956168861, "learning_rate": 0.00013654660286006095, "loss": 0.5093, "step": 1283 }, { "epoch": 0.77, "grad_norm": 0.17966945439077842, "learning_rate": 0.0001364590860680669, "loss": 0.5146, "step": 1284 }, { "epoch": 0.77, "grad_norm": 0.1762512874090438, "learning_rate": 0.00013637153705851616, "loss": 0.4863, "step": 1285 }, { "epoch": 0.77, "grad_norm": 0.1851970347695891, "learning_rate": 0.00013628395590877277, "loss": 0.5562, "step": 1286 }, { "epoch": 0.77, "grad_norm": 0.19299530099729093, "learning_rate": 0.00013619634269622884, "loss": 0.4686, "step": 1287 }, { "epoch": 0.77, "grad_norm": 0.19014321879289434, "learning_rate": 0.00013610869749830498, "loss": 0.5187, "step": 1288 }, { "epoch": 0.77, "grad_norm": 0.1851421397455001, "learning_rate": 0.00013602102039245002, "loss": 0.5738, "step": 1289 }, { "epoch": 0.77, "grad_norm": 0.18412094053224104, "learning_rate": 0.00013593331145614104, "loss": 0.5022, "step": 1290 }, { "epoch": 0.77, "grad_norm": 0.17961356701720557, "learning_rate": 0.00013584557076688322, "loss": 0.5351, "step": 1291 }, { "epoch": 0.77, "grad_norm": 0.1705319278777615, "learning_rate": 0.00013575779840220976, "loss": 0.5084, "step": 1292 }, { "epoch": 0.77, "grad_norm": 0.18019495479729933, "learning_rate": 0.00013566999443968185, "loss": 0.5236, "step": 1293 }, { "epoch": 0.77, "grad_norm": 0.17974898003188056, "learning_rate": 0.00013558215895688867, "loss": 0.542, "step": 1294 }, { "epoch": 0.77, "grad_norm": 0.1797247167748882, "learning_rate": 0.00013549429203144723, "loss": 0.5077, "step": 1295 }, { "epoch": 0.77, "grad_norm": 0.1850550416693565, "learning_rate": 0.00013540639374100226, "loss": 0.5396, "step": 1296 }, { "epoch": 0.77, "grad_norm": 0.1733382663357764, "learning_rate": 0.00013531846416322627, "loss": 0.5239, "step": 1297 }, { "epoch": 0.77, "grad_norm": 0.17977034098323824, "learning_rate": 0.00013523050337581943, "loss": 0.4717, "step": 1298 }, { "epoch": 0.78, "grad_norm": 0.18483046559711522, "learning_rate": 0.0001351425114565094, "loss": 0.5378, "step": 1299 }, { "epoch": 0.78, "grad_norm": 0.17834941323623535, "learning_rate": 0.0001350544884830515, "loss": 0.5662, "step": 1300 }, { "epoch": 0.78, "grad_norm": 0.18309245782790126, "learning_rate": 0.00013496643453322828, "loss": 0.5812, "step": 1301 }, { "epoch": 0.78, "grad_norm": 0.19808755077064702, "learning_rate": 0.0001348783496848499, "loss": 0.5721, "step": 1302 }, { "epoch": 0.78, "grad_norm": 0.18923943372482865, "learning_rate": 0.00013479023401575366, "loss": 0.5383, "step": 1303 }, { "epoch": 0.78, "grad_norm": 0.20010184755781324, "learning_rate": 0.00013470208760380412, "loss": 0.4803, "step": 1304 }, { "epoch": 0.78, "grad_norm": 0.2759865843375521, "learning_rate": 0.0001346139105268931, "loss": 0.5583, "step": 1305 }, { "epoch": 0.78, "grad_norm": 0.17317696637576427, "learning_rate": 0.00013452570286293938, "loss": 0.4797, "step": 1306 }, { "epoch": 0.78, "grad_norm": 0.1915090235119688, "learning_rate": 0.00013443746468988884, "loss": 0.467, "step": 1307 }, { "epoch": 0.78, "grad_norm": 0.20256119138747514, "learning_rate": 0.00013434919608571437, "loss": 0.5311, "step": 1308 }, { "epoch": 0.78, "grad_norm": 0.17993484843579866, "learning_rate": 0.00013426089712841564, "loss": 0.4832, "step": 1309 }, { "epoch": 0.78, "grad_norm": 0.18038829097640488, "learning_rate": 0.00013417256789601925, "loss": 0.5688, "step": 1310 }, { "epoch": 0.78, "grad_norm": 0.19541363045484325, "learning_rate": 0.00013408420846657844, "loss": 0.5244, "step": 1311 }, { "epoch": 0.78, "grad_norm": 0.19899174638442974, "learning_rate": 0.00013399581891817324, "loss": 0.5588, "step": 1312 }, { "epoch": 0.78, "grad_norm": 0.1882127221313746, "learning_rate": 0.00013390739932891022, "loss": 0.4932, "step": 1313 }, { "epoch": 0.78, "grad_norm": 0.1823628172344837, "learning_rate": 0.00013381894977692257, "loss": 0.5594, "step": 1314 }, { "epoch": 0.78, "grad_norm": 0.17992657977066912, "learning_rate": 0.00013373047034036988, "loss": 0.5509, "step": 1315 }, { "epoch": 0.79, "grad_norm": 0.18381185146899134, "learning_rate": 0.0001336419610974382, "loss": 0.5209, "step": 1316 }, { "epoch": 0.79, "grad_norm": 0.16713876415743062, "learning_rate": 0.00013355342212633986, "loss": 0.4726, "step": 1317 }, { "epoch": 0.79, "grad_norm": 0.1945012456185686, "learning_rate": 0.0001334648535053136, "loss": 0.5733, "step": 1318 }, { "epoch": 0.79, "grad_norm": 0.1832463509617787, "learning_rate": 0.00013337625531262414, "loss": 0.5257, "step": 1319 }, { "epoch": 0.79, "grad_norm": 0.17703976530840718, "learning_rate": 0.0001332876276265625, "loss": 0.4854, "step": 1320 }, { "epoch": 0.79, "grad_norm": 0.1857229688186767, "learning_rate": 0.00013319897052544577, "loss": 0.4803, "step": 1321 }, { "epoch": 0.79, "grad_norm": 0.18149715939246025, "learning_rate": 0.00013311028408761688, "loss": 0.4886, "step": 1322 }, { "epoch": 0.79, "grad_norm": 0.18600803718548015, "learning_rate": 0.00013302156839144484, "loss": 0.5098, "step": 1323 }, { "epoch": 0.79, "grad_norm": 0.18184709807791666, "learning_rate": 0.00013293282351532442, "loss": 0.5253, "step": 1324 }, { "epoch": 0.79, "grad_norm": 0.17926319053458772, "learning_rate": 0.00013284404953767625, "loss": 0.5112, "step": 1325 }, { "epoch": 0.79, "grad_norm": 0.18473321145802576, "learning_rate": 0.00013275524653694665, "loss": 0.5257, "step": 1326 }, { "epoch": 0.79, "grad_norm": 0.17432084959871605, "learning_rate": 0.00013266641459160753, "loss": 0.4819, "step": 1327 }, { "epoch": 0.79, "grad_norm": 0.2136616093736039, "learning_rate": 0.0001325775537801564, "loss": 0.5289, "step": 1328 }, { "epoch": 0.79, "grad_norm": 0.18909582251032753, "learning_rate": 0.00013248866418111635, "loss": 0.5369, "step": 1329 }, { "epoch": 0.79, "grad_norm": 0.1915972979886006, "learning_rate": 0.00013239974587303584, "loss": 0.5926, "step": 1330 }, { "epoch": 0.79, "grad_norm": 0.18549724952330918, "learning_rate": 0.00013231079893448873, "loss": 0.4909, "step": 1331 }, { "epoch": 0.79, "grad_norm": 0.1986357464763957, "learning_rate": 0.00013222182344407415, "loss": 0.5257, "step": 1332 }, { "epoch": 0.8, "grad_norm": 0.1649013077802235, "learning_rate": 0.00013213281948041647, "loss": 0.4578, "step": 1333 }, { "epoch": 0.8, "grad_norm": 0.24324520525042556, "learning_rate": 0.0001320437871221652, "loss": 0.5119, "step": 1334 }, { "epoch": 0.8, "grad_norm": 0.19388615550793745, "learning_rate": 0.00013195472644799504, "loss": 0.4655, "step": 1335 }, { "epoch": 0.8, "grad_norm": 0.16279182912251683, "learning_rate": 0.00013186563753660562, "loss": 0.4868, "step": 1336 }, { "epoch": 0.8, "grad_norm": 0.17463318249809762, "learning_rate": 0.0001317765204667215, "loss": 0.5026, "step": 1337 }, { "epoch": 0.8, "grad_norm": 0.17843994655748693, "learning_rate": 0.0001316873753170922, "loss": 0.4725, "step": 1338 }, { "epoch": 0.8, "grad_norm": 0.17705143987562852, "learning_rate": 0.00013159820216649198, "loss": 0.4835, "step": 1339 }, { "epoch": 0.8, "grad_norm": 0.2071900675528722, "learning_rate": 0.00013150900109371998, "loss": 0.5452, "step": 1340 }, { "epoch": 0.8, "grad_norm": 0.1855602116667224, "learning_rate": 0.00013141977217759977, "loss": 0.5166, "step": 1341 }, { "epoch": 0.8, "grad_norm": 0.19368584941752956, "learning_rate": 0.00013133051549697977, "loss": 0.5258, "step": 1342 }, { "epoch": 0.8, "grad_norm": 0.19453794357182327, "learning_rate": 0.00013124123113073278, "loss": 0.5039, "step": 1343 }, { "epoch": 0.8, "grad_norm": 0.16449765544951725, "learning_rate": 0.0001311519191577562, "loss": 0.4935, "step": 1344 }, { "epoch": 0.8, "grad_norm": 0.1817628748335402, "learning_rate": 0.0001310625796569717, "loss": 0.4937, "step": 1345 }, { "epoch": 0.8, "grad_norm": 0.2142696862792987, "learning_rate": 0.00013097321270732524, "loss": 0.5004, "step": 1346 }, { "epoch": 0.8, "grad_norm": 0.2076553608561363, "learning_rate": 0.0001308838183877872, "loss": 0.499, "step": 1347 }, { "epoch": 0.8, "grad_norm": 0.1694419193937227, "learning_rate": 0.00013079439677735207, "loss": 0.477, "step": 1348 }, { "epoch": 0.8, "grad_norm": 0.20906856602739568, "learning_rate": 0.0001307049479550384, "loss": 0.5227, "step": 1349 }, { "epoch": 0.81, "grad_norm": 0.17644464533212287, "learning_rate": 0.00013061547199988885, "loss": 0.5636, "step": 1350 }, { "epoch": 0.81, "grad_norm": 0.18582243028910475, "learning_rate": 0.00013052596899097005, "loss": 0.4588, "step": 1351 }, { "epoch": 0.81, "grad_norm": 0.19864208660134122, "learning_rate": 0.0001304364390073725, "loss": 0.5367, "step": 1352 }, { "epoch": 0.81, "grad_norm": 0.2025705912016794, "learning_rate": 0.00013034688212821058, "loss": 0.4989, "step": 1353 }, { "epoch": 0.81, "grad_norm": 0.1850384411891724, "learning_rate": 0.00013025729843262241, "loss": 0.4829, "step": 1354 }, { "epoch": 0.81, "grad_norm": 0.18112685382941437, "learning_rate": 0.00013016768799976983, "loss": 0.501, "step": 1355 }, { "epoch": 0.81, "grad_norm": 0.19083770143909534, "learning_rate": 0.00013007805090883826, "loss": 0.4693, "step": 1356 }, { "epoch": 0.81, "grad_norm": 0.1741168713346763, "learning_rate": 0.00012998838723903675, "loss": 0.5051, "step": 1357 }, { "epoch": 0.81, "grad_norm": 0.17998449629698884, "learning_rate": 0.00012989869706959777, "loss": 0.5192, "step": 1358 }, { "epoch": 0.81, "grad_norm": 0.22733876158298189, "learning_rate": 0.0001298089804797772, "loss": 0.5699, "step": 1359 }, { "epoch": 0.81, "grad_norm": 0.18706763134147958, "learning_rate": 0.00012971923754885438, "loss": 0.5352, "step": 1360 }, { "epoch": 0.81, "grad_norm": 0.17137720274694657, "learning_rate": 0.0001296294683561318, "loss": 0.5028, "step": 1361 }, { "epoch": 0.81, "grad_norm": 0.1706735720997184, "learning_rate": 0.00012953967298093513, "loss": 0.5152, "step": 1362 }, { "epoch": 0.81, "grad_norm": 0.1877012462021262, "learning_rate": 0.00012944985150261341, "loss": 0.4895, "step": 1363 }, { "epoch": 0.81, "grad_norm": 0.18521525521633062, "learning_rate": 0.00012936000400053845, "loss": 0.5483, "step": 1364 }, { "epoch": 0.81, "grad_norm": 0.175077365429483, "learning_rate": 0.00012927013055410522, "loss": 0.5405, "step": 1365 }, { "epoch": 0.82, "grad_norm": 0.2453533595215046, "learning_rate": 0.00012918023124273165, "loss": 0.5118, "step": 1366 }, { "epoch": 0.82, "grad_norm": 0.20592576443274452, "learning_rate": 0.00012909030614585836, "loss": 0.5008, "step": 1367 }, { "epoch": 0.82, "grad_norm": 0.19466804271674717, "learning_rate": 0.00012900035534294893, "loss": 0.4973, "step": 1368 }, { "epoch": 0.82, "grad_norm": 0.18407694211273226, "learning_rate": 0.00012891037891348957, "loss": 0.4928, "step": 1369 }, { "epoch": 0.82, "grad_norm": 0.1944897009288739, "learning_rate": 0.00012882037693698917, "loss": 0.5042, "step": 1370 }, { "epoch": 0.82, "grad_norm": 0.1876729080378583, "learning_rate": 0.00012873034949297912, "loss": 0.5565, "step": 1371 }, { "epoch": 0.82, "grad_norm": 0.18740458209713104, "learning_rate": 0.0001286402966610134, "loss": 0.4912, "step": 1372 }, { "epoch": 0.82, "grad_norm": 0.17895655124360504, "learning_rate": 0.00012855021852066842, "loss": 0.4735, "step": 1373 }, { "epoch": 0.82, "grad_norm": 0.21851593513203868, "learning_rate": 0.00012846011515154287, "loss": 0.5548, "step": 1374 }, { "epoch": 0.82, "grad_norm": 0.19220669747584535, "learning_rate": 0.00012836998663325782, "loss": 0.5339, "step": 1375 }, { "epoch": 0.82, "grad_norm": 0.1877766868834106, "learning_rate": 0.00012827983304545656, "loss": 0.5379, "step": 1376 }, { "epoch": 0.82, "grad_norm": 0.1781170900201972, "learning_rate": 0.00012818965446780448, "loss": 0.4614, "step": 1377 }, { "epoch": 0.82, "grad_norm": 0.17271334480103068, "learning_rate": 0.00012809945097998907, "loss": 0.4667, "step": 1378 }, { "epoch": 0.82, "grad_norm": 0.19192709550266393, "learning_rate": 0.00012800922266171987, "loss": 0.539, "step": 1379 }, { "epoch": 0.82, "grad_norm": 0.18347756297917528, "learning_rate": 0.0001279189695927283, "loss": 0.4869, "step": 1380 }, { "epoch": 0.82, "grad_norm": 0.18435293599254082, "learning_rate": 0.0001278286918527677, "loss": 0.5142, "step": 1381 }, { "epoch": 0.82, "grad_norm": 0.17568604254392264, "learning_rate": 0.00012773838952161322, "loss": 0.4944, "step": 1382 }, { "epoch": 0.83, "grad_norm": 0.19863759493863561, "learning_rate": 0.0001276480626790617, "loss": 0.5071, "step": 1383 }, { "epoch": 0.83, "grad_norm": 0.18656826295630696, "learning_rate": 0.00012755771140493167, "loss": 0.4743, "step": 1384 }, { "epoch": 0.83, "grad_norm": 0.17629250804517016, "learning_rate": 0.0001274673357790632, "loss": 0.5198, "step": 1385 }, { "epoch": 0.83, "grad_norm": 0.17537444688807757, "learning_rate": 0.00012737693588131793, "loss": 0.49, "step": 1386 }, { "epoch": 0.83, "grad_norm": 0.1739583842649344, "learning_rate": 0.00012728651179157895, "loss": 0.4826, "step": 1387 }, { "epoch": 0.83, "grad_norm": 0.19774251629485748, "learning_rate": 0.00012719606358975073, "loss": 0.5571, "step": 1388 }, { "epoch": 0.83, "grad_norm": 0.17736202390835257, "learning_rate": 0.00012710559135575895, "loss": 0.4928, "step": 1389 }, { "epoch": 0.83, "grad_norm": 0.18721704864956284, "learning_rate": 0.00012701509516955067, "loss": 0.4798, "step": 1390 }, { "epoch": 0.83, "grad_norm": 0.19216069335454633, "learning_rate": 0.00012692457511109402, "loss": 0.5268, "step": 1391 }, { "epoch": 0.83, "grad_norm": 0.20170703615044985, "learning_rate": 0.00012683403126037825, "loss": 0.5556, "step": 1392 }, { "epoch": 0.83, "grad_norm": 0.17163471940645972, "learning_rate": 0.00012674346369741365, "loss": 0.5224, "step": 1393 }, { "epoch": 0.83, "grad_norm": 0.168318318516553, "learning_rate": 0.0001266528725022315, "loss": 0.5071, "step": 1394 }, { "epoch": 0.83, "grad_norm": 0.17605642876202626, "learning_rate": 0.00012656225775488383, "loss": 0.5356, "step": 1395 }, { "epoch": 0.83, "grad_norm": 0.17094228936642658, "learning_rate": 0.0001264716195354436, "loss": 0.4705, "step": 1396 }, { "epoch": 0.83, "grad_norm": 0.17472714805803075, "learning_rate": 0.00012638095792400452, "loss": 0.4903, "step": 1397 }, { "epoch": 0.83, "grad_norm": 0.17866538442934543, "learning_rate": 0.00012629027300068088, "loss": 0.5151, "step": 1398 }, { "epoch": 0.83, "grad_norm": 0.16752002793148427, "learning_rate": 0.0001261995648456076, "loss": 0.5094, "step": 1399 }, { "epoch": 0.84, "grad_norm": 0.18374616312572198, "learning_rate": 0.00012610883353894026, "loss": 0.4547, "step": 1400 }, { "epoch": 0.84, "grad_norm": 0.17615421249336274, "learning_rate": 0.00012601807916085461, "loss": 0.5194, "step": 1401 }, { "epoch": 0.84, "grad_norm": 0.17640888040569758, "learning_rate": 0.00012592730179154712, "loss": 0.4926, "step": 1402 }, { "epoch": 0.84, "grad_norm": 0.17818443729463354, "learning_rate": 0.0001258365015112344, "loss": 0.5138, "step": 1403 }, { "epoch": 0.84, "grad_norm": 0.18271989198154837, "learning_rate": 0.00012574567840015324, "loss": 0.5012, "step": 1404 }, { "epoch": 0.84, "grad_norm": 0.1828993894744449, "learning_rate": 0.00012565483253856071, "loss": 0.4917, "step": 1405 }, { "epoch": 0.84, "grad_norm": 0.17537537909673637, "learning_rate": 0.00012556396400673403, "loss": 0.4856, "step": 1406 }, { "epoch": 0.84, "grad_norm": 0.1791770204684942, "learning_rate": 0.00012547307288497035, "loss": 0.4993, "step": 1407 }, { "epoch": 0.84, "grad_norm": 0.1926761899524845, "learning_rate": 0.00012538215925358688, "loss": 0.532, "step": 1408 }, { "epoch": 0.84, "grad_norm": 0.1641415902243254, "learning_rate": 0.00012529122319292053, "loss": 0.4994, "step": 1409 }, { "epoch": 0.84, "grad_norm": 0.18035208365167332, "learning_rate": 0.00012520026478332822, "loss": 0.457, "step": 1410 }, { "epoch": 0.84, "grad_norm": 0.1767179210201008, "learning_rate": 0.00012510928410518663, "loss": 0.515, "step": 1411 }, { "epoch": 0.84, "grad_norm": 0.16556221078273253, "learning_rate": 0.00012501828123889194, "loss": 0.4578, "step": 1412 }, { "epoch": 0.84, "grad_norm": 0.17248580048933712, "learning_rate": 0.00012492725626486013, "loss": 0.4818, "step": 1413 }, { "epoch": 0.84, "grad_norm": 0.17049351620509895, "learning_rate": 0.00012483620926352656, "loss": 0.4889, "step": 1414 }, { "epoch": 0.84, "grad_norm": 0.2445103047102675, "learning_rate": 0.00012474514031534617, "loss": 0.543, "step": 1415 }, { "epoch": 0.84, "grad_norm": 0.1832392790663422, "learning_rate": 0.00012465404950079325, "loss": 0.5609, "step": 1416 }, { "epoch": 0.85, "grad_norm": 0.18408303780573687, "learning_rate": 0.00012456293690036135, "loss": 0.489, "step": 1417 }, { "epoch": 0.85, "grad_norm": 0.18185382443521964, "learning_rate": 0.00012447180259456342, "loss": 0.5156, "step": 1418 }, { "epoch": 0.85, "grad_norm": 0.19471472250974153, "learning_rate": 0.00012438064666393144, "loss": 0.493, "step": 1419 }, { "epoch": 0.85, "grad_norm": 0.19996647014153662, "learning_rate": 0.00012428946918901655, "loss": 0.5355, "step": 1420 }, { "epoch": 0.85, "grad_norm": 0.17841905649814713, "learning_rate": 0.00012419827025038905, "loss": 0.5193, "step": 1421 }, { "epoch": 0.85, "grad_norm": 0.23618023782431, "learning_rate": 0.00012410704992863792, "loss": 0.5052, "step": 1422 }, { "epoch": 0.85, "grad_norm": 0.19206431381415717, "learning_rate": 0.00012401580830437135, "loss": 0.5061, "step": 1423 }, { "epoch": 0.85, "grad_norm": 0.18232605343034466, "learning_rate": 0.0001239245454582162, "loss": 0.4726, "step": 1424 }, { "epoch": 0.85, "grad_norm": 0.1667177719369294, "learning_rate": 0.000123833261470818, "loss": 0.4584, "step": 1425 }, { "epoch": 0.85, "grad_norm": 0.18664991662366204, "learning_rate": 0.0001237419564228412, "loss": 0.4984, "step": 1426 }, { "epoch": 0.85, "grad_norm": 0.18842903468293132, "learning_rate": 0.00012365063039496862, "loss": 0.5021, "step": 1427 }, { "epoch": 0.85, "grad_norm": 0.17356657187313135, "learning_rate": 0.00012355928346790174, "loss": 0.4677, "step": 1428 }, { "epoch": 0.85, "grad_norm": 0.19172179721922822, "learning_rate": 0.0001234679157223605, "loss": 0.5298, "step": 1429 }, { "epoch": 0.85, "grad_norm": 0.17844753388051934, "learning_rate": 0.00012337652723908325, "loss": 0.5179, "step": 1430 }, { "epoch": 0.85, "grad_norm": 0.17488823189990194, "learning_rate": 0.0001232851180988266, "loss": 0.504, "step": 1431 }, { "epoch": 0.85, "grad_norm": 0.19923950835484613, "learning_rate": 0.00012319368838236547, "loss": 0.4801, "step": 1432 }, { "epoch": 0.86, "grad_norm": 0.18421010540531865, "learning_rate": 0.00012310223817049292, "loss": 0.4874, "step": 1433 }, { "epoch": 0.86, "grad_norm": 0.18280627148010795, "learning_rate": 0.00012301076754402018, "loss": 0.4807, "step": 1434 }, { "epoch": 0.86, "grad_norm": 0.18255969688709006, "learning_rate": 0.00012291927658377648, "loss": 0.5507, "step": 1435 }, { "epoch": 0.86, "grad_norm": 0.19369212033571756, "learning_rate": 0.00012282776537060903, "loss": 0.4807, "step": 1436 }, { "epoch": 0.86, "grad_norm": 0.17045351156650784, "learning_rate": 0.0001227362339853829, "loss": 0.5108, "step": 1437 }, { "epoch": 0.86, "grad_norm": 0.18805439573471683, "learning_rate": 0.000122644682508981, "loss": 0.5024, "step": 1438 }, { "epoch": 0.86, "grad_norm": 0.18557193779276274, "learning_rate": 0.0001225531110223041, "loss": 0.4968, "step": 1439 }, { "epoch": 0.86, "grad_norm": 0.18446256740657319, "learning_rate": 0.00012246151960627053, "loss": 0.5296, "step": 1440 }, { "epoch": 0.86, "grad_norm": 0.1860920392864418, "learning_rate": 0.0001223699083418162, "loss": 0.5143, "step": 1441 }, { "epoch": 0.86, "grad_norm": 0.2044775732039818, "learning_rate": 0.00012227827730989466, "loss": 0.5057, "step": 1442 }, { "epoch": 0.86, "grad_norm": 0.18607693244293935, "learning_rate": 0.00012218662659147693, "loss": 0.5174, "step": 1443 }, { "epoch": 0.86, "grad_norm": 0.18611453439901565, "learning_rate": 0.00012209495626755134, "loss": 0.4989, "step": 1444 }, { "epoch": 0.86, "grad_norm": 0.19583337708028287, "learning_rate": 0.00012200326641912361, "loss": 0.5225, "step": 1445 }, { "epoch": 0.86, "grad_norm": 0.19901404620525662, "learning_rate": 0.00012191155712721667, "loss": 0.5108, "step": 1446 }, { "epoch": 0.86, "grad_norm": 0.2034918139139227, "learning_rate": 0.0001218198284728707, "loss": 0.53, "step": 1447 }, { "epoch": 0.86, "grad_norm": 0.19938382509011504, "learning_rate": 0.00012172808053714292, "loss": 0.4907, "step": 1448 }, { "epoch": 0.86, "grad_norm": 0.1979156919673814, "learning_rate": 0.00012163631340110764, "loss": 0.4977, "step": 1449 }, { "epoch": 0.87, "grad_norm": 0.1773307028514164, "learning_rate": 0.00012154452714585605, "loss": 0.5305, "step": 1450 }, { "epoch": 0.87, "grad_norm": 0.1822927258357729, "learning_rate": 0.00012145272185249634, "loss": 0.4921, "step": 1451 }, { "epoch": 0.87, "grad_norm": 0.19519341132650003, "learning_rate": 0.0001213608976021535, "loss": 0.4735, "step": 1452 }, { "epoch": 0.87, "grad_norm": 0.18840961162083253, "learning_rate": 0.00012126905447596921, "loss": 0.5396, "step": 1453 }, { "epoch": 0.87, "grad_norm": 0.18484188418670194, "learning_rate": 0.00012117719255510188, "loss": 0.4916, "step": 1454 }, { "epoch": 0.87, "grad_norm": 0.1787508136445565, "learning_rate": 0.00012108531192072652, "loss": 0.4667, "step": 1455 }, { "epoch": 0.87, "grad_norm": 0.17143949715387075, "learning_rate": 0.0001209934126540347, "loss": 0.4869, "step": 1456 }, { "epoch": 0.87, "grad_norm": 0.19700823060671513, "learning_rate": 0.00012090149483623438, "loss": 0.4673, "step": 1457 }, { "epoch": 0.87, "grad_norm": 0.18571871176182977, "learning_rate": 0.00012080955854855002, "loss": 0.5284, "step": 1458 }, { "epoch": 0.87, "grad_norm": 0.19439561697487548, "learning_rate": 0.00012071760387222229, "loss": 0.5232, "step": 1459 }, { "epoch": 0.87, "grad_norm": 0.19667065535729555, "learning_rate": 0.0001206256308885082, "loss": 0.5436, "step": 1460 }, { "epoch": 0.87, "grad_norm": 0.18778267851212818, "learning_rate": 0.00012053363967868092, "loss": 0.514, "step": 1461 }, { "epoch": 0.87, "grad_norm": 0.18611375927652626, "learning_rate": 0.00012044163032402965, "loss": 0.4708, "step": 1462 }, { "epoch": 0.87, "grad_norm": 0.18484579524440076, "learning_rate": 0.0001203496029058597, "loss": 0.464, "step": 1463 }, { "epoch": 0.87, "grad_norm": 0.1826656462774754, "learning_rate": 0.00012025755750549233, "loss": 0.4919, "step": 1464 }, { "epoch": 0.87, "grad_norm": 0.18150683810049317, "learning_rate": 0.00012016549420426471, "loss": 0.4936, "step": 1465 }, { "epoch": 0.87, "grad_norm": 0.18124559007552254, "learning_rate": 0.00012007341308352977, "loss": 0.4972, "step": 1466 }, { "epoch": 0.88, "grad_norm": 0.19182333070322963, "learning_rate": 0.00011998131422465621, "loss": 0.4856, "step": 1467 }, { "epoch": 0.88, "grad_norm": 0.178004127767968, "learning_rate": 0.00011988919770902845, "loss": 0.4818, "step": 1468 }, { "epoch": 0.88, "grad_norm": 0.17292199373403655, "learning_rate": 0.00011979706361804644, "loss": 0.4829, "step": 1469 }, { "epoch": 0.88, "grad_norm": 0.18196556074734327, "learning_rate": 0.00011970491203312568, "loss": 0.5003, "step": 1470 }, { "epoch": 0.88, "grad_norm": 0.18754874881279202, "learning_rate": 0.0001196127430356972, "loss": 0.5743, "step": 1471 }, { "epoch": 0.88, "grad_norm": 0.1694899960861821, "learning_rate": 0.00011952055670720732, "loss": 0.4812, "step": 1472 }, { "epoch": 0.88, "grad_norm": 0.17685121597710485, "learning_rate": 0.00011942835312911773, "loss": 0.5106, "step": 1473 }, { "epoch": 0.88, "grad_norm": 0.18468147975405122, "learning_rate": 0.00011933613238290535, "loss": 0.4881, "step": 1474 }, { "epoch": 0.88, "grad_norm": 0.19502651305818872, "learning_rate": 0.00011924389455006226, "loss": 0.5436, "step": 1475 }, { "epoch": 0.88, "grad_norm": 0.1912752770383613, "learning_rate": 0.00011915163971209566, "loss": 0.4975, "step": 1476 }, { "epoch": 0.88, "grad_norm": 0.18045614062303392, "learning_rate": 0.00011905936795052774, "loss": 0.4946, "step": 1477 }, { "epoch": 0.88, "grad_norm": 0.19188311700454608, "learning_rate": 0.0001189670793468957, "loss": 0.5096, "step": 1478 }, { "epoch": 0.88, "grad_norm": 0.17844071972998474, "learning_rate": 0.00011887477398275162, "loss": 0.4932, "step": 1479 }, { "epoch": 0.88, "grad_norm": 0.1825369869650523, "learning_rate": 0.00011878245193966229, "loss": 0.5087, "step": 1480 }, { "epoch": 0.88, "grad_norm": 0.187073417853317, "learning_rate": 0.00011869011329920936, "loss": 0.4916, "step": 1481 }, { "epoch": 0.88, "grad_norm": 0.18205562507913237, "learning_rate": 0.00011859775814298905, "loss": 0.5233, "step": 1482 }, { "epoch": 0.88, "grad_norm": 0.18850193307348725, "learning_rate": 0.0001185053865526123, "loss": 0.509, "step": 1483 }, { "epoch": 0.89, "grad_norm": 0.16598501445752575, "learning_rate": 0.00011841299860970445, "loss": 0.4606, "step": 1484 }, { "epoch": 0.89, "grad_norm": 0.18164697924930495, "learning_rate": 0.00011832059439590533, "loss": 0.5326, "step": 1485 }, { "epoch": 0.89, "grad_norm": 0.1744542009553421, "learning_rate": 0.00011822817399286916, "loss": 0.4806, "step": 1486 }, { "epoch": 0.89, "grad_norm": 0.1866112771984086, "learning_rate": 0.00011813573748226447, "loss": 0.5142, "step": 1487 }, { "epoch": 0.89, "grad_norm": 0.1804334025313157, "learning_rate": 0.00011804328494577402, "loss": 0.5055, "step": 1488 }, { "epoch": 0.89, "grad_norm": 0.1827107380774223, "learning_rate": 0.00011795081646509469, "loss": 0.4705, "step": 1489 }, { "epoch": 0.89, "grad_norm": 0.17671386861052463, "learning_rate": 0.00011785833212193749, "loss": 0.5103, "step": 1490 }, { "epoch": 0.89, "grad_norm": 0.18983228560014775, "learning_rate": 0.00011776583199802746, "loss": 0.5105, "step": 1491 }, { "epoch": 0.89, "grad_norm": 0.17998266735444157, "learning_rate": 0.00011767331617510358, "loss": 0.5207, "step": 1492 }, { "epoch": 0.89, "grad_norm": 0.17711101362820791, "learning_rate": 0.00011758078473491864, "loss": 0.5266, "step": 1493 }, { "epoch": 0.89, "grad_norm": 0.18914416644450918, "learning_rate": 0.00011748823775923934, "loss": 0.5338, "step": 1494 }, { "epoch": 0.89, "grad_norm": 0.18057892909261303, "learning_rate": 0.00011739567532984598, "loss": 0.5549, "step": 1495 }, { "epoch": 0.89, "grad_norm": 0.16973466919637561, "learning_rate": 0.00011730309752853261, "loss": 0.4838, "step": 1496 }, { "epoch": 0.89, "grad_norm": 0.18240256763305998, "learning_rate": 0.00011721050443710688, "loss": 0.4946, "step": 1497 }, { "epoch": 0.89, "grad_norm": 0.18071282928562282, "learning_rate": 0.00011711789613738986, "loss": 0.5128, "step": 1498 }, { "epoch": 0.89, "grad_norm": 0.1764697230658466, "learning_rate": 0.00011702527271121609, "loss": 0.5081, "step": 1499 }, { "epoch": 0.89, "grad_norm": 0.18036733171638797, "learning_rate": 0.00011693263424043353, "loss": 0.492, "step": 1500 }, { "epoch": 0.9, "grad_norm": 0.18783837144683072, "learning_rate": 0.00011683998080690334, "loss": 0.5066, "step": 1501 }, { "epoch": 0.9, "grad_norm": 0.1748875971730362, "learning_rate": 0.00011674731249250008, "loss": 0.5045, "step": 1502 }, { "epoch": 0.9, "grad_norm": 0.18835075966322473, "learning_rate": 0.00011665462937911124, "loss": 0.5153, "step": 1503 }, { "epoch": 0.9, "grad_norm": 0.16522307101295688, "learning_rate": 0.00011656193154863749, "loss": 0.4485, "step": 1504 }, { "epoch": 0.9, "grad_norm": 0.1743041745662128, "learning_rate": 0.00011646921908299254, "loss": 0.4668, "step": 1505 }, { "epoch": 0.9, "grad_norm": 0.18915730869999403, "learning_rate": 0.00011637649206410298, "loss": 0.5169, "step": 1506 }, { "epoch": 0.9, "grad_norm": 0.1787635275807655, "learning_rate": 0.00011628375057390824, "loss": 0.5218, "step": 1507 }, { "epoch": 0.9, "grad_norm": 0.18297477342275548, "learning_rate": 0.00011619099469436061, "loss": 0.497, "step": 1508 }, { "epoch": 0.9, "grad_norm": 0.1780890941415051, "learning_rate": 0.00011609822450742507, "loss": 0.4961, "step": 1509 }, { "epoch": 0.9, "grad_norm": 0.17078744735386345, "learning_rate": 0.0001160054400950792, "loss": 0.4832, "step": 1510 }, { "epoch": 0.9, "grad_norm": 0.1784315880273548, "learning_rate": 0.00011591264153931321, "loss": 0.4745, "step": 1511 }, { "epoch": 0.9, "grad_norm": 0.1871274533549485, "learning_rate": 0.00011581982892212975, "loss": 0.5072, "step": 1512 }, { "epoch": 0.9, "grad_norm": 0.18545283275745875, "learning_rate": 0.0001157270023255439, "loss": 0.4982, "step": 1513 }, { "epoch": 0.9, "grad_norm": 0.18159317342397205, "learning_rate": 0.00011563416183158318, "loss": 0.5188, "step": 1514 }, { "epoch": 0.9, "grad_norm": 0.20002147439421913, "learning_rate": 0.00011554130752228731, "loss": 0.5472, "step": 1515 }, { "epoch": 0.9, "grad_norm": 0.18673799334263, "learning_rate": 0.00011544843947970822, "loss": 0.469, "step": 1516 }, { "epoch": 0.91, "grad_norm": 0.2099300853781104, "learning_rate": 0.00011535555778590999, "loss": 0.5273, "step": 1517 }, { "epoch": 0.91, "grad_norm": 0.18505904288666042, "learning_rate": 0.00011526266252296876, "loss": 0.5222, "step": 1518 }, { "epoch": 0.91, "grad_norm": 0.2022588519204151, "learning_rate": 0.0001151697537729727, "loss": 0.5154, "step": 1519 }, { "epoch": 0.91, "grad_norm": 0.17874437237649884, "learning_rate": 0.00011507683161802184, "loss": 0.5493, "step": 1520 }, { "epoch": 0.91, "grad_norm": 0.16382256794398076, "learning_rate": 0.00011498389614022807, "loss": 0.4838, "step": 1521 }, { "epoch": 0.91, "grad_norm": 0.1788771786525409, "learning_rate": 0.00011489094742171502, "loss": 0.5068, "step": 1522 }, { "epoch": 0.91, "grad_norm": 0.19844148701897005, "learning_rate": 0.00011479798554461818, "loss": 0.499, "step": 1523 }, { "epoch": 0.91, "grad_norm": 0.18029997805524606, "learning_rate": 0.0001147050105910845, "loss": 0.5218, "step": 1524 }, { "epoch": 0.91, "grad_norm": 0.2070583771119555, "learning_rate": 0.00011461202264327246, "loss": 0.5527, "step": 1525 }, { "epoch": 0.91, "grad_norm": 0.17683689253976204, "learning_rate": 0.00011451902178335219, "loss": 0.475, "step": 1526 }, { "epoch": 0.91, "grad_norm": 0.1746582188063431, "learning_rate": 0.0001144260080935051, "loss": 0.5002, "step": 1527 }, { "epoch": 0.91, "grad_norm": 0.19411077398249144, "learning_rate": 0.00011433298165592396, "loss": 0.4862, "step": 1528 }, { "epoch": 0.91, "grad_norm": 0.1873910047084602, "learning_rate": 0.00011423994255281285, "loss": 0.4882, "step": 1529 }, { "epoch": 0.91, "grad_norm": 0.18794249715310352, "learning_rate": 0.000114146890866387, "loss": 0.543, "step": 1530 }, { "epoch": 0.91, "grad_norm": 0.1795844721124014, "learning_rate": 0.00011405382667887276, "loss": 0.5305, "step": 1531 }, { "epoch": 0.91, "grad_norm": 0.17955259790256417, "learning_rate": 0.00011396075007250758, "loss": 0.4973, "step": 1532 }, { "epoch": 0.91, "grad_norm": 0.21366812260453688, "learning_rate": 0.00011386766112953977, "loss": 0.5278, "step": 1533 }, { "epoch": 0.92, "grad_norm": 0.18167687811794028, "learning_rate": 0.00011377455993222867, "loss": 0.5249, "step": 1534 }, { "epoch": 0.92, "grad_norm": 0.17645370527975052, "learning_rate": 0.00011368144656284436, "loss": 0.4947, "step": 1535 }, { "epoch": 0.92, "grad_norm": 0.16934323070380375, "learning_rate": 0.00011358832110366775, "loss": 0.4616, "step": 1536 }, { "epoch": 0.92, "grad_norm": 0.17196203109279234, "learning_rate": 0.00011349518363699036, "loss": 0.4638, "step": 1537 }, { "epoch": 0.92, "grad_norm": 0.1943534034058594, "learning_rate": 0.00011340203424511434, "loss": 0.5094, "step": 1538 }, { "epoch": 0.92, "grad_norm": 0.17128940028769005, "learning_rate": 0.00011330887301035242, "loss": 0.4624, "step": 1539 }, { "epoch": 0.92, "grad_norm": 0.179017474107053, "learning_rate": 0.00011321570001502775, "loss": 0.4872, "step": 1540 }, { "epoch": 0.92, "grad_norm": 0.17288738113525648, "learning_rate": 0.00011312251534147387, "loss": 0.4754, "step": 1541 }, { "epoch": 0.92, "grad_norm": 0.17577830411270592, "learning_rate": 0.0001130293190720347, "loss": 0.4729, "step": 1542 }, { "epoch": 0.92, "grad_norm": 0.16957226974611797, "learning_rate": 0.00011293611128906431, "loss": 0.4814, "step": 1543 }, { "epoch": 0.92, "grad_norm": 0.1760426058506451, "learning_rate": 0.00011284289207492706, "loss": 0.4619, "step": 1544 }, { "epoch": 0.92, "grad_norm": 0.18329235131533697, "learning_rate": 0.00011274966151199731, "loss": 0.5044, "step": 1545 }, { "epoch": 0.92, "grad_norm": 0.18307223453280677, "learning_rate": 0.00011265641968265945, "loss": 0.5241, "step": 1546 }, { "epoch": 0.92, "grad_norm": 0.17773989766788875, "learning_rate": 0.00011256316666930798, "loss": 0.4925, "step": 1547 }, { "epoch": 0.92, "grad_norm": 0.1769674765097833, "learning_rate": 0.00011246990255434704, "loss": 0.5262, "step": 1548 }, { "epoch": 0.92, "grad_norm": 0.1849871149599951, "learning_rate": 0.00011237662742019075, "loss": 0.4594, "step": 1549 }, { "epoch": 0.92, "grad_norm": 0.18867364566508427, "learning_rate": 0.00011228334134926297, "loss": 0.476, "step": 1550 }, { "epoch": 0.93, "grad_norm": 0.17417769451068804, "learning_rate": 0.00011219004442399712, "loss": 0.4972, "step": 1551 }, { "epoch": 0.93, "grad_norm": 0.16859318717354108, "learning_rate": 0.00011209673672683632, "loss": 0.4802, "step": 1552 }, { "epoch": 0.93, "grad_norm": 0.1644307458626678, "learning_rate": 0.00011200341834023309, "loss": 0.4829, "step": 1553 }, { "epoch": 0.93, "grad_norm": 0.1989496601102084, "learning_rate": 0.00011191008934664951, "loss": 0.6644, "step": 1554 }, { "epoch": 0.93, "grad_norm": 0.1844631029567801, "learning_rate": 0.000111816749828557, "loss": 0.4901, "step": 1555 }, { "epoch": 0.93, "grad_norm": 0.19816902332331113, "learning_rate": 0.00011172339986843626, "loss": 0.5496, "step": 1556 }, { "epoch": 0.93, "grad_norm": 0.19691336622769823, "learning_rate": 0.00011163003954877718, "loss": 0.5586, "step": 1557 }, { "epoch": 0.93, "grad_norm": 0.18166390903396584, "learning_rate": 0.00011153666895207885, "loss": 0.5509, "step": 1558 }, { "epoch": 0.93, "grad_norm": 0.19059509070126388, "learning_rate": 0.00011144328816084952, "loss": 0.5301, "step": 1559 }, { "epoch": 0.93, "grad_norm": 0.1907238818250723, "learning_rate": 0.00011134989725760632, "loss": 0.5015, "step": 1560 }, { "epoch": 0.93, "grad_norm": 0.18292558413429197, "learning_rate": 0.00011125649632487538, "loss": 0.5246, "step": 1561 }, { "epoch": 0.93, "grad_norm": 0.17463088614239725, "learning_rate": 0.00011116308544519163, "loss": 0.4912, "step": 1562 }, { "epoch": 0.93, "grad_norm": 0.19109769957786926, "learning_rate": 0.00011106966470109888, "loss": 0.5016, "step": 1563 }, { "epoch": 0.93, "grad_norm": 0.20891602551163976, "learning_rate": 0.00011097623417514957, "loss": 0.5061, "step": 1564 }, { "epoch": 0.93, "grad_norm": 0.1912861078378425, "learning_rate": 0.00011088279394990491, "loss": 0.5087, "step": 1565 }, { "epoch": 0.93, "grad_norm": 0.20860613345682275, "learning_rate": 0.00011078934410793453, "loss": 0.5475, "step": 1566 }, { "epoch": 0.93, "grad_norm": 0.2016276928245179, "learning_rate": 0.00011069588473181663, "loss": 0.4917, "step": 1567 }, { "epoch": 0.94, "grad_norm": 0.18622143936681837, "learning_rate": 0.00011060241590413787, "loss": 0.4691, "step": 1568 }, { "epoch": 0.94, "grad_norm": 0.1862650789358927, "learning_rate": 0.0001105089377074932, "loss": 0.4783, "step": 1569 }, { "epoch": 0.94, "grad_norm": 0.17314711363865778, "learning_rate": 0.00011041545022448585, "loss": 0.4636, "step": 1570 }, { "epoch": 0.94, "grad_norm": 0.18882699212061288, "learning_rate": 0.00011032195353772732, "loss": 0.5071, "step": 1571 }, { "epoch": 0.94, "grad_norm": 0.18313968659351176, "learning_rate": 0.00011022844772983716, "loss": 0.4925, "step": 1572 }, { "epoch": 0.94, "grad_norm": 0.18989183467798568, "learning_rate": 0.00011013493288344307, "loss": 0.5138, "step": 1573 }, { "epoch": 0.94, "grad_norm": 0.20009639832032847, "learning_rate": 0.00011004140908118069, "loss": 0.5033, "step": 1574 }, { "epoch": 0.94, "grad_norm": 0.1864822830628491, "learning_rate": 0.00010994787640569348, "loss": 0.4566, "step": 1575 }, { "epoch": 0.94, "grad_norm": 0.18537930991361964, "learning_rate": 0.00010985433493963294, "loss": 0.4931, "step": 1576 }, { "epoch": 0.94, "grad_norm": 0.1860930876562612, "learning_rate": 0.00010976078476565818, "loss": 0.4843, "step": 1577 }, { "epoch": 0.94, "grad_norm": 0.2386895987379907, "learning_rate": 0.00010966722596643607, "loss": 0.5246, "step": 1578 }, { "epoch": 0.94, "grad_norm": 0.17624730923913154, "learning_rate": 0.00010957365862464106, "loss": 0.4471, "step": 1579 }, { "epoch": 0.94, "grad_norm": 0.17947142111167033, "learning_rate": 0.00010948008282295523, "loss": 0.5057, "step": 1580 }, { "epoch": 0.94, "grad_norm": 0.17182151942408264, "learning_rate": 0.00010938649864406803, "loss": 0.4452, "step": 1581 }, { "epoch": 0.94, "grad_norm": 0.17298006001721172, "learning_rate": 0.0001092929061706764, "loss": 0.4578, "step": 1582 }, { "epoch": 0.94, "grad_norm": 0.2070838001590237, "learning_rate": 0.00010919930548548456, "loss": 0.4887, "step": 1583 }, { "epoch": 0.95, "grad_norm": 0.18919686254137597, "learning_rate": 0.00010910569667120402, "loss": 0.5132, "step": 1584 }, { "epoch": 0.95, "grad_norm": 0.18877811317785967, "learning_rate": 0.0001090120798105534, "loss": 0.4522, "step": 1585 }, { "epoch": 0.95, "grad_norm": 0.17183583532409152, "learning_rate": 0.00010891845498625857, "loss": 0.4938, "step": 1586 }, { "epoch": 0.95, "grad_norm": 0.19365576655760885, "learning_rate": 0.00010882482228105229, "loss": 0.5192, "step": 1587 }, { "epoch": 0.95, "grad_norm": 0.17490010992510652, "learning_rate": 0.00010873118177767433, "loss": 0.4711, "step": 1588 }, { "epoch": 0.95, "grad_norm": 0.17342191737901563, "learning_rate": 0.00010863753355887143, "loss": 0.4775, "step": 1589 }, { "epoch": 0.95, "grad_norm": 0.16379055188844005, "learning_rate": 0.00010854387770739707, "loss": 0.4708, "step": 1590 }, { "epoch": 0.95, "grad_norm": 0.1842692603288873, "learning_rate": 0.00010845021430601143, "loss": 0.4949, "step": 1591 }, { "epoch": 0.95, "grad_norm": 0.18033303940018516, "learning_rate": 0.00010835654343748149, "loss": 0.4991, "step": 1592 }, { "epoch": 0.95, "grad_norm": 0.16891948671510196, "learning_rate": 0.00010826286518458073, "loss": 0.4001, "step": 1593 }, { "epoch": 0.95, "grad_norm": 0.17260620046518338, "learning_rate": 0.00010816917963008916, "loss": 0.4984, "step": 1594 }, { "epoch": 0.95, "grad_norm": 0.18883680555663057, "learning_rate": 0.00010807548685679334, "loss": 0.5055, "step": 1595 }, { "epoch": 0.95, "grad_norm": 0.17805386706526016, "learning_rate": 0.00010798178694748607, "loss": 0.4544, "step": 1596 }, { "epoch": 0.95, "grad_norm": 0.16667712713064364, "learning_rate": 0.00010788807998496655, "loss": 0.4746, "step": 1597 }, { "epoch": 0.95, "grad_norm": 0.18366350719934063, "learning_rate": 0.00010779436605204017, "loss": 0.5247, "step": 1598 }, { "epoch": 0.95, "grad_norm": 0.22274345582510527, "learning_rate": 0.0001077006452315185, "loss": 0.5095, "step": 1599 }, { "epoch": 0.95, "grad_norm": 0.18493326300947324, "learning_rate": 0.00010760691760621921, "loss": 0.5327, "step": 1600 }, { "epoch": 0.96, "grad_norm": 0.16600394767674562, "learning_rate": 0.00010751318325896592, "loss": 0.498, "step": 1601 }, { "epoch": 0.96, "grad_norm": 0.1788563275400356, "learning_rate": 0.00010741944227258827, "loss": 0.5264, "step": 1602 }, { "epoch": 0.96, "grad_norm": 0.1757402482395372, "learning_rate": 0.00010732569472992171, "loss": 0.5285, "step": 1603 }, { "epoch": 0.96, "grad_norm": 0.18663007232179554, "learning_rate": 0.00010723194071380751, "loss": 0.5189, "step": 1604 }, { "epoch": 0.96, "grad_norm": 0.19049017159093576, "learning_rate": 0.00010713818030709268, "loss": 0.5003, "step": 1605 }, { "epoch": 0.96, "grad_norm": 0.1788772885287754, "learning_rate": 0.00010704441359262982, "loss": 0.465, "step": 1606 }, { "epoch": 0.96, "grad_norm": 0.2158972916651406, "learning_rate": 0.00010695064065327712, "loss": 0.5272, "step": 1607 }, { "epoch": 0.96, "grad_norm": 0.1736784845280322, "learning_rate": 0.00010685686157189832, "loss": 0.5141, "step": 1608 }, { "epoch": 0.96, "grad_norm": 0.19533897028956482, "learning_rate": 0.00010676307643136254, "loss": 0.5258, "step": 1609 }, { "epoch": 0.96, "grad_norm": 0.19307284822375173, "learning_rate": 0.00010666928531454428, "loss": 0.5016, "step": 1610 }, { "epoch": 0.96, "grad_norm": 0.17903971380850872, "learning_rate": 0.00010657548830432329, "loss": 0.4497, "step": 1611 }, { "epoch": 0.96, "grad_norm": 0.19164639321131477, "learning_rate": 0.00010648168548358455, "loss": 0.5137, "step": 1612 }, { "epoch": 0.96, "grad_norm": 0.17618034640166205, "learning_rate": 0.00010638787693521819, "loss": 0.5087, "step": 1613 }, { "epoch": 0.96, "grad_norm": 0.18247646676718576, "learning_rate": 0.00010629406274211934, "loss": 0.5074, "step": 1614 }, { "epoch": 0.96, "grad_norm": 0.17276068231312514, "learning_rate": 0.00010620024298718822, "loss": 0.4787, "step": 1615 }, { "epoch": 0.96, "grad_norm": 0.1915641904837803, "learning_rate": 0.00010610641775332983, "loss": 0.5306, "step": 1616 }, { "epoch": 0.96, "grad_norm": 0.17501613856715206, "learning_rate": 0.00010601258712345414, "loss": 0.5037, "step": 1617 }, { "epoch": 0.97, "grad_norm": 0.18815087782526124, "learning_rate": 0.00010591875118047588, "loss": 0.5116, "step": 1618 }, { "epoch": 0.97, "grad_norm": 0.186783359821177, "learning_rate": 0.00010582491000731432, "loss": 0.5044, "step": 1619 }, { "epoch": 0.97, "grad_norm": 0.22993928050303203, "learning_rate": 0.00010573106368689352, "loss": 0.4821, "step": 1620 }, { "epoch": 0.97, "grad_norm": 0.19722071181591044, "learning_rate": 0.00010563721230214203, "loss": 0.5189, "step": 1621 }, { "epoch": 0.97, "grad_norm": 0.1809577680466395, "learning_rate": 0.00010554335593599285, "loss": 0.5428, "step": 1622 }, { "epoch": 0.97, "grad_norm": 0.1773652413715408, "learning_rate": 0.00010544949467138346, "loss": 0.4978, "step": 1623 }, { "epoch": 0.97, "grad_norm": 0.16603963794042106, "learning_rate": 0.00010535562859125558, "loss": 0.4497, "step": 1624 }, { "epoch": 0.97, "grad_norm": 0.19264000969327572, "learning_rate": 0.0001052617577785552, "loss": 0.5067, "step": 1625 }, { "epoch": 0.97, "grad_norm": 0.19997471766094152, "learning_rate": 0.00010516788231623253, "loss": 0.4959, "step": 1626 }, { "epoch": 0.97, "grad_norm": 0.1853233178226942, "learning_rate": 0.00010507400228724192, "loss": 0.5259, "step": 1627 }, { "epoch": 0.97, "grad_norm": 0.20841560813457816, "learning_rate": 0.00010498011777454163, "loss": 0.6294, "step": 1628 }, { "epoch": 0.97, "grad_norm": 0.17236196464793524, "learning_rate": 0.000104886228861094, "loss": 0.503, "step": 1629 }, { "epoch": 0.97, "grad_norm": 0.17688449011527507, "learning_rate": 0.00010479233562986519, "loss": 0.4859, "step": 1630 }, { "epoch": 0.97, "grad_norm": 0.2609892130797167, "learning_rate": 0.00010469843816382526, "loss": 0.548, "step": 1631 }, { "epoch": 0.97, "grad_norm": 0.19046903787621935, "learning_rate": 0.0001046045365459479, "loss": 0.5337, "step": 1632 }, { "epoch": 0.97, "grad_norm": 0.18314320161449926, "learning_rate": 0.00010451063085921056, "loss": 0.4978, "step": 1633 }, { "epoch": 0.97, "grad_norm": 0.19018122473676155, "learning_rate": 0.00010441672118659422, "loss": 0.4981, "step": 1634 }, { "epoch": 0.98, "grad_norm": 0.1879539993234289, "learning_rate": 0.00010432280761108342, "loss": 0.502, "step": 1635 }, { "epoch": 0.98, "grad_norm": 0.1957235753033609, "learning_rate": 0.00010422889021566618, "loss": 0.5232, "step": 1636 }, { "epoch": 0.98, "grad_norm": 0.1830846528779905, "learning_rate": 0.0001041349690833338, "loss": 0.5185, "step": 1637 }, { "epoch": 0.98, "grad_norm": 0.2110613700559306, "learning_rate": 0.00010404104429708097, "loss": 0.5631, "step": 1638 }, { "epoch": 0.98, "grad_norm": 0.19406820066555858, "learning_rate": 0.00010394711593990554, "loss": 0.487, "step": 1639 }, { "epoch": 0.98, "grad_norm": 0.17641222102986034, "learning_rate": 0.00010385318409480862, "loss": 0.4597, "step": 1640 }, { "epoch": 0.98, "grad_norm": 0.1729485551615291, "learning_rate": 0.00010375924884479427, "loss": 0.4869, "step": 1641 }, { "epoch": 0.98, "grad_norm": 0.17035672775207952, "learning_rate": 0.00010366531027286967, "loss": 0.4805, "step": 1642 }, { "epoch": 0.98, "grad_norm": 0.1718925201317743, "learning_rate": 0.00010357136846204487, "loss": 0.4773, "step": 1643 }, { "epoch": 0.98, "grad_norm": 0.17917233153986767, "learning_rate": 0.00010347742349533278, "loss": 0.4535, "step": 1644 }, { "epoch": 0.98, "grad_norm": 0.19544887303797653, "learning_rate": 0.00010338347545574916, "loss": 0.5009, "step": 1645 }, { "epoch": 0.98, "grad_norm": 0.1759404996666079, "learning_rate": 0.00010328952442631241, "loss": 0.462, "step": 1646 }, { "epoch": 0.98, "grad_norm": 0.17490417817834802, "learning_rate": 0.00010319557049004365, "loss": 0.4648, "step": 1647 }, { "epoch": 0.98, "grad_norm": 0.18605625193133354, "learning_rate": 0.00010310161372996648, "loss": 0.535, "step": 1648 }, { "epoch": 0.98, "grad_norm": 0.18211716389824942, "learning_rate": 0.00010300765422910706, "loss": 0.482, "step": 1649 }, { "epoch": 0.98, "grad_norm": 0.195298449859917, "learning_rate": 0.00010291369207049397, "loss": 0.4966, "step": 1650 }, { "epoch": 0.99, "grad_norm": 0.20359810497996356, "learning_rate": 0.00010281972733715808, "loss": 0.4735, "step": 1651 }, { "epoch": 0.99, "grad_norm": 0.18071892768830874, "learning_rate": 0.00010272576011213262, "loss": 0.5107, "step": 1652 }, { "epoch": 0.99, "grad_norm": 0.19488059200805252, "learning_rate": 0.00010263179047845297, "loss": 0.5581, "step": 1653 }, { "epoch": 0.99, "grad_norm": 0.1827251419622096, "learning_rate": 0.00010253781851915663, "loss": 0.5429, "step": 1654 }, { "epoch": 0.99, "grad_norm": 0.1815977598840352, "learning_rate": 0.0001024438443172832, "loss": 0.4719, "step": 1655 }, { "epoch": 0.99, "grad_norm": 0.1750166365728634, "learning_rate": 0.00010234986795587418, "loss": 0.4972, "step": 1656 }, { "epoch": 0.99, "grad_norm": 0.16860820894878417, "learning_rate": 0.00010225588951797309, "loss": 0.4881, "step": 1657 }, { "epoch": 0.99, "grad_norm": 0.1653555565773814, "learning_rate": 0.00010216190908662522, "loss": 0.4759, "step": 1658 }, { "epoch": 0.99, "grad_norm": 0.1973447838806209, "learning_rate": 0.0001020679267448776, "loss": 0.5361, "step": 1659 }, { "epoch": 0.99, "grad_norm": 0.17166094417441266, "learning_rate": 0.00010197394257577902, "loss": 0.4587, "step": 1660 }, { "epoch": 0.99, "grad_norm": 0.17920046555821298, "learning_rate": 0.00010187995666237977, "loss": 0.5237, "step": 1661 }, { "epoch": 0.99, "grad_norm": 0.1746051081494759, "learning_rate": 0.00010178596908773179, "loss": 0.5352, "step": 1662 }, { "epoch": 0.99, "grad_norm": 0.1904814752787734, "learning_rate": 0.00010169197993488851, "loss": 0.4947, "step": 1663 }, { "epoch": 0.99, "grad_norm": 0.1682118449533182, "learning_rate": 0.0001015979892869046, "loss": 0.5075, "step": 1664 }, { "epoch": 0.99, "grad_norm": 0.1787167363650672, "learning_rate": 0.00010150399722683623, "loss": 0.503, "step": 1665 }, { "epoch": 0.99, "grad_norm": 0.17433015556420575, "learning_rate": 0.00010141000383774067, "loss": 0.4736, "step": 1666 }, { "epoch": 0.99, "grad_norm": 0.18890832234956914, "learning_rate": 0.00010131600920267645, "loss": 0.482, "step": 1667 }, { "epoch": 1.0, "grad_norm": 0.16995768612256246, "learning_rate": 0.00010122201340470321, "loss": 0.4764, "step": 1668 }, { "epoch": 1.0, "grad_norm": 0.19189171962262738, "learning_rate": 0.00010112801652688155, "loss": 0.5234, "step": 1669 }, { "epoch": 1.0, "grad_norm": 0.1887634223189705, "learning_rate": 0.00010103401865227304, "loss": 0.4763, "step": 1670 }, { "epoch": 1.0, "grad_norm": 0.1842920226325373, "learning_rate": 0.00010094001986394024, "loss": 0.5156, "step": 1671 }, { "epoch": 1.0, "grad_norm": 0.17969781851880492, "learning_rate": 0.00010084602024494633, "loss": 0.4826, "step": 1672 }, { "epoch": 1.0, "grad_norm": 0.19854544410385247, "learning_rate": 0.0001007520198783554, "loss": 0.5498, "step": 1673 }, { "epoch": 1.0, "grad_norm": 0.16752089299272865, "learning_rate": 0.0001006580188472321, "loss": 0.4453, "step": 1674 }, { "epoch": 1.0, "grad_norm": 0.1740348252936511, "learning_rate": 0.00010056401723464166, "loss": 0.5058, "step": 1675 }, { "epoch": 1.0, "grad_norm": 0.1734770189678822, "learning_rate": 0.00010047001512364992, "loss": 0.4988, "step": 1676 }, { "epoch": 1.0, "grad_norm": 0.19216191844782038, "learning_rate": 0.00010037601259732308, "loss": 0.5046, "step": 1677 }, { "epoch": 1.0, "grad_norm": 0.19668950532063106, "learning_rate": 0.00010028200973872766, "loss": 0.5271, "step": 1678 }, { "epoch": 1.0, "grad_norm": 0.17543353269555373, "learning_rate": 0.00010018800663093057, "loss": 0.4429, "step": 1679 }, { "epoch": 1.0, "grad_norm": 0.1649204380890276, "learning_rate": 0.00010009400335699894, "loss": 0.4532, "step": 1680 }, { "epoch": 1.0, "grad_norm": 0.19447076402139798, "learning_rate": 0.0001, "loss": 0.5214, "step": 1681 }, { "epoch": 1.0, "grad_norm": 0.2095390863895404, "learning_rate": 9.990599664300105e-05, "loss": 0.5393, "step": 1682 }, { "epoch": 1.0, "grad_norm": 0.19567081187455143, "learning_rate": 9.981199336906944e-05, "loss": 0.5266, "step": 1683 }, { "epoch": 1.0, "grad_norm": 0.17528573990808377, "learning_rate": 9.971799026127236e-05, "loss": 0.4686, "step": 1684 }, { "epoch": 1.01, "grad_norm": 0.16498662906420647, "learning_rate": 9.962398740267696e-05, "loss": 0.4576, "step": 1685 }, { "epoch": 1.01, "grad_norm": 0.1868464359812763, "learning_rate": 9.952998487635011e-05, "loss": 0.5504, "step": 1686 }, { "epoch": 1.01, "grad_norm": 0.1848579187325209, "learning_rate": 9.943598276535835e-05, "loss": 0.4671, "step": 1687 }, { "epoch": 1.01, "grad_norm": 0.18102324049270346, "learning_rate": 9.934198115276793e-05, "loss": 0.4869, "step": 1688 }, { "epoch": 1.01, "grad_norm": 0.17304140002160995, "learning_rate": 9.924798012164459e-05, "loss": 0.5308, "step": 1689 }, { "epoch": 1.01, "grad_norm": 0.17379146302190593, "learning_rate": 9.915397975505369e-05, "loss": 0.4888, "step": 1690 }, { "epoch": 1.01, "grad_norm": 0.1771864954623761, "learning_rate": 9.90599801360598e-05, "loss": 0.5018, "step": 1691 }, { "epoch": 1.01, "grad_norm": 0.19350545061458058, "learning_rate": 9.896598134772697e-05, "loss": 0.5308, "step": 1692 }, { "epoch": 1.01, "grad_norm": 0.18561731061862277, "learning_rate": 9.887198347311849e-05, "loss": 0.51, "step": 1693 }, { "epoch": 1.01, "grad_norm": 0.2391921626817571, "learning_rate": 9.877798659529683e-05, "loss": 0.492, "step": 1694 }, { "epoch": 1.01, "grad_norm": 0.18604469349478459, "learning_rate": 9.868399079732356e-05, "loss": 0.4857, "step": 1695 }, { "epoch": 1.01, "grad_norm": 0.2054893911376399, "learning_rate": 9.858999616225939e-05, "loss": 0.5076, "step": 1696 }, { "epoch": 1.01, "grad_norm": 0.19787007449159563, "learning_rate": 9.849600277316379e-05, "loss": 0.4909, "step": 1697 }, { "epoch": 1.01, "grad_norm": 0.19531017803501713, "learning_rate": 9.840201071309539e-05, "loss": 0.5164, "step": 1698 }, { "epoch": 1.01, "grad_norm": 0.1743391543139743, "learning_rate": 9.830802006511154e-05, "loss": 0.4362, "step": 1699 }, { "epoch": 1.01, "grad_norm": 0.1793245330684509, "learning_rate": 9.821403091226822e-05, "loss": 0.5189, "step": 1700 }, { "epoch": 1.01, "grad_norm": 0.17976563495164294, "learning_rate": 9.812004333762027e-05, "loss": 0.501, "step": 1701 }, { "epoch": 1.02, "grad_norm": 0.18720637986337038, "learning_rate": 9.802605742422104e-05, "loss": 0.5105, "step": 1702 }, { "epoch": 1.02, "grad_norm": 0.18379799853488712, "learning_rate": 9.793207325512242e-05, "loss": 0.4786, "step": 1703 }, { "epoch": 1.02, "grad_norm": 0.1869951004361499, "learning_rate": 9.78380909133748e-05, "loss": 0.4969, "step": 1704 }, { "epoch": 1.02, "grad_norm": 0.19079725212306675, "learning_rate": 9.77441104820269e-05, "loss": 0.469, "step": 1705 }, { "epoch": 1.02, "grad_norm": 0.19533398341345093, "learning_rate": 9.765013204412583e-05, "loss": 0.4961, "step": 1706 }, { "epoch": 1.02, "grad_norm": 0.1836971846789222, "learning_rate": 9.755615568271683e-05, "loss": 0.5366, "step": 1707 }, { "epoch": 1.02, "grad_norm": 0.17261463561503315, "learning_rate": 9.746218148084337e-05, "loss": 0.5368, "step": 1708 }, { "epoch": 1.0, "grad_norm": 0.20247882518351865, "learning_rate": 9.736820952154706e-05, "loss": 0.5068, "step": 1709 }, { "epoch": 1.0, "grad_norm": 0.1598672802477059, "learning_rate": 9.72742398878674e-05, "loss": 0.388, "step": 1710 }, { "epoch": 1.0, "grad_norm": 0.16834802241635866, "learning_rate": 9.718027266284192e-05, "loss": 0.4175, "step": 1711 }, { "epoch": 1.0, "grad_norm": 0.1771452263915879, "learning_rate": 9.708630792950608e-05, "loss": 0.4277, "step": 1712 }, { "epoch": 1.0, "grad_norm": 0.1708577470493871, "learning_rate": 9.699234577089297e-05, "loss": 0.3812, "step": 1713 }, { "epoch": 1.0, "grad_norm": 0.18260209637789201, "learning_rate": 9.689838627003354e-05, "loss": 0.4253, "step": 1714 }, { "epoch": 1.0, "grad_norm": 0.22465773493800734, "learning_rate": 9.68044295099564e-05, "loss": 0.4531, "step": 1715 }, { "epoch": 1.0, "grad_norm": 0.2014183902769127, "learning_rate": 9.671047557368761e-05, "loss": 0.3596, "step": 1716 }, { "epoch": 1.0, "grad_norm": 0.19751970937795732, "learning_rate": 9.661652454425086e-05, "loss": 0.4525, "step": 1717 }, { "epoch": 1.01, "grad_norm": 0.20012635962136738, "learning_rate": 9.652257650466723e-05, "loss": 0.3992, "step": 1718 }, { "epoch": 1.01, "grad_norm": 0.18539871190485555, "learning_rate": 9.642863153795516e-05, "loss": 0.3962, "step": 1719 }, { "epoch": 1.01, "grad_norm": 0.18906247477131508, "learning_rate": 9.633468972713034e-05, "loss": 0.3985, "step": 1720 }, { "epoch": 1.01, "grad_norm": 0.19159144899087352, "learning_rate": 9.624075115520572e-05, "loss": 0.4178, "step": 1721 }, { "epoch": 1.01, "grad_norm": 0.22081857132075158, "learning_rate": 9.614681590519143e-05, "loss": 0.419, "step": 1722 }, { "epoch": 1.01, "grad_norm": 0.19043102570259512, "learning_rate": 9.605288406009447e-05, "loss": 0.4013, "step": 1723 }, { "epoch": 1.01, "grad_norm": 0.19103339288259663, "learning_rate": 9.595895570291906e-05, "loss": 0.4401, "step": 1724 }, { "epoch": 1.01, "grad_norm": 0.1734966093658671, "learning_rate": 9.586503091666623e-05, "loss": 0.3825, "step": 1725 }, { "epoch": 1.01, "grad_norm": 0.17722830697815692, "learning_rate": 9.577110978433385e-05, "loss": 0.4314, "step": 1726 }, { "epoch": 1.01, "grad_norm": 0.19257129465178685, "learning_rate": 9.567719238891658e-05, "loss": 0.4564, "step": 1727 }, { "epoch": 1.01, "grad_norm": 0.18967332310638602, "learning_rate": 9.55832788134058e-05, "loss": 0.439, "step": 1728 }, { "epoch": 1.01, "grad_norm": 0.20692915676429485, "learning_rate": 9.548936914078946e-05, "loss": 0.4399, "step": 1729 }, { "epoch": 1.01, "grad_norm": 0.19518385923192133, "learning_rate": 9.53954634540521e-05, "loss": 0.4042, "step": 1730 }, { "epoch": 1.01, "grad_norm": 0.1751627976443351, "learning_rate": 9.530156183617475e-05, "loss": 0.3965, "step": 1731 }, { "epoch": 1.01, "grad_norm": 0.1815012657437565, "learning_rate": 9.520766437013483e-05, "loss": 0.4374, "step": 1732 }, { "epoch": 1.01, "grad_norm": 0.18061460328754317, "learning_rate": 9.511377113890602e-05, "loss": 0.3605, "step": 1733 }, { "epoch": 1.02, "grad_norm": 0.20605498698659788, "learning_rate": 9.501988222545838e-05, "loss": 0.4335, "step": 1734 }, { "epoch": 1.02, "grad_norm": 0.18420612406230608, "learning_rate": 9.492599771275813e-05, "loss": 0.4283, "step": 1735 }, { "epoch": 1.02, "grad_norm": 0.16811914920856832, "learning_rate": 9.483211768376749e-05, "loss": 0.3856, "step": 1736 }, { "epoch": 1.02, "grad_norm": 0.1756881106242861, "learning_rate": 9.473824222144483e-05, "loss": 0.4262, "step": 1737 }, { "epoch": 1.02, "grad_norm": 0.1748388175069077, "learning_rate": 9.464437140874447e-05, "loss": 0.3776, "step": 1738 }, { "epoch": 1.02, "grad_norm": 0.1850259364570879, "learning_rate": 9.455050532861656e-05, "loss": 0.4085, "step": 1739 }, { "epoch": 1.02, "grad_norm": 0.1724597528234989, "learning_rate": 9.445664406400716e-05, "loss": 0.3584, "step": 1740 }, { "epoch": 1.02, "grad_norm": 0.1892387423485602, "learning_rate": 9.4362787697858e-05, "loss": 0.3991, "step": 1741 }, { "epoch": 1.02, "grad_norm": 0.17643614255793788, "learning_rate": 9.42689363131065e-05, "loss": 0.3876, "step": 1742 }, { "epoch": 1.02, "grad_norm": 0.18239920728297748, "learning_rate": 9.417508999268569e-05, "loss": 0.3856, "step": 1743 }, { "epoch": 1.02, "grad_norm": 0.17690658739964463, "learning_rate": 9.408124881952418e-05, "loss": 0.3972, "step": 1744 }, { "epoch": 1.02, "grad_norm": 0.19597272887285241, "learning_rate": 9.398741287654587e-05, "loss": 0.4211, "step": 1745 }, { "epoch": 1.02, "grad_norm": 0.1818944310579676, "learning_rate": 9.389358224667019e-05, "loss": 0.3708, "step": 1746 }, { "epoch": 1.02, "grad_norm": 0.1767137534783939, "learning_rate": 9.379975701281181e-05, "loss": 0.3912, "step": 1747 }, { "epoch": 1.02, "grad_norm": 0.1711169455706469, "learning_rate": 9.370593725788068e-05, "loss": 0.3876, "step": 1748 }, { "epoch": 1.02, "grad_norm": 0.1731607579175634, "learning_rate": 9.361212306478185e-05, "loss": 0.3862, "step": 1749 }, { "epoch": 1.02, "grad_norm": 0.1863257179297087, "learning_rate": 9.351831451641546e-05, "loss": 0.4546, "step": 1750 }, { "epoch": 1.03, "grad_norm": 0.20882434007762518, "learning_rate": 9.342451169567675e-05, "loss": 0.4224, "step": 1751 }, { "epoch": 1.03, "grad_norm": 0.19101829618663632, "learning_rate": 9.333071468545573e-05, "loss": 0.4388, "step": 1752 }, { "epoch": 1.03, "grad_norm": 0.18459630326288334, "learning_rate": 9.323692356863746e-05, "loss": 0.4376, "step": 1753 }, { "epoch": 1.03, "grad_norm": 0.17877215371813454, "learning_rate": 9.314313842810172e-05, "loss": 0.4084, "step": 1754 }, { "epoch": 1.03, "grad_norm": 0.21787097366397204, "learning_rate": 9.30493593467229e-05, "loss": 0.4495, "step": 1755 }, { "epoch": 1.03, "grad_norm": 0.1820262678604187, "learning_rate": 9.295558640737019e-05, "loss": 0.426, "step": 1756 }, { "epoch": 1.03, "grad_norm": 0.1713277869502375, "learning_rate": 9.286181969290736e-05, "loss": 0.3856, "step": 1757 }, { "epoch": 1.03, "grad_norm": 0.18414728315702017, "learning_rate": 9.276805928619251e-05, "loss": 0.4138, "step": 1758 }, { "epoch": 1.03, "grad_norm": 0.18241075623039185, "learning_rate": 9.267430527007831e-05, "loss": 0.3828, "step": 1759 }, { "epoch": 1.03, "grad_norm": 0.18698971522482388, "learning_rate": 9.258055772741174e-05, "loss": 0.4183, "step": 1760 }, { "epoch": 1.03, "grad_norm": 0.16609172517956425, "learning_rate": 9.24868167410341e-05, "loss": 0.3542, "step": 1761 }, { "epoch": 1.03, "grad_norm": 0.18231584353924798, "learning_rate": 9.239308239378081e-05, "loss": 0.43, "step": 1762 }, { "epoch": 1.03, "grad_norm": 0.18874617751046177, "learning_rate": 9.229935476848151e-05, "loss": 0.4213, "step": 1763 }, { "epoch": 1.03, "grad_norm": 0.17314287835154177, "learning_rate": 9.220563394795984e-05, "loss": 0.401, "step": 1764 }, { "epoch": 1.03, "grad_norm": 0.1862791761822762, "learning_rate": 9.211192001503346e-05, "loss": 0.4073, "step": 1765 }, { "epoch": 1.03, "grad_norm": 0.20601420449280228, "learning_rate": 9.201821305251393e-05, "loss": 0.3986, "step": 1766 }, { "epoch": 1.03, "grad_norm": 0.19246329075521015, "learning_rate": 9.192451314320669e-05, "loss": 0.4495, "step": 1767 }, { "epoch": 1.04, "grad_norm": 0.16997776284538635, "learning_rate": 9.183082036991084e-05, "loss": 0.3906, "step": 1768 }, { "epoch": 1.04, "grad_norm": 0.2036008039486783, "learning_rate": 9.173713481541929e-05, "loss": 0.4135, "step": 1769 }, { "epoch": 1.04, "grad_norm": 0.18909810433881546, "learning_rate": 9.164345656251853e-05, "loss": 0.431, "step": 1770 }, { "epoch": 1.04, "grad_norm": 0.20571260974020206, "learning_rate": 9.154978569398859e-05, "loss": 0.4038, "step": 1771 }, { "epoch": 1.04, "grad_norm": 0.18675687168122815, "learning_rate": 9.145612229260295e-05, "loss": 0.4198, "step": 1772 }, { "epoch": 1.04, "grad_norm": 0.18206854372967374, "learning_rate": 9.13624664411286e-05, "loss": 0.4399, "step": 1773 }, { "epoch": 1.04, "grad_norm": 0.18084477521813697, "learning_rate": 9.126881822232568e-05, "loss": 0.4206, "step": 1774 }, { "epoch": 1.04, "grad_norm": 0.19191834762302898, "learning_rate": 9.117517771894773e-05, "loss": 0.4108, "step": 1775 }, { "epoch": 1.04, "grad_norm": 0.1755560835129537, "learning_rate": 9.108154501374143e-05, "loss": 0.3662, "step": 1776 }, { "epoch": 1.04, "grad_norm": 0.185181106446634, "learning_rate": 9.098792018944661e-05, "loss": 0.4275, "step": 1777 }, { "epoch": 1.04, "grad_norm": 0.2030891286279709, "learning_rate": 9.089430332879599e-05, "loss": 0.4119, "step": 1778 }, { "epoch": 1.04, "grad_norm": 0.175294036399659, "learning_rate": 9.080069451451544e-05, "loss": 0.3841, "step": 1779 }, { "epoch": 1.04, "grad_norm": 0.1930453644377178, "learning_rate": 9.070709382932363e-05, "loss": 0.4081, "step": 1780 }, { "epoch": 1.04, "grad_norm": 0.17187828203070712, "learning_rate": 9.0613501355932e-05, "loss": 0.4122, "step": 1781 }, { "epoch": 1.04, "grad_norm": 0.22525449819172308, "learning_rate": 9.05199171770448e-05, "loss": 0.4644, "step": 1782 }, { "epoch": 1.04, "grad_norm": 0.1908671974863637, "learning_rate": 9.042634137535898e-05, "loss": 0.3771, "step": 1783 }, { "epoch": 1.04, "grad_norm": 0.1755715761595777, "learning_rate": 9.033277403356397e-05, "loss": 0.39, "step": 1784 }, { "epoch": 1.05, "grad_norm": 0.18097636315463037, "learning_rate": 9.023921523434184e-05, "loss": 0.3848, "step": 1785 }, { "epoch": 1.05, "grad_norm": 0.17287399316233582, "learning_rate": 9.01456650603671e-05, "loss": 0.3714, "step": 1786 }, { "epoch": 1.05, "grad_norm": 0.18747127569805108, "learning_rate": 9.005212359430654e-05, "loss": 0.409, "step": 1787 }, { "epoch": 1.05, "grad_norm": 0.18137619801864702, "learning_rate": 8.995859091881935e-05, "loss": 0.3967, "step": 1788 }, { "epoch": 1.05, "grad_norm": 0.1779504228236956, "learning_rate": 8.986506711655692e-05, "loss": 0.4152, "step": 1789 }, { "epoch": 1.05, "grad_norm": 0.19489093546042696, "learning_rate": 8.977155227016286e-05, "loss": 0.4182, "step": 1790 }, { "epoch": 1.05, "grad_norm": 0.1845854795417051, "learning_rate": 8.967804646227271e-05, "loss": 0.4132, "step": 1791 }, { "epoch": 1.05, "grad_norm": 0.17653594604432007, "learning_rate": 8.958454977551414e-05, "loss": 0.4075, "step": 1792 }, { "epoch": 1.05, "grad_norm": 0.17538723506657966, "learning_rate": 8.949106229250685e-05, "loss": 0.4085, "step": 1793 }, { "epoch": 1.05, "grad_norm": 0.1984669167523276, "learning_rate": 8.939758409586216e-05, "loss": 0.3935, "step": 1794 }, { "epoch": 1.05, "grad_norm": 0.1817428273700713, "learning_rate": 8.930411526818337e-05, "loss": 0.3728, "step": 1795 }, { "epoch": 1.05, "grad_norm": 0.17233958696209886, "learning_rate": 8.92106558920655e-05, "loss": 0.4157, "step": 1796 }, { "epoch": 1.05, "grad_norm": 0.18025099372025463, "learning_rate": 8.911720605009511e-05, "loss": 0.3834, "step": 1797 }, { "epoch": 1.05, "grad_norm": 0.1876060834020746, "learning_rate": 8.902376582485043e-05, "loss": 0.4072, "step": 1798 }, { "epoch": 1.05, "grad_norm": 0.17556808476603256, "learning_rate": 8.893033529890118e-05, "loss": 0.3941, "step": 1799 }, { "epoch": 1.05, "grad_norm": 0.21284599089407197, "learning_rate": 8.883691455480839e-05, "loss": 0.4245, "step": 1800 }, { "epoch": 1.06, "grad_norm": 0.22170523691150348, "learning_rate": 8.874350367512465e-05, "loss": 0.4269, "step": 1801 }, { "epoch": 1.06, "grad_norm": 0.1797945175853634, "learning_rate": 8.865010274239372e-05, "loss": 0.3614, "step": 1802 }, { "epoch": 1.06, "grad_norm": 0.19392543266039863, "learning_rate": 8.85567118391505e-05, "loss": 0.415, "step": 1803 }, { "epoch": 1.06, "grad_norm": 0.18620026243486437, "learning_rate": 8.846333104792116e-05, "loss": 0.416, "step": 1804 }, { "epoch": 1.06, "grad_norm": 0.1936617538942135, "learning_rate": 8.836996045122286e-05, "loss": 0.404, "step": 1805 }, { "epoch": 1.06, "grad_norm": 0.205931504034666, "learning_rate": 8.827660013156381e-05, "loss": 0.4025, "step": 1806 }, { "epoch": 1.06, "grad_norm": 0.18421178078483139, "learning_rate": 8.818325017144302e-05, "loss": 0.3884, "step": 1807 }, { "epoch": 1.06, "grad_norm": 0.1872697932453832, "learning_rate": 8.808991065335049e-05, "loss": 0.3989, "step": 1808 }, { "epoch": 1.06, "grad_norm": 0.1749427185898119, "learning_rate": 8.799658165976694e-05, "loss": 0.4226, "step": 1809 }, { "epoch": 1.06, "grad_norm": 0.19819149794184812, "learning_rate": 8.790326327316372e-05, "loss": 0.4501, "step": 1810 }, { "epoch": 1.06, "grad_norm": 0.18523438192000483, "learning_rate": 8.780995557600287e-05, "loss": 0.3982, "step": 1811 }, { "epoch": 1.06, "grad_norm": 0.17754926519420264, "learning_rate": 8.771665865073707e-05, "loss": 0.4238, "step": 1812 }, { "epoch": 1.06, "grad_norm": 0.18958930276429575, "learning_rate": 8.762337257980927e-05, "loss": 0.4067, "step": 1813 }, { "epoch": 1.06, "grad_norm": 0.17184782349154784, "learning_rate": 8.753009744565297e-05, "loss": 0.3839, "step": 1814 }, { "epoch": 1.06, "grad_norm": 0.18472593089993697, "learning_rate": 8.743683333069208e-05, "loss": 0.4147, "step": 1815 }, { "epoch": 1.06, "grad_norm": 0.1754635852591751, "learning_rate": 8.734358031734056e-05, "loss": 0.3632, "step": 1816 }, { "epoch": 1.06, "grad_norm": 0.18050807312591965, "learning_rate": 8.725033848800273e-05, "loss": 0.4126, "step": 1817 }, { "epoch": 1.07, "grad_norm": 0.18768687894888247, "learning_rate": 8.715710792507295e-05, "loss": 0.4277, "step": 1818 }, { "epoch": 1.07, "grad_norm": 0.16954704286667174, "learning_rate": 8.706388871093571e-05, "loss": 0.3805, "step": 1819 }, { "epoch": 1.07, "grad_norm": 0.17940056263287732, "learning_rate": 8.697068092796531e-05, "loss": 0.3967, "step": 1820 }, { "epoch": 1.07, "grad_norm": 0.1957651587746892, "learning_rate": 8.687748465852614e-05, "loss": 0.4095, "step": 1821 }, { "epoch": 1.07, "grad_norm": 0.1787849320144345, "learning_rate": 8.678429998497229e-05, "loss": 0.401, "step": 1822 }, { "epoch": 1.07, "grad_norm": 0.1844744058248385, "learning_rate": 8.66911269896476e-05, "loss": 0.3861, "step": 1823 }, { "epoch": 1.07, "grad_norm": 0.19625307901860042, "learning_rate": 8.659796575488566e-05, "loss": 0.4097, "step": 1824 }, { "epoch": 1.07, "grad_norm": 0.1950267957891965, "learning_rate": 8.650481636300969e-05, "loss": 0.4397, "step": 1825 }, { "epoch": 1.07, "grad_norm": 0.17815768732680395, "learning_rate": 8.641167889633228e-05, "loss": 0.3898, "step": 1826 }, { "epoch": 1.07, "grad_norm": 0.18737413534954547, "learning_rate": 8.631855343715565e-05, "loss": 0.4171, "step": 1827 }, { "epoch": 1.07, "grad_norm": 0.1820901135233458, "learning_rate": 8.622544006777136e-05, "loss": 0.3997, "step": 1828 }, { "epoch": 1.07, "grad_norm": 0.1804075064552973, "learning_rate": 8.613233887046027e-05, "loss": 0.3789, "step": 1829 }, { "epoch": 1.07, "grad_norm": 0.19493763026231556, "learning_rate": 8.603924992749245e-05, "loss": 0.3841, "step": 1830 }, { "epoch": 1.07, "grad_norm": 0.20359305389403523, "learning_rate": 8.594617332112725e-05, "loss": 0.4453, "step": 1831 }, { "epoch": 1.07, "grad_norm": 0.2265086862162401, "learning_rate": 8.585310913361301e-05, "loss": 0.4563, "step": 1832 }, { "epoch": 1.07, "grad_norm": 0.17567731483293006, "learning_rate": 8.576005744718716e-05, "loss": 0.4016, "step": 1833 }, { "epoch": 1.07, "grad_norm": 0.17379900034242313, "learning_rate": 8.566701834407605e-05, "loss": 0.3586, "step": 1834 }, { "epoch": 1.08, "grad_norm": 0.1942490326860427, "learning_rate": 8.557399190649496e-05, "loss": 0.4221, "step": 1835 }, { "epoch": 1.08, "grad_norm": 0.19035569554722812, "learning_rate": 8.548097821664785e-05, "loss": 0.4172, "step": 1836 }, { "epoch": 1.08, "grad_norm": 0.1843142843552852, "learning_rate": 8.538797735672753e-05, "loss": 0.4148, "step": 1837 }, { "epoch": 1.08, "grad_norm": 0.18512939658727262, "learning_rate": 8.529498940891554e-05, "loss": 0.3773, "step": 1838 }, { "epoch": 1.08, "grad_norm": 0.20380626583697006, "learning_rate": 8.520201445538183e-05, "loss": 0.3611, "step": 1839 }, { "epoch": 1.08, "grad_norm": 0.19398861478004933, "learning_rate": 8.510905257828496e-05, "loss": 0.412, "step": 1840 }, { "epoch": 1.08, "grad_norm": 0.19213160694154596, "learning_rate": 8.501610385977198e-05, "loss": 0.4024, "step": 1841 }, { "epoch": 1.08, "grad_norm": 0.18667688767595464, "learning_rate": 8.49231683819782e-05, "loss": 0.4313, "step": 1842 }, { "epoch": 1.08, "grad_norm": 0.17127203811779163, "learning_rate": 8.483024622702732e-05, "loss": 0.3568, "step": 1843 }, { "epoch": 1.08, "grad_norm": 0.17717528881555858, "learning_rate": 8.473733747703129e-05, "loss": 0.4102, "step": 1844 }, { "epoch": 1.08, "grad_norm": 0.18077085803280193, "learning_rate": 8.464444221409004e-05, "loss": 0.3759, "step": 1845 }, { "epoch": 1.08, "grad_norm": 0.18149742276377073, "learning_rate": 8.45515605202918e-05, "loss": 0.423, "step": 1846 }, { "epoch": 1.08, "grad_norm": 0.1845319878720036, "learning_rate": 8.44586924777127e-05, "loss": 0.4175, "step": 1847 }, { "epoch": 1.08, "grad_norm": 0.18412454666796282, "learning_rate": 8.436583816841684e-05, "loss": 0.3959, "step": 1848 }, { "epoch": 1.08, "grad_norm": 0.1818564229667886, "learning_rate": 8.42729976744561e-05, "loss": 0.425, "step": 1849 }, { "epoch": 1.08, "grad_norm": 0.21214694567252032, "learning_rate": 8.418017107787028e-05, "loss": 0.4223, "step": 1850 }, { "epoch": 1.08, "grad_norm": 0.18464329908222107, "learning_rate": 8.408735846068683e-05, "loss": 0.4099, "step": 1851 }, { "epoch": 1.09, "grad_norm": 0.1953000842888941, "learning_rate": 8.399455990492082e-05, "loss": 0.373, "step": 1852 }, { "epoch": 1.09, "grad_norm": 0.19283458690381147, "learning_rate": 8.390177549257494e-05, "loss": 0.4272, "step": 1853 }, { "epoch": 1.09, "grad_norm": 0.186815226924515, "learning_rate": 8.38090053056394e-05, "loss": 0.3891, "step": 1854 }, { "epoch": 1.09, "grad_norm": 0.17547970914468208, "learning_rate": 8.371624942609177e-05, "loss": 0.3661, "step": 1855 }, { "epoch": 1.09, "grad_norm": 0.18299608790344135, "learning_rate": 8.362350793589705e-05, "loss": 0.4133, "step": 1856 }, { "epoch": 1.09, "grad_norm": 0.188868917620866, "learning_rate": 8.353078091700751e-05, "loss": 0.3885, "step": 1857 }, { "epoch": 1.09, "grad_norm": 0.18813258089991336, "learning_rate": 8.343806845136255e-05, "loss": 0.4251, "step": 1858 }, { "epoch": 1.09, "grad_norm": 0.2141392463690851, "learning_rate": 8.334537062088878e-05, "loss": 0.3776, "step": 1859 }, { "epoch": 1.09, "grad_norm": 0.18765213967305597, "learning_rate": 8.325268750749991e-05, "loss": 0.4284, "step": 1860 }, { "epoch": 1.09, "grad_norm": 0.17745783167340184, "learning_rate": 8.316001919309667e-05, "loss": 0.3719, "step": 1861 }, { "epoch": 1.09, "grad_norm": 0.17032487036928615, "learning_rate": 8.306736575956651e-05, "loss": 0.3871, "step": 1862 }, { "epoch": 1.09, "grad_norm": 0.1946877937916614, "learning_rate": 8.297472728878392e-05, "loss": 0.4444, "step": 1863 }, { "epoch": 1.09, "grad_norm": 0.17977342966223753, "learning_rate": 8.288210386261019e-05, "loss": 0.4286, "step": 1864 }, { "epoch": 1.09, "grad_norm": 0.17193878042087601, "learning_rate": 8.278949556289314e-05, "loss": 0.3836, "step": 1865 }, { "epoch": 1.09, "grad_norm": 0.17939010894202997, "learning_rate": 8.269690247146737e-05, "loss": 0.3601, "step": 1866 }, { "epoch": 1.09, "grad_norm": 0.19980024554153514, "learning_rate": 8.260432467015403e-05, "loss": 0.4138, "step": 1867 }, { "epoch": 1.1, "grad_norm": 0.18353233337118147, "learning_rate": 8.251176224076067e-05, "loss": 0.4275, "step": 1868 }, { "epoch": 1.1, "grad_norm": 0.1834636907263801, "learning_rate": 8.241921526508135e-05, "loss": 0.4151, "step": 1869 }, { "epoch": 1.1, "grad_norm": 0.17296086904070018, "learning_rate": 8.232668382489646e-05, "loss": 0.3677, "step": 1870 }, { "epoch": 1.1, "grad_norm": 0.16857606057643587, "learning_rate": 8.223416800197256e-05, "loss": 0.3758, "step": 1871 }, { "epoch": 1.1, "grad_norm": 0.18178713996724474, "learning_rate": 8.214166787806252e-05, "loss": 0.4152, "step": 1872 }, { "epoch": 1.1, "grad_norm": 0.18252800035226638, "learning_rate": 8.204918353490535e-05, "loss": 0.3854, "step": 1873 }, { "epoch": 1.1, "grad_norm": 0.19028068789402494, "learning_rate": 8.195671505422602e-05, "loss": 0.3792, "step": 1874 }, { "epoch": 1.1, "grad_norm": 0.19079634326382508, "learning_rate": 8.186426251773554e-05, "loss": 0.4294, "step": 1875 }, { "epoch": 1.1, "grad_norm": 0.17426500052210128, "learning_rate": 8.177182600713084e-05, "loss": 0.3913, "step": 1876 }, { "epoch": 1.1, "grad_norm": 0.17814955101973237, "learning_rate": 8.167940560409469e-05, "loss": 0.383, "step": 1877 }, { "epoch": 1.1, "grad_norm": 0.1846284044065382, "learning_rate": 8.158700139029557e-05, "loss": 0.3809, "step": 1878 }, { "epoch": 1.1, "grad_norm": 0.192583860307762, "learning_rate": 8.14946134473877e-05, "loss": 0.4238, "step": 1879 }, { "epoch": 1.1, "grad_norm": 0.1813183524341541, "learning_rate": 8.140224185701097e-05, "loss": 0.4186, "step": 1880 }, { "epoch": 1.1, "grad_norm": 0.18483836078982704, "learning_rate": 8.130988670079068e-05, "loss": 0.3645, "step": 1881 }, { "epoch": 1.1, "grad_norm": 0.18664080177947134, "learning_rate": 8.121754806033772e-05, "loss": 0.4221, "step": 1882 }, { "epoch": 1.1, "grad_norm": 0.21118292992092838, "learning_rate": 8.112522601724844e-05, "loss": 0.3988, "step": 1883 }, { "epoch": 1.1, "grad_norm": 0.21933086442133368, "learning_rate": 8.103292065310431e-05, "loss": 0.4194, "step": 1884 }, { "epoch": 1.11, "grad_norm": 0.18369374982890216, "learning_rate": 8.094063204947227e-05, "loss": 0.4313, "step": 1885 }, { "epoch": 1.11, "grad_norm": 0.17871299903509055, "learning_rate": 8.084836028790438e-05, "loss": 0.3579, "step": 1886 }, { "epoch": 1.11, "grad_norm": 0.21040972807802708, "learning_rate": 8.075610544993777e-05, "loss": 0.4453, "step": 1887 }, { "epoch": 1.11, "grad_norm": 0.20196209589838468, "learning_rate": 8.066386761709467e-05, "loss": 0.4136, "step": 1888 }, { "epoch": 1.11, "grad_norm": 0.2012185214218647, "learning_rate": 8.057164687088228e-05, "loss": 0.4185, "step": 1889 }, { "epoch": 1.11, "grad_norm": 0.1930053093599451, "learning_rate": 8.04794432927927e-05, "loss": 0.4088, "step": 1890 }, { "epoch": 1.11, "grad_norm": 0.18229158780923613, "learning_rate": 8.038725696430281e-05, "loss": 0.4119, "step": 1891 }, { "epoch": 1.11, "grad_norm": 0.23378468706836802, "learning_rate": 8.029508796687432e-05, "loss": 0.4211, "step": 1892 }, { "epoch": 1.11, "grad_norm": 0.2222541226430703, "learning_rate": 8.020293638195361e-05, "loss": 0.4337, "step": 1893 }, { "epoch": 1.11, "grad_norm": 0.18043439554449295, "learning_rate": 8.011080229097159e-05, "loss": 0.3612, "step": 1894 }, { "epoch": 1.11, "grad_norm": 0.18301487658509413, "learning_rate": 8.00186857753438e-05, "loss": 0.369, "step": 1895 }, { "epoch": 1.11, "grad_norm": 0.16630129703748364, "learning_rate": 7.992658691647027e-05, "loss": 0.3718, "step": 1896 }, { "epoch": 1.11, "grad_norm": 0.19163500305469702, "learning_rate": 7.98345057957353e-05, "loss": 0.4357, "step": 1897 }, { "epoch": 1.11, "grad_norm": 0.1960461834615872, "learning_rate": 7.974244249450767e-05, "loss": 0.418, "step": 1898 }, { "epoch": 1.11, "grad_norm": 0.19224624824324751, "learning_rate": 7.965039709414032e-05, "loss": 0.3982, "step": 1899 }, { "epoch": 1.11, "grad_norm": 0.18840740313567736, "learning_rate": 7.955836967597038e-05, "loss": 0.4095, "step": 1900 }, { "epoch": 1.11, "grad_norm": 0.1844305249109545, "learning_rate": 7.946636032131912e-05, "loss": 0.4772, "step": 1901 }, { "epoch": 1.12, "grad_norm": 0.2009709634690373, "learning_rate": 7.937436911149184e-05, "loss": 0.4181, "step": 1902 }, { "epoch": 1.12, "grad_norm": 0.2020297014858179, "learning_rate": 7.928239612777775e-05, "loss": 0.4264, "step": 1903 }, { "epoch": 1.12, "grad_norm": 0.1895774812668892, "learning_rate": 7.919044145145e-05, "loss": 0.4022, "step": 1904 }, { "epoch": 1.12, "grad_norm": 0.18240865004600954, "learning_rate": 7.909850516376563e-05, "loss": 0.3806, "step": 1905 }, { "epoch": 1.12, "grad_norm": 0.1681279511498143, "learning_rate": 7.900658734596536e-05, "loss": 0.377, "step": 1906 }, { "epoch": 1.12, "grad_norm": 0.20398653582049914, "learning_rate": 7.891468807927351e-05, "loss": 0.4547, "step": 1907 }, { "epoch": 1.12, "grad_norm": 0.17695818800041072, "learning_rate": 7.882280744489815e-05, "loss": 0.404, "step": 1908 }, { "epoch": 1.12, "grad_norm": 0.1888069872727214, "learning_rate": 7.873094552403083e-05, "loss": 0.4014, "step": 1909 }, { "epoch": 1.12, "grad_norm": 0.19794289849692784, "learning_rate": 7.863910239784653e-05, "loss": 0.378, "step": 1910 }, { "epoch": 1.12, "grad_norm": 0.2563098214130778, "learning_rate": 7.854727814750366e-05, "loss": 0.4226, "step": 1911 }, { "epoch": 1.12, "grad_norm": 0.17518721676678928, "learning_rate": 7.845547285414399e-05, "loss": 0.3878, "step": 1912 }, { "epoch": 1.12, "grad_norm": 0.18134286816882556, "learning_rate": 7.83636865988924e-05, "loss": 0.4501, "step": 1913 }, { "epoch": 1.12, "grad_norm": 0.19364287166803107, "learning_rate": 7.827191946285709e-05, "loss": 0.3813, "step": 1914 }, { "epoch": 1.12, "grad_norm": 0.1889713570534168, "learning_rate": 7.818017152712933e-05, "loss": 0.3485, "step": 1915 }, { "epoch": 1.12, "grad_norm": 0.1886184628106324, "learning_rate": 7.808844287278336e-05, "loss": 0.3974, "step": 1916 }, { "epoch": 1.12, "grad_norm": 0.18047433681543168, "learning_rate": 7.799673358087643e-05, "loss": 0.4264, "step": 1917 }, { "epoch": 1.12, "grad_norm": 0.17959650439172858, "learning_rate": 7.790504373244866e-05, "loss": 0.3832, "step": 1918 }, { "epoch": 1.13, "grad_norm": 0.1913833511955245, "learning_rate": 7.78133734085231e-05, "loss": 0.4556, "step": 1919 }, { "epoch": 1.13, "grad_norm": 0.1826764203359158, "learning_rate": 7.772172269010535e-05, "loss": 0.4044, "step": 1920 }, { "epoch": 1.13, "grad_norm": 0.17986155200134601, "learning_rate": 7.763009165818382e-05, "loss": 0.3745, "step": 1921 }, { "epoch": 1.13, "grad_norm": 0.18959379016802552, "learning_rate": 7.75384803937295e-05, "loss": 0.4505, "step": 1922 }, { "epoch": 1.13, "grad_norm": 0.17214698727930164, "learning_rate": 7.74468889776959e-05, "loss": 0.3736, "step": 1923 }, { "epoch": 1.13, "grad_norm": 0.18385963141889453, "learning_rate": 7.735531749101898e-05, "loss": 0.4265, "step": 1924 }, { "epoch": 1.13, "grad_norm": 0.17449413736303118, "learning_rate": 7.726376601461716e-05, "loss": 0.387, "step": 1925 }, { "epoch": 1.13, "grad_norm": 0.21654659378768404, "learning_rate": 7.7172234629391e-05, "loss": 0.4454, "step": 1926 }, { "epoch": 1.13, "grad_norm": 0.18185744997632827, "learning_rate": 7.708072341622352e-05, "loss": 0.4193, "step": 1927 }, { "epoch": 1.13, "grad_norm": 0.17732744182694674, "learning_rate": 7.698923245597986e-05, "loss": 0.3979, "step": 1928 }, { "epoch": 1.13, "grad_norm": 0.1637787721289065, "learning_rate": 7.68977618295071e-05, "loss": 0.3519, "step": 1929 }, { "epoch": 1.13, "grad_norm": 0.17846544793538227, "learning_rate": 7.680631161763457e-05, "loss": 0.3867, "step": 1930 }, { "epoch": 1.13, "grad_norm": 0.17240474406241782, "learning_rate": 7.671488190117341e-05, "loss": 0.4343, "step": 1931 }, { "epoch": 1.13, "grad_norm": 0.19469226587363003, "learning_rate": 7.662347276091677e-05, "loss": 0.3906, "step": 1932 }, { "epoch": 1.13, "grad_norm": 0.16319658498994377, "learning_rate": 7.653208427763949e-05, "loss": 0.3847, "step": 1933 }, { "epoch": 1.13, "grad_norm": 0.18439605456499675, "learning_rate": 7.644071653209826e-05, "loss": 0.3699, "step": 1934 }, { "epoch": 1.13, "grad_norm": 0.19104357865644897, "learning_rate": 7.63493696050314e-05, "loss": 0.3871, "step": 1935 }, { "epoch": 1.14, "grad_norm": 0.16163196012369285, "learning_rate": 7.625804357715882e-05, "loss": 0.3428, "step": 1936 }, { "epoch": 1.14, "grad_norm": 0.1747508317386647, "learning_rate": 7.616673852918198e-05, "loss": 0.4059, "step": 1937 }, { "epoch": 1.14, "grad_norm": 0.18121733774112123, "learning_rate": 7.607545454178386e-05, "loss": 0.3769, "step": 1938 }, { "epoch": 1.14, "grad_norm": 0.18241831118169877, "learning_rate": 7.598419169562867e-05, "loss": 0.3982, "step": 1939 }, { "epoch": 1.14, "grad_norm": 0.17679256583275185, "learning_rate": 7.589295007136206e-05, "loss": 0.3993, "step": 1940 }, { "epoch": 1.14, "grad_norm": 0.18477564112117775, "learning_rate": 7.580172974961101e-05, "loss": 0.3957, "step": 1941 }, { "epoch": 1.14, "grad_norm": 0.23283220926103426, "learning_rate": 7.571053081098346e-05, "loss": 0.4147, "step": 1942 }, { "epoch": 1.14, "grad_norm": 0.1659718388147332, "learning_rate": 7.561935333606858e-05, "loss": 0.3624, "step": 1943 }, { "epoch": 1.14, "grad_norm": 0.18206208296809556, "learning_rate": 7.552819740543661e-05, "loss": 0.4359, "step": 1944 }, { "epoch": 1.14, "grad_norm": 0.17940915446893982, "learning_rate": 7.543706309963868e-05, "loss": 0.4134, "step": 1945 }, { "epoch": 1.14, "grad_norm": 0.1702703539861626, "learning_rate": 7.534595049920679e-05, "loss": 0.3939, "step": 1946 }, { "epoch": 1.14, "grad_norm": 0.1689150777050423, "learning_rate": 7.525485968465384e-05, "loss": 0.3816, "step": 1947 }, { "epoch": 1.14, "grad_norm": 0.15897856127120902, "learning_rate": 7.516379073647346e-05, "loss": 0.377, "step": 1948 }, { "epoch": 1.14, "grad_norm": 0.17742771441407118, "learning_rate": 7.50727437351399e-05, "loss": 0.3619, "step": 1949 }, { "epoch": 1.14, "grad_norm": 0.1816340107538647, "learning_rate": 7.498171876110805e-05, "loss": 0.4024, "step": 1950 }, { "epoch": 1.14, "grad_norm": 0.19659089087972145, "learning_rate": 7.489071589481342e-05, "loss": 0.3878, "step": 1951 }, { "epoch": 1.15, "grad_norm": 0.18428595373407372, "learning_rate": 7.479973521667179e-05, "loss": 0.4133, "step": 1952 }, { "epoch": 1.15, "grad_norm": 0.20632609287533257, "learning_rate": 7.470877680707951e-05, "loss": 0.4194, "step": 1953 }, { "epoch": 1.15, "grad_norm": 0.17669764278155453, "learning_rate": 7.461784074641318e-05, "loss": 0.4146, "step": 1954 }, { "epoch": 1.15, "grad_norm": 0.177643428472356, "learning_rate": 7.452692711502964e-05, "loss": 0.3895, "step": 1955 }, { "epoch": 1.15, "grad_norm": 0.19261073679736163, "learning_rate": 7.443603599326596e-05, "loss": 0.4313, "step": 1956 }, { "epoch": 1.15, "grad_norm": 0.18331838163686537, "learning_rate": 7.43451674614393e-05, "loss": 0.4061, "step": 1957 }, { "epoch": 1.15, "grad_norm": 0.18087488767406523, "learning_rate": 7.42543215998468e-05, "loss": 0.4041, "step": 1958 }, { "epoch": 1.15, "grad_norm": 0.18411365462351864, "learning_rate": 7.416349848876562e-05, "loss": 0.3838, "step": 1959 }, { "epoch": 1.15, "grad_norm": 0.16545729365630413, "learning_rate": 7.407269820845286e-05, "loss": 0.398, "step": 1960 }, { "epoch": 1.15, "grad_norm": 0.18790700003247798, "learning_rate": 7.398192083914541e-05, "loss": 0.4313, "step": 1961 }, { "epoch": 1.15, "grad_norm": 0.17796337044732197, "learning_rate": 7.389116646105977e-05, "loss": 0.3651, "step": 1962 }, { "epoch": 1.15, "grad_norm": 0.17905613829190345, "learning_rate": 7.380043515439237e-05, "loss": 0.3664, "step": 1963 }, { "epoch": 1.15, "grad_norm": 0.18849597901714205, "learning_rate": 7.370972699931915e-05, "loss": 0.3926, "step": 1964 }, { "epoch": 1.15, "grad_norm": 0.1812978236307781, "learning_rate": 7.361904207599551e-05, "loss": 0.4053, "step": 1965 }, { "epoch": 1.15, "grad_norm": 0.18651056741943095, "learning_rate": 7.352838046455639e-05, "loss": 0.3985, "step": 1966 }, { "epoch": 1.15, "grad_norm": 0.1797159480659541, "learning_rate": 7.34377422451162e-05, "loss": 0.3922, "step": 1967 }, { "epoch": 1.15, "grad_norm": 0.19978615888480009, "learning_rate": 7.334712749776853e-05, "loss": 0.4348, "step": 1968 }, { "epoch": 1.16, "grad_norm": 0.17603187423247865, "learning_rate": 7.325653630258633e-05, "loss": 0.3818, "step": 1969 }, { "epoch": 1.16, "grad_norm": 0.18082094994914052, "learning_rate": 7.316596873962177e-05, "loss": 0.4009, "step": 1970 }, { "epoch": 1.16, "grad_norm": 0.1808224594125852, "learning_rate": 7.3075424888906e-05, "loss": 0.3969, "step": 1971 }, { "epoch": 1.16, "grad_norm": 0.19116767160142442, "learning_rate": 7.298490483044935e-05, "loss": 0.3873, "step": 1972 }, { "epoch": 1.16, "grad_norm": 0.17641305911358995, "learning_rate": 7.28944086442411e-05, "loss": 0.4288, "step": 1973 }, { "epoch": 1.16, "grad_norm": 0.17262296279132974, "learning_rate": 7.280393641024932e-05, "loss": 0.3738, "step": 1974 }, { "epoch": 1.16, "grad_norm": 0.17516839482576677, "learning_rate": 7.271348820842106e-05, "loss": 0.3659, "step": 1975 }, { "epoch": 1.16, "grad_norm": 0.17338489664694423, "learning_rate": 7.262306411868207e-05, "loss": 0.3807, "step": 1976 }, { "epoch": 1.16, "grad_norm": 0.1888159247358559, "learning_rate": 7.253266422093683e-05, "loss": 0.3881, "step": 1977 }, { "epoch": 1.16, "grad_norm": 0.17195219592344435, "learning_rate": 7.244228859506836e-05, "loss": 0.3644, "step": 1978 }, { "epoch": 1.16, "grad_norm": 0.18121973200861058, "learning_rate": 7.23519373209383e-05, "loss": 0.422, "step": 1979 }, { "epoch": 1.16, "grad_norm": 0.203387565315694, "learning_rate": 7.226161047838679e-05, "loss": 0.4226, "step": 1980 }, { "epoch": 1.16, "grad_norm": 0.19176102596209946, "learning_rate": 7.21713081472323e-05, "loss": 0.4114, "step": 1981 }, { "epoch": 1.16, "grad_norm": 0.17366451038903769, "learning_rate": 7.208103040727172e-05, "loss": 0.4188, "step": 1982 }, { "epoch": 1.16, "grad_norm": 0.19204682803231887, "learning_rate": 7.199077733828019e-05, "loss": 0.422, "step": 1983 }, { "epoch": 1.16, "grad_norm": 0.19235955451920322, "learning_rate": 7.190054902001097e-05, "loss": 0.435, "step": 1984 }, { "epoch": 1.16, "grad_norm": 0.1910525933944164, "learning_rate": 7.181034553219554e-05, "loss": 0.3995, "step": 1985 }, { "epoch": 1.17, "grad_norm": 0.18707018048340046, "learning_rate": 7.172016695454349e-05, "loss": 0.4059, "step": 1986 }, { "epoch": 1.17, "grad_norm": 0.1757299951012914, "learning_rate": 7.16300133667422e-05, "loss": 0.3957, "step": 1987 }, { "epoch": 1.17, "grad_norm": 0.18577391313825203, "learning_rate": 7.153988484845715e-05, "loss": 0.4407, "step": 1988 }, { "epoch": 1.17, "grad_norm": 0.1923290224001986, "learning_rate": 7.144978147933162e-05, "loss": 0.407, "step": 1989 }, { "epoch": 1.17, "grad_norm": 0.18575659520168106, "learning_rate": 7.135970333898661e-05, "loss": 0.3765, "step": 1990 }, { "epoch": 1.17, "grad_norm": 0.18086735161720963, "learning_rate": 7.12696505070209e-05, "loss": 0.3808, "step": 1991 }, { "epoch": 1.17, "grad_norm": 0.1854599118032104, "learning_rate": 7.117962306301084e-05, "loss": 0.424, "step": 1992 }, { "epoch": 1.17, "grad_norm": 0.17537407633031674, "learning_rate": 7.108962108651044e-05, "loss": 0.3759, "step": 1993 }, { "epoch": 1.17, "grad_norm": 0.18981483483536024, "learning_rate": 7.099964465705106e-05, "loss": 0.4125, "step": 1994 }, { "epoch": 1.17, "grad_norm": 0.18413193277267284, "learning_rate": 7.090969385414163e-05, "loss": 0.4338, "step": 1995 }, { "epoch": 1.17, "grad_norm": 0.18158751633879683, "learning_rate": 7.081976875726842e-05, "loss": 0.4043, "step": 1996 }, { "epoch": 1.17, "grad_norm": 0.1800954346143054, "learning_rate": 7.072986944589479e-05, "loss": 0.4144, "step": 1997 }, { "epoch": 1.17, "grad_norm": 0.18894150742733065, "learning_rate": 7.063999599946159e-05, "loss": 0.4117, "step": 1998 }, { "epoch": 1.17, "grad_norm": 0.20030988236022462, "learning_rate": 7.055014849738664e-05, "loss": 0.4534, "step": 1999 }, { "epoch": 1.17, "grad_norm": 0.17896542445794142, "learning_rate": 7.046032701906486e-05, "loss": 0.3843, "step": 2000 }, { "epoch": 1.17, "grad_norm": 0.18090923195179542, "learning_rate": 7.037053164386824e-05, "loss": 0.3942, "step": 2001 }, { "epoch": 1.17, "grad_norm": 0.20066012152663207, "learning_rate": 7.028076245114566e-05, "loss": 0.4578, "step": 2002 }, { "epoch": 1.18, "grad_norm": 0.1778928677232292, "learning_rate": 7.01910195202228e-05, "loss": 0.3919, "step": 2003 }, { "epoch": 1.18, "grad_norm": 0.17177894275792283, "learning_rate": 7.010130293040225e-05, "loss": 0.4107, "step": 2004 }, { "epoch": 1.18, "grad_norm": 0.19618483219310756, "learning_rate": 7.001161276096326e-05, "loss": 0.4207, "step": 2005 }, { "epoch": 1.18, "grad_norm": 0.19677381672104852, "learning_rate": 6.992194909116178e-05, "loss": 0.4249, "step": 2006 }, { "epoch": 1.18, "grad_norm": 0.1694684202946766, "learning_rate": 6.98323120002302e-05, "loss": 0.3868, "step": 2007 }, { "epoch": 1.18, "grad_norm": 0.17590822070156392, "learning_rate": 6.97427015673776e-05, "loss": 0.4102, "step": 2008 }, { "epoch": 1.18, "grad_norm": 0.16951737219148982, "learning_rate": 6.965311787178946e-05, "loss": 0.3726, "step": 2009 }, { "epoch": 1.18, "grad_norm": 0.19106620473149435, "learning_rate": 6.956356099262753e-05, "loss": 0.4069, "step": 2010 }, { "epoch": 1.18, "grad_norm": 0.1797551006905545, "learning_rate": 6.947403100902999e-05, "loss": 0.3935, "step": 2011 }, { "epoch": 1.18, "grad_norm": 0.19000730900286916, "learning_rate": 6.938452800011119e-05, "loss": 0.4702, "step": 2012 }, { "epoch": 1.18, "grad_norm": 0.17588777897038269, "learning_rate": 6.929505204496162e-05, "loss": 0.3983, "step": 2013 }, { "epoch": 1.18, "grad_norm": 0.17399512960046365, "learning_rate": 6.920560322264795e-05, "loss": 0.3814, "step": 2014 }, { "epoch": 1.18, "grad_norm": 0.18764249310755376, "learning_rate": 6.911618161221282e-05, "loss": 0.4644, "step": 2015 }, { "epoch": 1.18, "grad_norm": 0.1955184775642465, "learning_rate": 6.902678729267478e-05, "loss": 0.402, "step": 2016 }, { "epoch": 1.18, "grad_norm": 0.18693802731785295, "learning_rate": 6.893742034302835e-05, "loss": 0.4156, "step": 2017 }, { "epoch": 1.18, "grad_norm": 0.18911478909963586, "learning_rate": 6.88480808422438e-05, "loss": 0.4043, "step": 2018 }, { "epoch": 1.19, "grad_norm": 0.17496610630242038, "learning_rate": 6.875876886926723e-05, "loss": 0.3687, "step": 2019 }, { "epoch": 1.19, "grad_norm": 0.20364715714956416, "learning_rate": 6.866948450302025e-05, "loss": 0.4489, "step": 2020 }, { "epoch": 1.19, "grad_norm": 0.1764577158807296, "learning_rate": 6.858022782240024e-05, "loss": 0.4058, "step": 2021 }, { "epoch": 1.19, "grad_norm": 0.17749840229336564, "learning_rate": 6.849099890628008e-05, "loss": 0.3684, "step": 2022 }, { "epoch": 1.19, "grad_norm": 0.2033262587464068, "learning_rate": 6.840179783350802e-05, "loss": 0.4333, "step": 2023 }, { "epoch": 1.19, "grad_norm": 0.19801907204469715, "learning_rate": 6.831262468290782e-05, "loss": 0.3815, "step": 2024 }, { "epoch": 1.19, "grad_norm": 0.17479276111326633, "learning_rate": 6.822347953327852e-05, "loss": 0.3767, "step": 2025 }, { "epoch": 1.19, "grad_norm": 0.18329259931818292, "learning_rate": 6.813436246339439e-05, "loss": 0.4289, "step": 2026 }, { "epoch": 1.19, "grad_norm": 0.18093663471154162, "learning_rate": 6.804527355200496e-05, "loss": 0.3715, "step": 2027 }, { "epoch": 1.19, "grad_norm": 0.1792934447291919, "learning_rate": 6.795621287783482e-05, "loss": 0.3975, "step": 2028 }, { "epoch": 1.19, "grad_norm": 0.18691613088073863, "learning_rate": 6.786718051958357e-05, "loss": 0.3926, "step": 2029 }, { "epoch": 1.19, "grad_norm": 0.1922768604680592, "learning_rate": 6.777817655592587e-05, "loss": 0.4351, "step": 2030 }, { "epoch": 1.19, "grad_norm": 0.1783333495829052, "learning_rate": 6.768920106551128e-05, "loss": 0.3987, "step": 2031 }, { "epoch": 1.19, "grad_norm": 0.17612582083352998, "learning_rate": 6.760025412696419e-05, "loss": 0.3532, "step": 2032 }, { "epoch": 1.19, "grad_norm": 0.18031897593024745, "learning_rate": 6.751133581888367e-05, "loss": 0.4045, "step": 2033 }, { "epoch": 1.19, "grad_norm": 0.1943194093034378, "learning_rate": 6.742244621984362e-05, "loss": 0.4307, "step": 2034 }, { "epoch": 1.19, "grad_norm": 0.17761984368104813, "learning_rate": 6.733358540839253e-05, "loss": 0.4051, "step": 2035 }, { "epoch": 1.2, "grad_norm": 0.19053785851354668, "learning_rate": 6.724475346305338e-05, "loss": 0.403, "step": 2036 }, { "epoch": 1.2, "grad_norm": 0.1819068527231751, "learning_rate": 6.715595046232374e-05, "loss": 0.4102, "step": 2037 }, { "epoch": 1.2, "grad_norm": 0.17372665786419286, "learning_rate": 6.70671764846756e-05, "loss": 0.387, "step": 2038 }, { "epoch": 1.2, "grad_norm": 0.17727979798044205, "learning_rate": 6.697843160855518e-05, "loss": 0.3994, "step": 2039 }, { "epoch": 1.2, "grad_norm": 0.1882371246792817, "learning_rate": 6.688971591238313e-05, "loss": 0.4211, "step": 2040 }, { "epoch": 1.2, "grad_norm": 0.1804413873803804, "learning_rate": 6.68010294745543e-05, "loss": 0.4154, "step": 2041 }, { "epoch": 1.2, "grad_norm": 0.18656805553080838, "learning_rate": 6.671237237343751e-05, "loss": 0.4059, "step": 2042 }, { "epoch": 1.2, "grad_norm": 0.2030909495200903, "learning_rate": 6.66237446873759e-05, "loss": 0.389, "step": 2043 }, { "epoch": 1.2, "grad_norm": 0.17679937581787675, "learning_rate": 6.653514649468644e-05, "loss": 0.3785, "step": 2044 }, { "epoch": 1.2, "grad_norm": 0.21855116652362658, "learning_rate": 6.644657787366013e-05, "loss": 0.4254, "step": 2045 }, { "epoch": 1.2, "grad_norm": 0.18903179904922357, "learning_rate": 6.635803890256181e-05, "loss": 0.3625, "step": 2046 }, { "epoch": 1.2, "grad_norm": 0.17348919788827918, "learning_rate": 6.626952965963012e-05, "loss": 0.3667, "step": 2047 }, { "epoch": 1.2, "grad_norm": 0.17361251854492593, "learning_rate": 6.618105022307746e-05, "loss": 0.3579, "step": 2048 }, { "epoch": 1.2, "grad_norm": 0.16596905289453603, "learning_rate": 6.609260067108979e-05, "loss": 0.3831, "step": 2049 }, { "epoch": 1.2, "grad_norm": 0.1821772550745199, "learning_rate": 6.600418108182678e-05, "loss": 0.3989, "step": 2050 }, { "epoch": 1.2, "grad_norm": 0.17611934697238005, "learning_rate": 6.59157915334216e-05, "loss": 0.3883, "step": 2051 }, { "epoch": 1.2, "grad_norm": 0.1920197326417183, "learning_rate": 6.582743210398079e-05, "loss": 0.3746, "step": 2052 }, { "epoch": 1.21, "grad_norm": 0.17930833321205353, "learning_rate": 6.573910287158437e-05, "loss": 0.3933, "step": 2053 }, { "epoch": 1.21, "grad_norm": 0.17538040457127826, "learning_rate": 6.565080391428568e-05, "loss": 0.3556, "step": 2054 }, { "epoch": 1.21, "grad_norm": 0.18958975621302934, "learning_rate": 6.556253531011119e-05, "loss": 0.4422, "step": 2055 }, { "epoch": 1.21, "grad_norm": 0.17115102273167726, "learning_rate": 6.547429713706066e-05, "loss": 0.3716, "step": 2056 }, { "epoch": 1.21, "grad_norm": 0.17770240634247175, "learning_rate": 6.538608947310694e-05, "loss": 0.3951, "step": 2057 }, { "epoch": 1.21, "grad_norm": 0.1789512078302291, "learning_rate": 6.529791239619589e-05, "loss": 0.3891, "step": 2058 }, { "epoch": 1.21, "grad_norm": 0.18976068809608343, "learning_rate": 6.520976598424637e-05, "loss": 0.4145, "step": 2059 }, { "epoch": 1.21, "grad_norm": 0.18256866392900348, "learning_rate": 6.512165031515012e-05, "loss": 0.3956, "step": 2060 }, { "epoch": 1.21, "grad_norm": 0.1985727795822358, "learning_rate": 6.503356546677173e-05, "loss": 0.4006, "step": 2061 }, { "epoch": 1.21, "grad_norm": 0.1939392444337732, "learning_rate": 6.494551151694854e-05, "loss": 0.4387, "step": 2062 }, { "epoch": 1.21, "grad_norm": 0.1728571695359017, "learning_rate": 6.485748854349061e-05, "loss": 0.381, "step": 2063 }, { "epoch": 1.21, "grad_norm": 0.17795690326553984, "learning_rate": 6.476949662418062e-05, "loss": 0.4181, "step": 2064 }, { "epoch": 1.21, "grad_norm": 0.17801662831363646, "learning_rate": 6.468153583677377e-05, "loss": 0.389, "step": 2065 }, { "epoch": 1.21, "grad_norm": 0.189558829045939, "learning_rate": 6.459360625899775e-05, "loss": 0.3926, "step": 2066 }, { "epoch": 1.21, "grad_norm": 0.16815142536338085, "learning_rate": 6.450570796855282e-05, "loss": 0.3688, "step": 2067 }, { "epoch": 1.21, "grad_norm": 0.18054471114398113, "learning_rate": 6.441784104311135e-05, "loss": 0.4176, "step": 2068 }, { "epoch": 1.21, "grad_norm": 0.1900667998193899, "learning_rate": 6.433000556031816e-05, "loss": 0.4222, "step": 2069 }, { "epoch": 1.22, "grad_norm": 0.19866351971870957, "learning_rate": 6.424220159779029e-05, "loss": 0.4511, "step": 2070 }, { "epoch": 1.22, "grad_norm": 0.18085548000788157, "learning_rate": 6.415442923311679e-05, "loss": 0.3807, "step": 2071 }, { "epoch": 1.22, "grad_norm": 0.20079665391358242, "learning_rate": 6.406668854385895e-05, "loss": 0.4718, "step": 2072 }, { "epoch": 1.22, "grad_norm": 0.15874256335306394, "learning_rate": 6.397897960755002e-05, "loss": 0.3342, "step": 2073 }, { "epoch": 1.22, "grad_norm": 0.17134778394449662, "learning_rate": 6.389130250169505e-05, "loss": 0.3913, "step": 2074 }, { "epoch": 1.22, "grad_norm": 0.18450344260328666, "learning_rate": 6.38036573037712e-05, "loss": 0.4319, "step": 2075 }, { "epoch": 1.22, "grad_norm": 0.1820653336275944, "learning_rate": 6.371604409122725e-05, "loss": 0.4174, "step": 2076 }, { "epoch": 1.22, "grad_norm": 0.19171083276523215, "learning_rate": 6.362846294148383e-05, "loss": 0.4122, "step": 2077 }, { "epoch": 1.22, "grad_norm": 0.18542133466551414, "learning_rate": 6.354091393193315e-05, "loss": 0.4434, "step": 2078 }, { "epoch": 1.22, "grad_norm": 0.17992373370079723, "learning_rate": 6.345339713993905e-05, "loss": 0.3759, "step": 2079 }, { "epoch": 1.22, "grad_norm": 0.18560798648589613, "learning_rate": 6.336591264283697e-05, "loss": 0.4096, "step": 2080 }, { "epoch": 1.22, "grad_norm": 0.18633482450613692, "learning_rate": 6.327846051793367e-05, "loss": 0.4251, "step": 2081 }, { "epoch": 1.22, "grad_norm": 0.1836619900352879, "learning_rate": 6.319104084250742e-05, "loss": 0.3925, "step": 2082 }, { "epoch": 1.22, "grad_norm": 0.20091797196508251, "learning_rate": 6.310365369380779e-05, "loss": 0.404, "step": 2083 }, { "epoch": 1.22, "grad_norm": 0.1891532144533284, "learning_rate": 6.301629914905559e-05, "loss": 0.4185, "step": 2084 }, { "epoch": 1.22, "grad_norm": 0.18153942514029875, "learning_rate": 6.292897728544282e-05, "loss": 0.393, "step": 2085 }, { "epoch": 1.23, "grad_norm": 0.18372339720739236, "learning_rate": 6.284168818013263e-05, "loss": 0.4071, "step": 2086 }, { "epoch": 1.23, "grad_norm": 0.2011783046120318, "learning_rate": 6.275443191025916e-05, "loss": 0.4618, "step": 2087 }, { "epoch": 1.23, "grad_norm": 0.18169267533871733, "learning_rate": 6.266720855292756e-05, "loss": 0.4191, "step": 2088 }, { "epoch": 1.23, "grad_norm": 0.19727917370411416, "learning_rate": 6.258001818521397e-05, "loss": 0.4121, "step": 2089 }, { "epoch": 1.23, "grad_norm": 0.18067270195439983, "learning_rate": 6.249286088416534e-05, "loss": 0.4069, "step": 2090 }, { "epoch": 1.23, "grad_norm": 0.1806495150301102, "learning_rate": 6.240573672679929e-05, "loss": 0.4105, "step": 2091 }, { "epoch": 1.23, "grad_norm": 0.1810490781739672, "learning_rate": 6.231864579010426e-05, "loss": 0.4065, "step": 2092 }, { "epoch": 1.23, "grad_norm": 0.1881164857873241, "learning_rate": 6.22315881510394e-05, "loss": 0.3863, "step": 2093 }, { "epoch": 1.23, "grad_norm": 0.21083260811753377, "learning_rate": 6.214456388653423e-05, "loss": 0.4593, "step": 2094 }, { "epoch": 1.23, "grad_norm": 0.1863138604775313, "learning_rate": 6.205757307348898e-05, "loss": 0.3912, "step": 2095 }, { "epoch": 1.23, "grad_norm": 0.18959697990144703, "learning_rate": 6.197061578877424e-05, "loss": 0.4379, "step": 2096 }, { "epoch": 1.23, "grad_norm": 0.17591208224223437, "learning_rate": 6.18836921092309e-05, "loss": 0.3912, "step": 2097 }, { "epoch": 1.23, "grad_norm": 0.1673685779867354, "learning_rate": 6.179680211167031e-05, "loss": 0.367, "step": 2098 }, { "epoch": 1.23, "grad_norm": 0.18369272051157975, "learning_rate": 6.170994587287398e-05, "loss": 0.4273, "step": 2099 }, { "epoch": 1.23, "grad_norm": 0.18265310430390075, "learning_rate": 6.162312346959348e-05, "loss": 0.3959, "step": 2100 }, { "epoch": 1.23, "grad_norm": 0.19340840466526393, "learning_rate": 6.153633497855065e-05, "loss": 0.436, "step": 2101 }, { "epoch": 1.23, "grad_norm": 0.1887901294443666, "learning_rate": 6.144958047643732e-05, "loss": 0.4083, "step": 2102 }, { "epoch": 1.24, "grad_norm": 0.19466524748526068, "learning_rate": 6.136286003991518e-05, "loss": 0.4002, "step": 2103 }, { "epoch": 1.24, "grad_norm": 0.18396890947674227, "learning_rate": 6.127617374561597e-05, "loss": 0.4071, "step": 2104 }, { "epoch": 1.24, "grad_norm": 0.18803256333290438, "learning_rate": 6.118952167014115e-05, "loss": 0.4463, "step": 2105 }, { "epoch": 1.24, "grad_norm": 0.19311540609414365, "learning_rate": 6.110290389006204e-05, "loss": 0.41, "step": 2106 }, { "epoch": 1.24, "grad_norm": 0.17270612339310035, "learning_rate": 6.10163204819195e-05, "loss": 0.4025, "step": 2107 }, { "epoch": 1.24, "grad_norm": 0.18772333666422653, "learning_rate": 6.0929771522224165e-05, "loss": 0.3697, "step": 2108 }, { "epoch": 1.24, "grad_norm": 0.18606081415307524, "learning_rate": 6.0843257087456196e-05, "loss": 0.3716, "step": 2109 }, { "epoch": 1.24, "grad_norm": 0.19080973087182862, "learning_rate": 6.075677725406516e-05, "loss": 0.3923, "step": 2110 }, { "epoch": 1.24, "grad_norm": 0.1775067921640696, "learning_rate": 6.067033209847015e-05, "loss": 0.4098, "step": 2111 }, { "epoch": 1.24, "grad_norm": 0.18103399827757333, "learning_rate": 6.058392169705962e-05, "loss": 0.3799, "step": 2112 }, { "epoch": 1.24, "grad_norm": 0.17852952705890326, "learning_rate": 6.0497546126191206e-05, "loss": 0.3839, "step": 2113 }, { "epoch": 1.24, "grad_norm": 0.18179509972863547, "learning_rate": 6.041120546219183e-05, "loss": 0.3728, "step": 2114 }, { "epoch": 1.24, "grad_norm": 0.19484655001885112, "learning_rate": 6.0324899781357624e-05, "loss": 0.4212, "step": 2115 }, { "epoch": 1.24, "grad_norm": 0.1941471719257855, "learning_rate": 6.023862915995371e-05, "loss": 0.4265, "step": 2116 }, { "epoch": 1.24, "grad_norm": 0.1850316045854547, "learning_rate": 6.0152393674214256e-05, "loss": 0.4253, "step": 2117 }, { "epoch": 1.24, "grad_norm": 0.17451934633072588, "learning_rate": 6.006619340034243e-05, "loss": 0.3849, "step": 2118 }, { "epoch": 1.24, "grad_norm": 0.17437022993161974, "learning_rate": 5.998002841451027e-05, "loss": 0.3987, "step": 2119 }, { "epoch": 1.25, "grad_norm": 0.21820425933182622, "learning_rate": 5.9893898792858564e-05, "loss": 0.3954, "step": 2120 }, { "epoch": 1.25, "grad_norm": 0.1835293588254982, "learning_rate": 5.980780461149691e-05, "loss": 0.4105, "step": 2121 }, { "epoch": 1.25, "grad_norm": 0.18295428463723487, "learning_rate": 5.972174594650363e-05, "loss": 0.4099, "step": 2122 }, { "epoch": 1.25, "grad_norm": 0.1878568911364305, "learning_rate": 5.9635722873925514e-05, "loss": 0.3768, "step": 2123 }, { "epoch": 1.25, "grad_norm": 0.1961901235631556, "learning_rate": 5.954973546977806e-05, "loss": 0.4059, "step": 2124 }, { "epoch": 1.25, "grad_norm": 0.1789683805877612, "learning_rate": 5.946378381004518e-05, "loss": 0.3809, "step": 2125 }, { "epoch": 1.25, "grad_norm": 0.1785260125545566, "learning_rate": 5.937786797067917e-05, "loss": 0.3934, "step": 2126 }, { "epoch": 1.25, "grad_norm": 0.18168550230534175, "learning_rate": 5.929198802760072e-05, "loss": 0.4142, "step": 2127 }, { "epoch": 1.25, "grad_norm": 0.17826458713675644, "learning_rate": 5.920614405669882e-05, "loss": 0.3845, "step": 2128 }, { "epoch": 1.25, "grad_norm": 0.19848867507194148, "learning_rate": 5.9120336133830564e-05, "loss": 0.4147, "step": 2129 }, { "epoch": 1.25, "grad_norm": 0.19693140229221995, "learning_rate": 5.903456433482133e-05, "loss": 0.4409, "step": 2130 }, { "epoch": 1.25, "grad_norm": 0.18964809059631438, "learning_rate": 5.894882873546448e-05, "loss": 0.3773, "step": 2131 }, { "epoch": 1.25, "grad_norm": 0.19548800990162185, "learning_rate": 5.886312941152146e-05, "loss": 0.4239, "step": 2132 }, { "epoch": 1.25, "grad_norm": 0.18556517298441283, "learning_rate": 5.877746643872152e-05, "loss": 0.4206, "step": 2133 }, { "epoch": 1.25, "grad_norm": 0.196685614920556, "learning_rate": 5.8691839892761965e-05, "loss": 0.3868, "step": 2134 }, { "epoch": 1.25, "grad_norm": 0.19578399697755275, "learning_rate": 5.860624984930787e-05, "loss": 0.4237, "step": 2135 }, { "epoch": 1.25, "grad_norm": 0.1978580772450985, "learning_rate": 5.852069638399191e-05, "loss": 0.4108, "step": 2136 }, { "epoch": 1.26, "grad_norm": 0.18181163273583667, "learning_rate": 5.843517957241459e-05, "loss": 0.3805, "step": 2137 }, { "epoch": 1.26, "grad_norm": 0.17366006402637094, "learning_rate": 5.8349699490144e-05, "loss": 0.3652, "step": 2138 }, { "epoch": 1.26, "grad_norm": 0.19113273789571564, "learning_rate": 5.8264256212715726e-05, "loss": 0.423, "step": 2139 }, { "epoch": 1.26, "grad_norm": 0.17897157794176333, "learning_rate": 5.817884981563286e-05, "loss": 0.3933, "step": 2140 }, { "epoch": 1.26, "grad_norm": 0.17668933170504572, "learning_rate": 5.809348037436595e-05, "loss": 0.3922, "step": 2141 }, { "epoch": 1.26, "grad_norm": 0.17818487109986716, "learning_rate": 5.8008147964352756e-05, "loss": 0.3662, "step": 2142 }, { "epoch": 1.26, "grad_norm": 0.18207985224347573, "learning_rate": 5.792285266099844e-05, "loss": 0.407, "step": 2143 }, { "epoch": 1.26, "grad_norm": 0.17683629780008203, "learning_rate": 5.783759453967532e-05, "loss": 0.4212, "step": 2144 }, { "epoch": 1.26, "grad_norm": 0.1722722644784309, "learning_rate": 5.7752373675722884e-05, "loss": 0.3573, "step": 2145 }, { "epoch": 1.26, "grad_norm": 0.1791044681179746, "learning_rate": 5.766719014444768e-05, "loss": 0.4031, "step": 2146 }, { "epoch": 1.26, "grad_norm": 0.19337470024812153, "learning_rate": 5.758204402112326e-05, "loss": 0.3858, "step": 2147 }, { "epoch": 1.26, "grad_norm": 0.1936105726260476, "learning_rate": 5.749693538099018e-05, "loss": 0.4277, "step": 2148 }, { "epoch": 1.26, "grad_norm": 0.1784944177612045, "learning_rate": 5.741186429925574e-05, "loss": 0.3943, "step": 2149 }, { "epoch": 1.26, "grad_norm": 0.16474400059855843, "learning_rate": 5.732683085109416e-05, "loss": 0.3574, "step": 2150 }, { "epoch": 1.26, "grad_norm": 0.1747458965851442, "learning_rate": 5.7241835111646444e-05, "loss": 0.3542, "step": 2151 }, { "epoch": 1.26, "grad_norm": 0.17722508894646266, "learning_rate": 5.7156877156020106e-05, "loss": 0.383, "step": 2152 }, { "epoch": 1.27, "grad_norm": 0.18004930280023854, "learning_rate": 5.707195705928943e-05, "loss": 0.371, "step": 2153 }, { "epoch": 1.27, "grad_norm": 0.1998035437105761, "learning_rate": 5.6987074896495176e-05, "loss": 0.4052, "step": 2154 }, { "epoch": 1.27, "grad_norm": 0.2454047737061032, "learning_rate": 5.69022307426446e-05, "loss": 0.3956, "step": 2155 }, { "epoch": 1.27, "grad_norm": 0.18505776467623378, "learning_rate": 5.681742467271137e-05, "loss": 0.4188, "step": 2156 }, { "epoch": 1.27, "grad_norm": 0.1722368732605317, "learning_rate": 5.673265676163555e-05, "loss": 0.3941, "step": 2157 }, { "epoch": 1.27, "grad_norm": 0.17685811702648357, "learning_rate": 5.664792708432333e-05, "loss": 0.3567, "step": 2158 }, { "epoch": 1.27, "grad_norm": 0.179506955830838, "learning_rate": 5.6563235715647264e-05, "loss": 0.3972, "step": 2159 }, { "epoch": 1.27, "grad_norm": 0.16666951754419984, "learning_rate": 5.647858273044602e-05, "loss": 0.3469, "step": 2160 }, { "epoch": 1.27, "grad_norm": 0.18772902417071785, "learning_rate": 5.639396820352436e-05, "loss": 0.4102, "step": 2161 }, { "epoch": 1.27, "grad_norm": 0.17365394970568318, "learning_rate": 5.6309392209652924e-05, "loss": 0.3652, "step": 2162 }, { "epoch": 1.27, "grad_norm": 0.1781634158116914, "learning_rate": 5.622485482356854e-05, "loss": 0.3803, "step": 2163 }, { "epoch": 1.27, "grad_norm": 0.18323967750889245, "learning_rate": 5.614035611997378e-05, "loss": 0.4028, "step": 2164 }, { "epoch": 1.27, "grad_norm": 0.20230746144118264, "learning_rate": 5.605589617353697e-05, "loss": 0.3808, "step": 2165 }, { "epoch": 1.27, "grad_norm": 0.1805552749948731, "learning_rate": 5.597147505889233e-05, "loss": 0.4406, "step": 2166 }, { "epoch": 1.27, "grad_norm": 0.20374446705458787, "learning_rate": 5.588709285063971e-05, "loss": 0.4281, "step": 2167 }, { "epoch": 1.27, "grad_norm": 0.18591909290479935, "learning_rate": 5.580274962334451e-05, "loss": 0.4182, "step": 2168 }, { "epoch": 1.27, "grad_norm": 0.17103552679405504, "learning_rate": 5.571844545153777e-05, "loss": 0.3964, "step": 2169 }, { "epoch": 1.28, "grad_norm": 0.18684053271434015, "learning_rate": 5.5634180409716e-05, "loss": 0.4113, "step": 2170 }, { "epoch": 1.28, "grad_norm": 0.16926235511223492, "learning_rate": 5.5549954572341134e-05, "loss": 0.3709, "step": 2171 }, { "epoch": 1.28, "grad_norm": 0.1736106649601771, "learning_rate": 5.546576801384045e-05, "loss": 0.363, "step": 2172 }, { "epoch": 1.28, "grad_norm": 0.18806895104857238, "learning_rate": 5.538162080860655e-05, "loss": 0.4062, "step": 2173 }, { "epoch": 1.28, "grad_norm": 0.1844043066518322, "learning_rate": 5.529751303099717e-05, "loss": 0.3892, "step": 2174 }, { "epoch": 1.28, "grad_norm": 0.18160055542379905, "learning_rate": 5.52134447553353e-05, "loss": 0.3999, "step": 2175 }, { "epoch": 1.28, "grad_norm": 0.1956754962166279, "learning_rate": 5.512941605590898e-05, "loss": 0.4172, "step": 2176 }, { "epoch": 1.28, "grad_norm": 0.1838961563135142, "learning_rate": 5.5045427006971325e-05, "loss": 0.4147, "step": 2177 }, { "epoch": 1.28, "grad_norm": 0.19107640177369256, "learning_rate": 5.4961477682740325e-05, "loss": 0.413, "step": 2178 }, { "epoch": 1.28, "grad_norm": 0.18969577739431961, "learning_rate": 5.487756815739894e-05, "loss": 0.3999, "step": 2179 }, { "epoch": 1.28, "grad_norm": 0.1858908768432806, "learning_rate": 5.4793698505094926e-05, "loss": 0.3967, "step": 2180 }, { "epoch": 1.28, "grad_norm": 0.18020222489161605, "learning_rate": 5.4709868799940845e-05, "loss": 0.4091, "step": 2181 }, { "epoch": 1.28, "grad_norm": 0.19520907750630961, "learning_rate": 5.4626079116013906e-05, "loss": 0.4272, "step": 2182 }, { "epoch": 1.28, "grad_norm": 0.18640536176447983, "learning_rate": 5.4542329527356025e-05, "loss": 0.3604, "step": 2183 }, { "epoch": 1.28, "grad_norm": 0.19307798922222952, "learning_rate": 5.445862010797358e-05, "loss": 0.3864, "step": 2184 }, { "epoch": 1.28, "grad_norm": 0.18561105414776297, "learning_rate": 5.437495093183753e-05, "loss": 0.4381, "step": 2185 }, { "epoch": 1.28, "grad_norm": 0.16845529408994608, "learning_rate": 5.42913220728833e-05, "loss": 0.3478, "step": 2186 }, { "epoch": 1.29, "grad_norm": 0.18024110837169, "learning_rate": 5.420773360501057e-05, "loss": 0.3901, "step": 2187 }, { "epoch": 1.29, "grad_norm": 0.18049067690672227, "learning_rate": 5.412418560208343e-05, "loss": 0.4207, "step": 2188 }, { "epoch": 1.29, "grad_norm": 0.1739219539443579, "learning_rate": 5.4040678137930214e-05, "loss": 0.3659, "step": 2189 }, { "epoch": 1.29, "grad_norm": 0.18449670679648686, "learning_rate": 5.395721128634338e-05, "loss": 0.4288, "step": 2190 }, { "epoch": 1.29, "grad_norm": 0.1899315846259125, "learning_rate": 5.387378512107952e-05, "loss": 0.4338, "step": 2191 }, { "epoch": 1.29, "grad_norm": 0.1859583595169519, "learning_rate": 5.379039971585929e-05, "loss": 0.3851, "step": 2192 }, { "epoch": 1.29, "grad_norm": 0.18077813499619705, "learning_rate": 5.3707055144367336e-05, "loss": 0.4056, "step": 2193 }, { "epoch": 1.29, "grad_norm": 0.18580498148705987, "learning_rate": 5.362375148025213e-05, "loss": 0.393, "step": 2194 }, { "epoch": 1.29, "grad_norm": 0.19143544061487677, "learning_rate": 5.35404887971261e-05, "loss": 0.4284, "step": 2195 }, { "epoch": 1.29, "grad_norm": 0.17814943744573722, "learning_rate": 5.345726716856545e-05, "loss": 0.4002, "step": 2196 }, { "epoch": 1.29, "grad_norm": 0.17505964472580593, "learning_rate": 5.3374086668110034e-05, "loss": 0.3739, "step": 2197 }, { "epoch": 1.29, "grad_norm": 0.18148223115355736, "learning_rate": 5.329094736926342e-05, "loss": 0.3796, "step": 2198 }, { "epoch": 1.29, "grad_norm": 0.18553202610972083, "learning_rate": 5.320784934549277e-05, "loss": 0.3679, "step": 2199 }, { "epoch": 1.29, "grad_norm": 0.2028489781335254, "learning_rate": 5.312479267022874e-05, "loss": 0.3991, "step": 2200 }, { "epoch": 1.29, "grad_norm": 0.18274525464999591, "learning_rate": 5.304177741686549e-05, "loss": 0.3666, "step": 2201 }, { "epoch": 1.29, "grad_norm": 0.175148123213247, "learning_rate": 5.295880365876058e-05, "loss": 0.4083, "step": 2202 }, { "epoch": 1.29, "grad_norm": 0.17081671423731545, "learning_rate": 5.2875871469234786e-05, "loss": 0.3588, "step": 2203 }, { "epoch": 1.3, "grad_norm": 0.1709653542288416, "learning_rate": 5.27929809215723e-05, "loss": 0.3679, "step": 2204 }, { "epoch": 1.3, "grad_norm": 0.18417423979858946, "learning_rate": 5.271013208902045e-05, "loss": 0.4113, "step": 2205 }, { "epoch": 1.3, "grad_norm": 0.18420315430603607, "learning_rate": 5.262732504478975e-05, "loss": 0.4123, "step": 2206 }, { "epoch": 1.3, "grad_norm": 0.19549557097472783, "learning_rate": 5.254455986205362e-05, "loss": 0.4164, "step": 2207 }, { "epoch": 1.3, "grad_norm": 0.1890203514363044, "learning_rate": 5.246183661394876e-05, "loss": 0.4017, "step": 2208 }, { "epoch": 1.3, "grad_norm": 0.19289162012261554, "learning_rate": 5.237915537357463e-05, "loss": 0.4573, "step": 2209 }, { "epoch": 1.3, "grad_norm": 0.18549804668544928, "learning_rate": 5.2296516213993564e-05, "loss": 0.4038, "step": 2210 }, { "epoch": 1.3, "grad_norm": 0.19026869447014055, "learning_rate": 5.2213919208230775e-05, "loss": 0.4368, "step": 2211 }, { "epoch": 1.3, "grad_norm": 0.17654966806048847, "learning_rate": 5.2131364429274246e-05, "loss": 0.3874, "step": 2212 }, { "epoch": 1.3, "grad_norm": 0.18351252152306632, "learning_rate": 5.204885195007453e-05, "loss": 0.3784, "step": 2213 }, { "epoch": 1.3, "grad_norm": 0.19107149354135278, "learning_rate": 5.196638184354492e-05, "loss": 0.4243, "step": 2214 }, { "epoch": 1.3, "grad_norm": 0.19065800665794713, "learning_rate": 5.188395418256122e-05, "loss": 0.4148, "step": 2215 }, { "epoch": 1.3, "grad_norm": 0.18419176321318032, "learning_rate": 5.180156903996174e-05, "loss": 0.4121, "step": 2216 }, { "epoch": 1.3, "grad_norm": 0.1965071770498145, "learning_rate": 5.171922648854719e-05, "loss": 0.4072, "step": 2217 }, { "epoch": 1.3, "grad_norm": 0.17271642659146394, "learning_rate": 5.163692660108068e-05, "loss": 0.3423, "step": 2218 }, { "epoch": 1.3, "grad_norm": 0.1814946728416819, "learning_rate": 5.155466945028762e-05, "loss": 0.3886, "step": 2219 }, { "epoch": 1.31, "grad_norm": 0.2022707732074986, "learning_rate": 5.147245510885557e-05, "loss": 0.4658, "step": 2220 }, { "epoch": 1.31, "grad_norm": 0.18841969797937244, "learning_rate": 5.1390283649434365e-05, "loss": 0.3939, "step": 2221 }, { "epoch": 1.31, "grad_norm": 0.18887943727385983, "learning_rate": 5.130815514463595e-05, "loss": 0.4079, "step": 2222 }, { "epoch": 1.31, "grad_norm": 0.18485677458409824, "learning_rate": 5.122606966703418e-05, "loss": 0.3811, "step": 2223 }, { "epoch": 1.31, "grad_norm": 0.17738323302931794, "learning_rate": 5.1144027289165045e-05, "loss": 0.3669, "step": 2224 }, { "epoch": 1.31, "grad_norm": 0.18959955695535494, "learning_rate": 5.1062028083526356e-05, "loss": 0.388, "step": 2225 }, { "epoch": 1.31, "grad_norm": 0.1823026420421858, "learning_rate": 5.098007212257782e-05, "loss": 0.3942, "step": 2226 }, { "epoch": 1.31, "grad_norm": 0.19092667735759558, "learning_rate": 5.08981594787409e-05, "loss": 0.4174, "step": 2227 }, { "epoch": 1.31, "grad_norm": 0.18931049227499339, "learning_rate": 5.0816290224398844e-05, "loss": 0.3614, "step": 2228 }, { "epoch": 1.31, "grad_norm": 0.1933197577499917, "learning_rate": 5.073446443189642e-05, "loss": 0.4206, "step": 2229 }, { "epoch": 1.31, "grad_norm": 0.24897884567636863, "learning_rate": 5.0652682173540125e-05, "loss": 0.4227, "step": 2230 }, { "epoch": 1.31, "grad_norm": 0.18567988264628352, "learning_rate": 5.0570943521597934e-05, "loss": 0.361, "step": 2231 }, { "epoch": 1.31, "grad_norm": 0.1808443791123837, "learning_rate": 5.048924854829934e-05, "loss": 0.4162, "step": 2232 }, { "epoch": 1.31, "grad_norm": 0.17382289163178277, "learning_rate": 5.040759732583511e-05, "loss": 0.3578, "step": 2233 }, { "epoch": 1.31, "grad_norm": 0.1934059706225405, "learning_rate": 5.032598992635748e-05, "loss": 0.4163, "step": 2234 }, { "epoch": 1.31, "grad_norm": 0.18712047190744255, "learning_rate": 5.024442642197991e-05, "loss": 0.3923, "step": 2235 }, { "epoch": 1.31, "grad_norm": 0.194230304793972, "learning_rate": 5.016290688477707e-05, "loss": 0.4043, "step": 2236 }, { "epoch": 1.32, "grad_norm": 0.18345123283309359, "learning_rate": 5.008143138678479e-05, "loss": 0.3813, "step": 2237 }, { "epoch": 1.32, "grad_norm": 0.18014102531773896, "learning_rate": 5.000000000000002e-05, "loss": 0.3689, "step": 2238 }, { "epoch": 1.32, "grad_norm": 0.20811237998328158, "learning_rate": 4.991861279638061e-05, "loss": 0.4283, "step": 2239 }, { "epoch": 1.32, "grad_norm": 0.18205015271300518, "learning_rate": 4.983726984784548e-05, "loss": 0.3505, "step": 2240 }, { "epoch": 1.32, "grad_norm": 0.1795720443931225, "learning_rate": 4.975597122627445e-05, "loss": 0.4022, "step": 2241 }, { "epoch": 1.32, "grad_norm": 0.2009947553079081, "learning_rate": 4.9674717003508045e-05, "loss": 0.4152, "step": 2242 }, { "epoch": 1.32, "grad_norm": 0.1911373360756306, "learning_rate": 4.959350725134765e-05, "loss": 0.4151, "step": 2243 }, { "epoch": 1.32, "grad_norm": 0.19571119806792772, "learning_rate": 4.951234204155545e-05, "loss": 0.4138, "step": 2244 }, { "epoch": 1.32, "grad_norm": 0.17780390094617862, "learning_rate": 4.943122144585406e-05, "loss": 0.4184, "step": 2245 }, { "epoch": 1.32, "grad_norm": 0.18112248897689123, "learning_rate": 4.9350145535926796e-05, "loss": 0.4011, "step": 2246 }, { "epoch": 1.32, "grad_norm": 0.18862279533214676, "learning_rate": 4.9269114383417484e-05, "loss": 0.4111, "step": 2247 }, { "epoch": 1.32, "grad_norm": 0.18318423968599412, "learning_rate": 4.9188128059930394e-05, "loss": 0.4139, "step": 2248 }, { "epoch": 1.32, "grad_norm": 0.19014655668311042, "learning_rate": 4.910718663703012e-05, "loss": 0.3646, "step": 2249 }, { "epoch": 1.32, "grad_norm": 0.17458066227793817, "learning_rate": 4.902629018624164e-05, "loss": 0.3695, "step": 2250 }, { "epoch": 1.32, "grad_norm": 0.17637485698408012, "learning_rate": 4.89454387790502e-05, "loss": 0.404, "step": 2251 }, { "epoch": 1.32, "grad_norm": 0.16443705845773854, "learning_rate": 4.886463248690122e-05, "loss": 0.3568, "step": 2252 }, { "epoch": 1.32, "grad_norm": 0.19342091566192718, "learning_rate": 4.878387138120023e-05, "loss": 0.4185, "step": 2253 }, { "epoch": 1.33, "grad_norm": 0.1904303316001888, "learning_rate": 4.8703155533312925e-05, "loss": 0.3929, "step": 2254 }, { "epoch": 1.33, "grad_norm": 0.18246297112699594, "learning_rate": 4.862248501456484e-05, "loss": 0.4225, "step": 2255 }, { "epoch": 1.33, "grad_norm": 0.1777341775174244, "learning_rate": 4.8541859896241616e-05, "loss": 0.3738, "step": 2256 }, { "epoch": 1.33, "grad_norm": 0.18805743212178455, "learning_rate": 4.846128024958872e-05, "loss": 0.4079, "step": 2257 }, { "epoch": 1.33, "grad_norm": 0.1820196118051572, "learning_rate": 4.838074614581137e-05, "loss": 0.4098, "step": 2258 }, { "epoch": 1.33, "grad_norm": 0.16806431241971856, "learning_rate": 4.830025765607463e-05, "loss": 0.3666, "step": 2259 }, { "epoch": 1.33, "grad_norm": 0.19363355095088744, "learning_rate": 4.821981485150323e-05, "loss": 0.3951, "step": 2260 }, { "epoch": 1.33, "grad_norm": 0.2139490197452364, "learning_rate": 4.813941780318153e-05, "loss": 0.4495, "step": 2261 }, { "epoch": 1.33, "grad_norm": 0.18747955602297983, "learning_rate": 4.805906658215344e-05, "loss": 0.4116, "step": 2262 }, { "epoch": 1.33, "grad_norm": 0.19431415115430906, "learning_rate": 4.797876125942239e-05, "loss": 0.3709, "step": 2263 }, { "epoch": 1.33, "grad_norm": 0.19568799936115497, "learning_rate": 4.789850190595129e-05, "loss": 0.4169, "step": 2264 }, { "epoch": 1.33, "grad_norm": 0.18414590658896182, "learning_rate": 4.7818288592662295e-05, "loss": 0.4, "step": 2265 }, { "epoch": 1.33, "grad_norm": 0.1729939766717147, "learning_rate": 4.773812139043703e-05, "loss": 0.3921, "step": 2266 }, { "epoch": 1.33, "grad_norm": 0.18926823202564338, "learning_rate": 4.7658000370116315e-05, "loss": 0.3905, "step": 2267 }, { "epoch": 1.33, "grad_norm": 0.19583540064221253, "learning_rate": 4.757792560250012e-05, "loss": 0.4175, "step": 2268 }, { "epoch": 1.33, "grad_norm": 0.17806294072826737, "learning_rate": 4.749789715834758e-05, "loss": 0.3504, "step": 2269 }, { "epoch": 1.33, "grad_norm": 0.17134967767482345, "learning_rate": 4.741791510837691e-05, "loss": 0.3754, "step": 2270 }, { "epoch": 1.34, "grad_norm": 0.17893435464363647, "learning_rate": 4.733797952326532e-05, "loss": 0.3571, "step": 2271 }, { "epoch": 1.34, "grad_norm": 0.1708796829805529, "learning_rate": 4.725809047364894e-05, "loss": 0.3912, "step": 2272 }, { "epoch": 1.34, "grad_norm": 0.18302707589369907, "learning_rate": 4.717824803012284e-05, "loss": 0.3933, "step": 2273 }, { "epoch": 1.34, "grad_norm": 0.18717197395682367, "learning_rate": 4.7098452263240776e-05, "loss": 0.3852, "step": 2274 }, { "epoch": 1.34, "grad_norm": 0.17986740395717019, "learning_rate": 4.7018703243515385e-05, "loss": 0.39, "step": 2275 }, { "epoch": 1.34, "grad_norm": 0.1941875502444695, "learning_rate": 4.693900104141794e-05, "loss": 0.4185, "step": 2276 }, { "epoch": 1.34, "grad_norm": 0.18756813942113346, "learning_rate": 4.6859345727378387e-05, "loss": 0.3923, "step": 2277 }, { "epoch": 1.34, "grad_norm": 0.18099857236850664, "learning_rate": 4.6779737371785146e-05, "loss": 0.3917, "step": 2278 }, { "epoch": 1.34, "grad_norm": 0.17611734325453263, "learning_rate": 4.670017604498522e-05, "loss": 0.396, "step": 2279 }, { "epoch": 1.34, "grad_norm": 0.182638294765091, "learning_rate": 4.662066181728405e-05, "loss": 0.391, "step": 2280 }, { "epoch": 1.34, "grad_norm": 0.17438038471285144, "learning_rate": 4.654119475894543e-05, "loss": 0.3771, "step": 2281 }, { "epoch": 1.34, "grad_norm": 0.18251567336825883, "learning_rate": 4.6461774940191486e-05, "loss": 0.3673, "step": 2282 }, { "epoch": 1.34, "grad_norm": 0.48873441447493016, "learning_rate": 4.638240243120263e-05, "loss": 0.4359, "step": 2283 }, { "epoch": 1.34, "grad_norm": 0.2004078757737064, "learning_rate": 4.630307730211737e-05, "loss": 0.417, "step": 2284 }, { "epoch": 1.34, "grad_norm": 0.1869401684241615, "learning_rate": 4.622379962303246e-05, "loss": 0.4375, "step": 2285 }, { "epoch": 1.34, "grad_norm": 0.1792861761265464, "learning_rate": 4.6144569464002684e-05, "loss": 0.3917, "step": 2286 }, { "epoch": 1.35, "grad_norm": 0.1642865831739075, "learning_rate": 4.606538689504077e-05, "loss": 0.3682, "step": 2287 }, { "epoch": 1.35, "grad_norm": 0.1730553053615041, "learning_rate": 4.5986251986117454e-05, "loss": 0.4223, "step": 2288 }, { "epoch": 1.35, "grad_norm": 0.179517265255704, "learning_rate": 4.590716480716141e-05, "loss": 0.3911, "step": 2289 }, { "epoch": 1.35, "grad_norm": 0.1801566944583477, "learning_rate": 4.582812542805909e-05, "loss": 0.378, "step": 2290 }, { "epoch": 1.35, "grad_norm": 0.16832506352891377, "learning_rate": 4.5749133918654607e-05, "loss": 0.3776, "step": 2291 }, { "epoch": 1.35, "grad_norm": 0.17943384818308367, "learning_rate": 4.567019034874992e-05, "loss": 0.4011, "step": 2292 }, { "epoch": 1.35, "grad_norm": 0.18585633962936102, "learning_rate": 4.559129478810457e-05, "loss": 0.3921, "step": 2293 }, { "epoch": 1.35, "grad_norm": 0.17092385090432002, "learning_rate": 4.551244730643561e-05, "loss": 0.3608, "step": 2294 }, { "epoch": 1.35, "grad_norm": 0.1679173704574397, "learning_rate": 4.5433647973417703e-05, "loss": 0.3858, "step": 2295 }, { "epoch": 1.35, "grad_norm": 0.176027839089707, "learning_rate": 4.535489685868294e-05, "loss": 0.3685, "step": 2296 }, { "epoch": 1.35, "grad_norm": 0.19315250710153942, "learning_rate": 4.5276194031820774e-05, "loss": 0.4209, "step": 2297 }, { "epoch": 1.35, "grad_norm": 0.18262863053483308, "learning_rate": 4.5197539562378014e-05, "loss": 0.4106, "step": 2298 }, { "epoch": 1.35, "grad_norm": 0.17738986120110053, "learning_rate": 4.511893351985876e-05, "loss": 0.4144, "step": 2299 }, { "epoch": 1.35, "grad_norm": 0.1939134068956391, "learning_rate": 4.5040375973724216e-05, "loss": 0.4189, "step": 2300 }, { "epoch": 1.35, "grad_norm": 0.1876065806043386, "learning_rate": 4.496186699339284e-05, "loss": 0.3718, "step": 2301 }, { "epoch": 1.35, "grad_norm": 0.17704448706249795, "learning_rate": 4.488340664824019e-05, "loss": 0.4153, "step": 2302 }, { "epoch": 1.35, "grad_norm": 0.19343677065870282, "learning_rate": 4.4804995007598694e-05, "loss": 0.4102, "step": 2303 }, { "epoch": 1.36, "grad_norm": 0.20827443989702227, "learning_rate": 4.472663214075788e-05, "loss": 0.4191, "step": 2304 }, { "epoch": 1.36, "grad_norm": 0.2125984764796864, "learning_rate": 4.464831811696417e-05, "loss": 0.4045, "step": 2305 }, { "epoch": 1.36, "grad_norm": 0.19901809672692197, "learning_rate": 4.457005300542077e-05, "loss": 0.4358, "step": 2306 }, { "epoch": 1.36, "grad_norm": 0.1851209242969792, "learning_rate": 4.4491836875287674e-05, "loss": 0.4113, "step": 2307 }, { "epoch": 1.36, "grad_norm": 0.1977643264945235, "learning_rate": 4.441366979568162e-05, "loss": 0.4082, "step": 2308 }, { "epoch": 1.36, "grad_norm": 0.2235036270909591, "learning_rate": 4.433555183567604e-05, "loss": 0.3906, "step": 2309 }, { "epoch": 1.36, "grad_norm": 0.19623592849966992, "learning_rate": 4.425748306430082e-05, "loss": 0.3846, "step": 2310 }, { "epoch": 1.36, "grad_norm": 0.18223154039092768, "learning_rate": 4.41794635505425e-05, "loss": 0.3927, "step": 2311 }, { "epoch": 1.36, "grad_norm": 0.193962230041838, "learning_rate": 4.410149336334411e-05, "loss": 0.4521, "step": 2312 }, { "epoch": 1.36, "grad_norm": 0.18557136225472978, "learning_rate": 4.4023572571604965e-05, "loss": 0.4019, "step": 2313 }, { "epoch": 1.36, "grad_norm": 0.19258003070227037, "learning_rate": 4.3945701244180847e-05, "loss": 0.4382, "step": 2314 }, { "epoch": 1.36, "grad_norm": 0.17550674522959528, "learning_rate": 4.38678794498838e-05, "loss": 0.3746, "step": 2315 }, { "epoch": 1.36, "grad_norm": 0.17901345999016943, "learning_rate": 4.379010725748206e-05, "loss": 0.3937, "step": 2316 }, { "epoch": 1.36, "grad_norm": 0.18204388035763827, "learning_rate": 4.37123847357001e-05, "loss": 0.3668, "step": 2317 }, { "epoch": 1.36, "grad_norm": 0.1732827092017323, "learning_rate": 4.363471195321842e-05, "loss": 0.3915, "step": 2318 }, { "epoch": 1.36, "grad_norm": 0.1794821815129027, "learning_rate": 4.3557088978673676e-05, "loss": 0.4019, "step": 2319 }, { "epoch": 1.36, "grad_norm": 0.18125659105716263, "learning_rate": 4.347951588065835e-05, "loss": 0.4132, "step": 2320 }, { "epoch": 1.37, "grad_norm": 0.17799580610948854, "learning_rate": 4.3401992727721e-05, "loss": 0.3796, "step": 2321 }, { "epoch": 1.37, "grad_norm": 0.17617546687208122, "learning_rate": 4.3324519588366e-05, "loss": 0.3765, "step": 2322 }, { "epoch": 1.37, "grad_norm": 0.19417273186835765, "learning_rate": 4.324709653105346e-05, "loss": 0.4013, "step": 2323 }, { "epoch": 1.37, "grad_norm": 0.21077608405432277, "learning_rate": 4.316972362419931e-05, "loss": 0.4132, "step": 2324 }, { "epoch": 1.37, "grad_norm": 0.1803109022455879, "learning_rate": 4.309240093617524e-05, "loss": 0.3994, "step": 2325 }, { "epoch": 1.37, "grad_norm": 0.20316435530602564, "learning_rate": 4.301512853530837e-05, "loss": 0.4351, "step": 2326 }, { "epoch": 1.37, "grad_norm": 0.1863698168969151, "learning_rate": 4.2937906489881516e-05, "loss": 0.4138, "step": 2327 }, { "epoch": 1.37, "grad_norm": 0.1749372005931854, "learning_rate": 4.2860734868133e-05, "loss": 0.4187, "step": 2328 }, { "epoch": 1.37, "grad_norm": 0.1808193383735573, "learning_rate": 4.27836137382565e-05, "loss": 0.3941, "step": 2329 }, { "epoch": 1.37, "grad_norm": 0.18808176710898056, "learning_rate": 4.270654316840115e-05, "loss": 0.4131, "step": 2330 }, { "epoch": 1.37, "grad_norm": 0.1903477479527853, "learning_rate": 4.2629523226671375e-05, "loss": 0.414, "step": 2331 }, { "epoch": 1.37, "grad_norm": 0.17844977047828794, "learning_rate": 4.25525539811269e-05, "loss": 0.3861, "step": 2332 }, { "epoch": 1.37, "grad_norm": 0.1891382252674035, "learning_rate": 4.247563549978259e-05, "loss": 0.4394, "step": 2333 }, { "epoch": 1.37, "grad_norm": 0.17790885781029012, "learning_rate": 4.23987678506085e-05, "loss": 0.4006, "step": 2334 }, { "epoch": 1.37, "grad_norm": 0.1807431421440253, "learning_rate": 4.23219511015298e-05, "loss": 0.4029, "step": 2335 }, { "epoch": 1.37, "grad_norm": 0.19080875335733244, "learning_rate": 4.224518532042654e-05, "loss": 0.4219, "step": 2336 }, { "epoch": 1.37, "grad_norm": 0.1702363744752853, "learning_rate": 4.2168470575133866e-05, "loss": 0.3839, "step": 2337 }, { "epoch": 1.38, "grad_norm": 0.1933279271715595, "learning_rate": 4.2091806933441826e-05, "loss": 0.4337, "step": 2338 }, { "epoch": 1.38, "grad_norm": 0.22365190725291184, "learning_rate": 4.20151944630952e-05, "loss": 0.391, "step": 2339 }, { "epoch": 1.38, "grad_norm": 0.18499253190623838, "learning_rate": 4.1938633231793624e-05, "loss": 0.3911, "step": 2340 }, { "epoch": 1.38, "grad_norm": 0.1881298684489429, "learning_rate": 4.1862123307191484e-05, "loss": 0.3931, "step": 2341 }, { "epoch": 1.38, "grad_norm": 0.17357955926644725, "learning_rate": 4.178566475689777e-05, "loss": 0.3896, "step": 2342 }, { "epoch": 1.38, "grad_norm": 0.18756044147301135, "learning_rate": 4.1709257648476105e-05, "loss": 0.4033, "step": 2343 }, { "epoch": 1.38, "grad_norm": 0.18389028551541756, "learning_rate": 4.1632902049444686e-05, "loss": 0.3439, "step": 2344 }, { "epoch": 1.38, "grad_norm": 0.1849238463508717, "learning_rate": 4.155659802727608e-05, "loss": 0.3785, "step": 2345 }, { "epoch": 1.38, "grad_norm": 0.1773081310212394, "learning_rate": 4.14803456493974e-05, "loss": 0.3766, "step": 2346 }, { "epoch": 1.38, "grad_norm": 0.18233508648942373, "learning_rate": 4.1404144983190044e-05, "loss": 0.3549, "step": 2347 }, { "epoch": 1.38, "grad_norm": 0.1851036998957086, "learning_rate": 4.132799609598981e-05, "loss": 0.3977, "step": 2348 }, { "epoch": 1.38, "grad_norm": 0.21244947793755847, "learning_rate": 4.1251899055086586e-05, "loss": 0.4374, "step": 2349 }, { "epoch": 1.38, "grad_norm": 0.18484919749317866, "learning_rate": 4.117585392772457e-05, "loss": 0.4128, "step": 2350 }, { "epoch": 1.38, "grad_norm": 0.2210933648642488, "learning_rate": 4.109986078110207e-05, "loss": 0.3722, "step": 2351 }, { "epoch": 1.38, "grad_norm": 0.1951376264933818, "learning_rate": 4.102391968237143e-05, "loss": 0.3722, "step": 2352 }, { "epoch": 1.38, "grad_norm": 0.18729461438282666, "learning_rate": 4.0948030698639006e-05, "loss": 0.3946, "step": 2353 }, { "epoch": 1.38, "grad_norm": 0.17798234177335076, "learning_rate": 4.087219389696515e-05, "loss": 0.3632, "step": 2354 }, { "epoch": 1.39, "grad_norm": 0.18533426970439967, "learning_rate": 4.0796409344363995e-05, "loss": 0.3503, "step": 2355 }, { "epoch": 1.39, "grad_norm": 0.1838738950459424, "learning_rate": 4.072067710780359e-05, "loss": 0.3666, "step": 2356 }, { "epoch": 1.39, "grad_norm": 0.1782715179343142, "learning_rate": 4.064499725420579e-05, "loss": 0.4242, "step": 2357 }, { "epoch": 1.39, "grad_norm": 0.20282525928063655, "learning_rate": 4.0569369850446016e-05, "loss": 0.3772, "step": 2358 }, { "epoch": 1.39, "grad_norm": 0.1899237116709675, "learning_rate": 4.049379496335347e-05, "loss": 0.4187, "step": 2359 }, { "epoch": 1.39, "grad_norm": 0.20402628915397078, "learning_rate": 4.041827265971091e-05, "loss": 0.4279, "step": 2360 }, { "epoch": 1.39, "grad_norm": 0.18281918217454715, "learning_rate": 4.0342803006254624e-05, "loss": 0.4135, "step": 2361 }, { "epoch": 1.39, "grad_norm": 0.17755636118105714, "learning_rate": 4.0267386069674384e-05, "loss": 0.3872, "step": 2362 }, { "epoch": 1.39, "grad_norm": 0.17952222593236428, "learning_rate": 4.0192021916613344e-05, "loss": 0.398, "step": 2363 }, { "epoch": 1.39, "grad_norm": 0.17493934616958587, "learning_rate": 4.011671061366811e-05, "loss": 0.3779, "step": 2364 }, { "epoch": 1.39, "grad_norm": 0.1809894951465937, "learning_rate": 4.004145222738841e-05, "loss": 0.429, "step": 2365 }, { "epoch": 1.39, "grad_norm": 0.17931118104328542, "learning_rate": 3.996624682427739e-05, "loss": 0.3839, "step": 2366 }, { "epoch": 1.39, "grad_norm": 0.18045630350547573, "learning_rate": 3.989109447079131e-05, "loss": 0.3934, "step": 2367 }, { "epoch": 1.39, "grad_norm": 0.19746890167885522, "learning_rate": 3.98159952333395e-05, "loss": 0.3887, "step": 2368 }, { "epoch": 1.39, "grad_norm": 0.18627456979800522, "learning_rate": 3.974094917828438e-05, "loss": 0.4036, "step": 2369 }, { "epoch": 1.39, "grad_norm": 0.1900210723670723, "learning_rate": 3.966595637194153e-05, "loss": 0.3811, "step": 2370 }, { "epoch": 1.4, "grad_norm": 0.21931012070906103, "learning_rate": 3.9591016880579204e-05, "loss": 0.4137, "step": 2371 }, { "epoch": 1.4, "grad_norm": 0.17673450048063358, "learning_rate": 3.951613077041874e-05, "loss": 0.3972, "step": 2372 }, { "epoch": 1.4, "grad_norm": 0.1992515854559056, "learning_rate": 3.944129810763425e-05, "loss": 0.4417, "step": 2373 }, { "epoch": 1.4, "grad_norm": 0.21033013771714132, "learning_rate": 3.9366518958352585e-05, "loss": 0.3739, "step": 2374 }, { "epoch": 1.4, "grad_norm": 0.16581957344786058, "learning_rate": 3.929179338865333e-05, "loss": 0.3538, "step": 2375 }, { "epoch": 1.4, "grad_norm": 0.18415059414596757, "learning_rate": 3.9217121464568764e-05, "loss": 0.3997, "step": 2376 }, { "epoch": 1.4, "grad_norm": 0.21088258370598734, "learning_rate": 3.9142503252083695e-05, "loss": 0.4125, "step": 2377 }, { "epoch": 1.4, "grad_norm": 0.18939002851918915, "learning_rate": 3.906793881713552e-05, "loss": 0.4288, "step": 2378 }, { "epoch": 1.4, "grad_norm": 0.17858771383658043, "learning_rate": 3.899342822561409e-05, "loss": 0.3881, "step": 2379 }, { "epoch": 1.4, "grad_norm": 0.18362284734207265, "learning_rate": 3.8918971543361714e-05, "loss": 0.3823, "step": 2380 }, { "epoch": 1.4, "grad_norm": 0.19366380872850933, "learning_rate": 3.884456883617296e-05, "loss": 0.3583, "step": 2381 }, { "epoch": 1.4, "grad_norm": 0.1901165370683843, "learning_rate": 3.87702201697948e-05, "loss": 0.4553, "step": 2382 }, { "epoch": 1.4, "grad_norm": 0.18980970612244083, "learning_rate": 3.8695925609926475e-05, "loss": 0.4263, "step": 2383 }, { "epoch": 1.4, "grad_norm": 0.1927879848429132, "learning_rate": 3.862168522221927e-05, "loss": 0.4226, "step": 2384 }, { "epoch": 1.4, "grad_norm": 0.17468957362849613, "learning_rate": 3.854749907227674e-05, "loss": 0.3732, "step": 2385 }, { "epoch": 1.4, "grad_norm": 0.17586062824420354, "learning_rate": 3.8473367225654456e-05, "loss": 0.3876, "step": 2386 }, { "epoch": 1.4, "grad_norm": 0.18528957099228635, "learning_rate": 3.839928974786001e-05, "loss": 0.3906, "step": 2387 }, { "epoch": 1.41, "grad_norm": 0.18446951898251418, "learning_rate": 3.832526670435297e-05, "loss": 0.3969, "step": 2388 }, { "epoch": 1.41, "grad_norm": 0.1816721057010734, "learning_rate": 3.825129816054477e-05, "loss": 0.4029, "step": 2389 }, { "epoch": 1.41, "grad_norm": 0.1902020202566427, "learning_rate": 3.8177384181798716e-05, "loss": 0.394, "step": 2390 }, { "epoch": 1.41, "grad_norm": 0.18444549107773275, "learning_rate": 3.810352483342984e-05, "loss": 0.408, "step": 2391 }, { "epoch": 1.41, "grad_norm": 0.1844755285263522, "learning_rate": 3.802972018070495e-05, "loss": 0.4056, "step": 2392 }, { "epoch": 1.41, "grad_norm": 0.19822608684842405, "learning_rate": 3.795597028884256e-05, "loss": 0.3833, "step": 2393 }, { "epoch": 1.41, "grad_norm": 0.1829016568339407, "learning_rate": 3.7882275223012655e-05, "loss": 0.3792, "step": 2394 }, { "epoch": 1.41, "grad_norm": 0.1851866988160698, "learning_rate": 3.7808635048336916e-05, "loss": 0.4041, "step": 2395 }, { "epoch": 1.41, "grad_norm": 0.1919325341590549, "learning_rate": 3.773504982988845e-05, "loss": 0.4426, "step": 2396 }, { "epoch": 1.41, "grad_norm": 0.18604278311018183, "learning_rate": 3.766151963269182e-05, "loss": 0.4035, "step": 2397 }, { "epoch": 1.41, "grad_norm": 0.17858116193671145, "learning_rate": 3.7588044521722965e-05, "loss": 0.3996, "step": 2398 }, { "epoch": 1.41, "grad_norm": 0.18071850227923503, "learning_rate": 3.7514624561909176e-05, "loss": 0.4174, "step": 2399 }, { "epoch": 1.41, "grad_norm": 0.19733574061187267, "learning_rate": 3.74412598181289e-05, "loss": 0.39, "step": 2400 }, { "epoch": 1.41, "grad_norm": 0.21249491902581671, "learning_rate": 3.7367950355211935e-05, "loss": 0.4157, "step": 2401 }, { "epoch": 1.41, "grad_norm": 0.1857615154929432, "learning_rate": 3.7294696237939154e-05, "loss": 0.4084, "step": 2402 }, { "epoch": 1.41, "grad_norm": 0.17962808249632856, "learning_rate": 3.7221497531042496e-05, "loss": 0.3989, "step": 2403 }, { "epoch": 1.41, "grad_norm": 0.16814028788195468, "learning_rate": 3.714835429920499e-05, "loss": 0.3735, "step": 2404 }, { "epoch": 1.42, "grad_norm": 0.19696181008047892, "learning_rate": 3.7075266607060633e-05, "loss": 0.3955, "step": 2405 }, { "epoch": 1.42, "grad_norm": 0.1754037692198337, "learning_rate": 3.700223451919434e-05, "loss": 0.3961, "step": 2406 }, { "epoch": 1.42, "grad_norm": 0.17946772065216135, "learning_rate": 3.6929258100141884e-05, "loss": 0.3879, "step": 2407 }, { "epoch": 1.42, "grad_norm": 0.18460327674666568, "learning_rate": 3.685633741438984e-05, "loss": 0.3869, "step": 2408 }, { "epoch": 1.42, "grad_norm": 0.16424641965730152, "learning_rate": 3.6783472526375596e-05, "loss": 0.3463, "step": 2409 }, { "epoch": 1.42, "grad_norm": 0.17722507584039435, "learning_rate": 3.6710663500487096e-05, "loss": 0.363, "step": 2410 }, { "epoch": 1.42, "grad_norm": 0.18126901978881674, "learning_rate": 3.663791040106304e-05, "loss": 0.364, "step": 2411 }, { "epoch": 1.42, "grad_norm": 0.19065262082537573, "learning_rate": 3.6565213292392706e-05, "loss": 0.4063, "step": 2412 }, { "epoch": 1.42, "grad_norm": 0.1875234518047201, "learning_rate": 3.6492572238715806e-05, "loss": 0.3905, "step": 2413 }, { "epoch": 1.42, "grad_norm": 0.1809287317584865, "learning_rate": 3.641998730422257e-05, "loss": 0.4023, "step": 2414 }, { "epoch": 1.42, "grad_norm": 0.18186133990542913, "learning_rate": 3.6347458553053726e-05, "loss": 0.3903, "step": 2415 }, { "epoch": 1.42, "grad_norm": 0.1763716250964691, "learning_rate": 3.6274986049300186e-05, "loss": 0.3994, "step": 2416 }, { "epoch": 1.42, "grad_norm": 0.1957566899123632, "learning_rate": 3.620256985700328e-05, "loss": 0.398, "step": 2417 }, { "epoch": 1.42, "grad_norm": 0.18967866601938366, "learning_rate": 3.613021004015452e-05, "loss": 0.4, "step": 2418 }, { "epoch": 1.42, "grad_norm": 0.1838851249112009, "learning_rate": 3.605790666269566e-05, "loss": 0.4124, "step": 2419 }, { "epoch": 1.42, "grad_norm": 0.1793602355569485, "learning_rate": 3.5985659788518486e-05, "loss": 0.3836, "step": 2420 }, { "epoch": 1.42, "grad_norm": 0.184921318265869, "learning_rate": 3.591346948146495e-05, "loss": 0.4309, "step": 2421 }, { "epoch": 1.43, "grad_norm": 0.19167974540770724, "learning_rate": 3.584133580532696e-05, "loss": 0.3569, "step": 2422 }, { "epoch": 1.43, "grad_norm": 0.17829643344053805, "learning_rate": 3.57692588238464e-05, "loss": 0.3701, "step": 2423 }, { "epoch": 1.43, "grad_norm": 0.18187630985706627, "learning_rate": 3.569723860071505e-05, "loss": 0.4403, "step": 2424 }, { "epoch": 1.43, "grad_norm": 0.17957754897288555, "learning_rate": 3.5625275199574594e-05, "loss": 0.3984, "step": 2425 }, { "epoch": 1.43, "grad_norm": 0.1864438615057095, "learning_rate": 3.555336868401635e-05, "loss": 0.3834, "step": 2426 }, { "epoch": 1.43, "grad_norm": 0.18496320192598015, "learning_rate": 3.5481519117581544e-05, "loss": 0.41, "step": 2427 }, { "epoch": 1.43, "grad_norm": 0.17891376090065997, "learning_rate": 3.540972656376099e-05, "loss": 0.4013, "step": 2428 }, { "epoch": 1.43, "grad_norm": 0.199688789707727, "learning_rate": 3.533799108599509e-05, "loss": 0.4572, "step": 2429 }, { "epoch": 1.43, "grad_norm": 0.18778755143316214, "learning_rate": 3.526631274767389e-05, "loss": 0.3708, "step": 2430 }, { "epoch": 1.43, "grad_norm": 0.18606849486194985, "learning_rate": 3.51946916121369e-05, "loss": 0.4293, "step": 2431 }, { "epoch": 1.43, "grad_norm": 0.21202440704452852, "learning_rate": 3.512312774267309e-05, "loss": 0.4082, "step": 2432 }, { "epoch": 1.43, "grad_norm": 0.1883615861955945, "learning_rate": 3.505162120252083e-05, "loss": 0.3817, "step": 2433 }, { "epoch": 1.43, "grad_norm": 0.18141909696192082, "learning_rate": 3.4980172054867824e-05, "loss": 0.389, "step": 2434 }, { "epoch": 1.43, "grad_norm": 0.17858799183526933, "learning_rate": 3.490878036285109e-05, "loss": 0.3787, "step": 2435 }, { "epoch": 1.43, "grad_norm": 0.22220663739461033, "learning_rate": 3.483744618955678e-05, "loss": 0.3763, "step": 2436 }, { "epoch": 1.43, "grad_norm": 0.19995276389000388, "learning_rate": 3.4766169598020326e-05, "loss": 0.4001, "step": 2437 }, { "epoch": 1.44, "grad_norm": 0.18362650006688164, "learning_rate": 3.469495065122627e-05, "loss": 0.3968, "step": 2438 }, { "epoch": 1.44, "grad_norm": 0.18043154356240745, "learning_rate": 3.462378941210811e-05, "loss": 0.3749, "step": 2439 }, { "epoch": 1.44, "grad_norm": 0.20672200265320864, "learning_rate": 3.4552685943548446e-05, "loss": 0.408, "step": 2440 }, { "epoch": 1.44, "grad_norm": 0.18049793078586143, "learning_rate": 3.4481640308378826e-05, "loss": 0.3874, "step": 2441 }, { "epoch": 1.44, "grad_norm": 0.19112644199676984, "learning_rate": 3.441065256937966e-05, "loss": 0.3688, "step": 2442 }, { "epoch": 1.44, "grad_norm": 0.19132070837097626, "learning_rate": 3.4339722789280214e-05, "loss": 0.3848, "step": 2443 }, { "epoch": 1.44, "grad_norm": 0.18419524418528202, "learning_rate": 3.4268851030758564e-05, "loss": 0.3593, "step": 2444 }, { "epoch": 1.44, "grad_norm": 0.1861093369198648, "learning_rate": 3.4198037356441406e-05, "loss": 0.4068, "step": 2445 }, { "epoch": 1.44, "grad_norm": 0.169397778715837, "learning_rate": 3.412728182890422e-05, "loss": 0.3773, "step": 2446 }, { "epoch": 1.44, "grad_norm": 0.1809760516512152, "learning_rate": 3.405658451067107e-05, "loss": 0.3983, "step": 2447 }, { "epoch": 1.44, "grad_norm": 0.18549464871726554, "learning_rate": 3.3985945464214644e-05, "loss": 0.4013, "step": 2448 }, { "epoch": 1.44, "grad_norm": 0.18192212358832036, "learning_rate": 3.391536475195597e-05, "loss": 0.3962, "step": 2449 }, { "epoch": 1.44, "grad_norm": 0.18017976989631, "learning_rate": 3.3844842436264645e-05, "loss": 0.4006, "step": 2450 }, { "epoch": 1.44, "grad_norm": 0.18790923665047435, "learning_rate": 3.3774378579458756e-05, "loss": 0.4286, "step": 2451 }, { "epoch": 1.44, "grad_norm": 0.19054020336793273, "learning_rate": 3.370397324380453e-05, "loss": 0.3933, "step": 2452 }, { "epoch": 1.44, "grad_norm": 0.18812056518773732, "learning_rate": 3.363362649151661e-05, "loss": 0.4283, "step": 2453 }, { "epoch": 1.44, "grad_norm": 0.1832072022873427, "learning_rate": 3.356333838475788e-05, "loss": 0.4242, "step": 2454 }, { "epoch": 1.45, "grad_norm": 0.1862514942939368, "learning_rate": 3.349310898563928e-05, "loss": 0.4161, "step": 2455 }, { "epoch": 1.45, "grad_norm": 0.1855293596823939, "learning_rate": 3.342293835621999e-05, "loss": 0.3813, "step": 2456 }, { "epoch": 1.45, "grad_norm": 0.17457692598284152, "learning_rate": 3.335282655850727e-05, "loss": 0.3526, "step": 2457 }, { "epoch": 1.45, "grad_norm": 0.2221232547795512, "learning_rate": 3.328277365445621e-05, "loss": 0.4092, "step": 2458 }, { "epoch": 1.45, "grad_norm": 0.18556918217449087, "learning_rate": 3.321277970597013e-05, "loss": 0.4045, "step": 2459 }, { "epoch": 1.45, "grad_norm": 0.1849813046141209, "learning_rate": 3.314284477490005e-05, "loss": 0.4036, "step": 2460 }, { "epoch": 1.45, "grad_norm": 0.2052380403966287, "learning_rate": 3.307296892304496e-05, "loss": 0.454, "step": 2461 }, { "epoch": 1.45, "grad_norm": 0.1725542429051671, "learning_rate": 3.300315221215149e-05, "loss": 0.3752, "step": 2462 }, { "epoch": 1.45, "grad_norm": 0.1856827963688142, "learning_rate": 3.293339470391416e-05, "loss": 0.3819, "step": 2463 }, { "epoch": 1.45, "grad_norm": 0.19290279871871277, "learning_rate": 3.286369645997517e-05, "loss": 0.4052, "step": 2464 }, { "epoch": 1.45, "grad_norm": 0.1905484508500894, "learning_rate": 3.279405754192419e-05, "loss": 0.3614, "step": 2465 }, { "epoch": 1.45, "grad_norm": 0.1853102955289528, "learning_rate": 3.2724478011298655e-05, "loss": 0.3868, "step": 2466 }, { "epoch": 1.45, "grad_norm": 0.1883623687154909, "learning_rate": 3.265495792958341e-05, "loss": 0.3986, "step": 2467 }, { "epoch": 1.45, "grad_norm": 0.1881091390523702, "learning_rate": 3.2585497358210816e-05, "loss": 0.4036, "step": 2468 }, { "epoch": 1.45, "grad_norm": 0.1762018240680343, "learning_rate": 3.2516096358560635e-05, "loss": 0.3782, "step": 2469 }, { "epoch": 1.45, "grad_norm": 0.18259210530113523, "learning_rate": 3.244675499196e-05, "loss": 0.3968, "step": 2470 }, { "epoch": 1.45, "grad_norm": 0.1876496783835084, "learning_rate": 3.2377473319683284e-05, "loss": 0.4286, "step": 2471 }, { "epoch": 1.46, "grad_norm": 0.16527016945269227, "learning_rate": 3.2308251402952184e-05, "loss": 0.3237, "step": 2472 }, { "epoch": 1.46, "grad_norm": 0.17824915166116562, "learning_rate": 3.22390893029356e-05, "loss": 0.3884, "step": 2473 }, { "epoch": 1.46, "grad_norm": 0.17769070135048026, "learning_rate": 3.216998708074948e-05, "loss": 0.3859, "step": 2474 }, { "epoch": 1.46, "grad_norm": 0.20659366775907526, "learning_rate": 3.2100944797456946e-05, "loss": 0.3727, "step": 2475 }, { "epoch": 1.46, "grad_norm": 0.1842302165987917, "learning_rate": 3.2031962514068135e-05, "loss": 0.3815, "step": 2476 }, { "epoch": 1.46, "grad_norm": 0.1795355173432193, "learning_rate": 3.196304029154017e-05, "loss": 0.3929, "step": 2477 }, { "epoch": 1.46, "grad_norm": 0.20144333980424745, "learning_rate": 3.189417819077708e-05, "loss": 0.4487, "step": 2478 }, { "epoch": 1.46, "grad_norm": 0.1803881397904987, "learning_rate": 3.182537627262977e-05, "loss": 0.4067, "step": 2479 }, { "epoch": 1.46, "grad_norm": 0.18559576834927588, "learning_rate": 3.175663459789602e-05, "loss": 0.4021, "step": 2480 }, { "epoch": 1.46, "grad_norm": 0.18096980466872054, "learning_rate": 3.1687953227320255e-05, "loss": 0.3889, "step": 2481 }, { "epoch": 1.46, "grad_norm": 0.18947934870893382, "learning_rate": 3.161933222159371e-05, "loss": 0.4185, "step": 2482 }, { "epoch": 1.46, "grad_norm": 0.17612052321661906, "learning_rate": 3.155077164135428e-05, "loss": 0.3926, "step": 2483 }, { "epoch": 1.46, "grad_norm": 0.18156449422153734, "learning_rate": 3.148227154718638e-05, "loss": 0.3947, "step": 2484 }, { "epoch": 1.46, "grad_norm": 0.18913021680801503, "learning_rate": 3.141383199962106e-05, "loss": 0.4201, "step": 2485 }, { "epoch": 1.46, "grad_norm": 0.17258485658517256, "learning_rate": 3.134545305913582e-05, "loss": 0.3837, "step": 2486 }, { "epoch": 1.46, "grad_norm": 0.18993701372256167, "learning_rate": 3.1277134786154635e-05, "loss": 0.4257, "step": 2487 }, { "epoch": 1.46, "grad_norm": 0.18064607120490733, "learning_rate": 3.120887724104786e-05, "loss": 0.419, "step": 2488 }, { "epoch": 1.47, "grad_norm": 0.23587919214839248, "learning_rate": 3.114068048413218e-05, "loss": 0.4105, "step": 2489 }, { "epoch": 1.47, "grad_norm": 0.18212465867115668, "learning_rate": 3.107254457567059e-05, "loss": 0.4004, "step": 2490 }, { "epoch": 1.47, "grad_norm": 0.18614735503772217, "learning_rate": 3.100446957587224e-05, "loss": 0.3708, "step": 2491 }, { "epoch": 1.47, "grad_norm": 0.17980373001984692, "learning_rate": 3.093645554489254e-05, "loss": 0.4045, "step": 2492 }, { "epoch": 1.47, "grad_norm": 0.18516336402362785, "learning_rate": 3.0868502542833014e-05, "loss": 0.4186, "step": 2493 }, { "epoch": 1.47, "grad_norm": 0.19077688542561766, "learning_rate": 3.080061062974119e-05, "loss": 0.3711, "step": 2494 }, { "epoch": 1.47, "grad_norm": 0.18723506971423085, "learning_rate": 3.073277986561064e-05, "loss": 0.4253, "step": 2495 }, { "epoch": 1.47, "grad_norm": 0.17887609250397457, "learning_rate": 3.066501031038104e-05, "loss": 0.3347, "step": 2496 }, { "epoch": 1.47, "grad_norm": 0.20234366752403984, "learning_rate": 3.059730202393773e-05, "loss": 0.385, "step": 2497 }, { "epoch": 1.47, "grad_norm": 0.17325627043749642, "learning_rate": 3.052965506611212e-05, "loss": 0.3711, "step": 2498 }, { "epoch": 1.47, "grad_norm": 0.1738175346182311, "learning_rate": 3.0462069496681333e-05, "loss": 0.3447, "step": 2499 }, { "epoch": 1.47, "grad_norm": 0.2293883188659455, "learning_rate": 3.0394545375368212e-05, "loss": 0.3967, "step": 2500 }, { "epoch": 1.47, "grad_norm": 0.18780544734321805, "learning_rate": 3.0327082761841376e-05, "loss": 0.3927, "step": 2501 }, { "epoch": 1.47, "grad_norm": 0.21001427845942877, "learning_rate": 3.0259681715715094e-05, "loss": 0.425, "step": 2502 }, { "epoch": 1.47, "grad_norm": 0.1983145619953598, "learning_rate": 3.019234229654909e-05, "loss": 0.4236, "step": 2503 }, { "epoch": 1.47, "grad_norm": 0.20394563948631428, "learning_rate": 3.012506456384885e-05, "loss": 0.3835, "step": 2504 }, { "epoch": 1.48, "grad_norm": 0.2256167643229903, "learning_rate": 3.0057848577065194e-05, "loss": 0.4114, "step": 2505 }, { "epoch": 1.48, "grad_norm": 0.18958440413331032, "learning_rate": 2.9990694395594454e-05, "loss": 0.4128, "step": 2506 }, { "epoch": 1.48, "grad_norm": 0.21619989441807325, "learning_rate": 2.9923602078778267e-05, "loss": 0.4396, "step": 2507 }, { "epoch": 1.48, "grad_norm": 0.18543165865572497, "learning_rate": 2.9856571685903678e-05, "loss": 0.4119, "step": 2508 }, { "epoch": 1.48, "grad_norm": 0.18652376800981768, "learning_rate": 2.9789603276203006e-05, "loss": 0.4108, "step": 2509 }, { "epoch": 1.48, "grad_norm": 0.1877506760116236, "learning_rate": 2.972269690885372e-05, "loss": 0.3733, "step": 2510 }, { "epoch": 1.48, "grad_norm": 0.20117391667012713, "learning_rate": 2.9655852642978567e-05, "loss": 0.4053, "step": 2511 }, { "epoch": 1.48, "grad_norm": 0.19049978481040805, "learning_rate": 2.9589070537645346e-05, "loss": 0.3695, "step": 2512 }, { "epoch": 1.48, "grad_norm": 0.19377859226154356, "learning_rate": 2.952235065186697e-05, "loss": 0.4216, "step": 2513 }, { "epoch": 1.48, "grad_norm": 0.1877033492781889, "learning_rate": 2.945569304460136e-05, "loss": 0.3972, "step": 2514 }, { "epoch": 1.48, "grad_norm": 0.17561774343820402, "learning_rate": 2.9389097774751416e-05, "loss": 0.3944, "step": 2515 }, { "epoch": 1.48, "grad_norm": 0.23612382395272608, "learning_rate": 2.9322564901164872e-05, "loss": 0.4192, "step": 2516 }, { "epoch": 1.48, "grad_norm": 0.1804117654771822, "learning_rate": 2.9256094482634433e-05, "loss": 0.3803, "step": 2517 }, { "epoch": 1.48, "grad_norm": 0.1871646527168998, "learning_rate": 2.9189686577897547e-05, "loss": 0.363, "step": 2518 }, { "epoch": 1.48, "grad_norm": 0.18268971436832296, "learning_rate": 2.9123341245636494e-05, "loss": 0.3766, "step": 2519 }, { "epoch": 1.48, "grad_norm": 0.19640957670522896, "learning_rate": 2.9057058544478144e-05, "loss": 0.381, "step": 2520 }, { "epoch": 1.48, "grad_norm": 0.1806292879024908, "learning_rate": 2.8990838532994104e-05, "loss": 0.358, "step": 2521 }, { "epoch": 1.49, "grad_norm": 0.18220069066196212, "learning_rate": 2.8924681269700582e-05, "loss": 0.3923, "step": 2522 }, { "epoch": 1.49, "grad_norm": 0.1982043765920388, "learning_rate": 2.885858681305832e-05, "loss": 0.4287, "step": 2523 }, { "epoch": 1.49, "grad_norm": 0.17306560605324575, "learning_rate": 2.8792555221472573e-05, "loss": 0.3807, "step": 2524 }, { "epoch": 1.49, "grad_norm": 0.21490036640126703, "learning_rate": 2.8726586553293043e-05, "loss": 0.3915, "step": 2525 }, { "epoch": 1.49, "grad_norm": 0.1969710728166032, "learning_rate": 2.8660680866813782e-05, "loss": 0.412, "step": 2526 }, { "epoch": 1.49, "grad_norm": 0.18990724568801395, "learning_rate": 2.8594838220273256e-05, "loss": 0.3688, "step": 2527 }, { "epoch": 1.49, "grad_norm": 0.21050184435865085, "learning_rate": 2.8529058671854224e-05, "loss": 0.414, "step": 2528 }, { "epoch": 1.49, "grad_norm": 0.19588995136687634, "learning_rate": 2.846334227968359e-05, "loss": 0.3833, "step": 2529 }, { "epoch": 1.49, "grad_norm": 0.18766563465171782, "learning_rate": 2.8397689101832558e-05, "loss": 0.414, "step": 2530 }, { "epoch": 1.49, "grad_norm": 0.1837863513894188, "learning_rate": 2.8332099196316386e-05, "loss": 0.3927, "step": 2531 }, { "epoch": 1.49, "grad_norm": 0.18099305950603425, "learning_rate": 2.8266572621094588e-05, "loss": 0.3472, "step": 2532 }, { "epoch": 1.49, "grad_norm": 0.18324457098340038, "learning_rate": 2.8201109434070482e-05, "loss": 0.3938, "step": 2533 }, { "epoch": 1.49, "grad_norm": 0.18010794244730502, "learning_rate": 2.8135709693091516e-05, "loss": 0.3829, "step": 2534 }, { "epoch": 1.49, "grad_norm": 0.18178616693007432, "learning_rate": 2.807037345594907e-05, "loss": 0.3818, "step": 2535 }, { "epoch": 1.49, "grad_norm": 0.18482236442092914, "learning_rate": 2.8005100780378323e-05, "loss": 0.4043, "step": 2536 }, { "epoch": 1.49, "grad_norm": 0.1690501700146767, "learning_rate": 2.793989172405839e-05, "loss": 0.3519, "step": 2537 }, { "epoch": 1.49, "grad_norm": 0.16886286326694724, "learning_rate": 2.7874746344612114e-05, "loss": 0.367, "step": 2538 }, { "epoch": 1.5, "grad_norm": 0.1876546785525707, "learning_rate": 2.780966469960602e-05, "loss": 0.3876, "step": 2539 }, { "epoch": 1.5, "grad_norm": 0.2004024323089618, "learning_rate": 2.7744646846550448e-05, "loss": 0.4066, "step": 2540 }, { "epoch": 1.5, "grad_norm": 0.1785253701604336, "learning_rate": 2.7679692842899284e-05, "loss": 0.4027, "step": 2541 }, { "epoch": 1.5, "grad_norm": 0.1860290157709305, "learning_rate": 2.7614802746049938e-05, "loss": 0.395, "step": 2542 }, { "epoch": 1.5, "grad_norm": 0.16775418599650602, "learning_rate": 2.7549976613343452e-05, "loss": 0.3606, "step": 2543 }, { "epoch": 1.5, "grad_norm": 0.18567178220266478, "learning_rate": 2.7485214502064316e-05, "loss": 0.389, "step": 2544 }, { "epoch": 1.5, "grad_norm": 0.20351953451192095, "learning_rate": 2.7420516469440384e-05, "loss": 0.4492, "step": 2545 }, { "epoch": 1.5, "grad_norm": 0.18873302654643795, "learning_rate": 2.7355882572642944e-05, "loss": 0.427, "step": 2546 }, { "epoch": 1.5, "grad_norm": 0.2044457762700013, "learning_rate": 2.7291312868786624e-05, "loss": 0.3937, "step": 2547 }, { "epoch": 1.5, "grad_norm": 0.24462628450748491, "learning_rate": 2.7226807414929278e-05, "loss": 0.474, "step": 2548 }, { "epoch": 1.5, "grad_norm": 0.16432682018058387, "learning_rate": 2.7162366268072026e-05, "loss": 0.3517, "step": 2549 }, { "epoch": 1.5, "grad_norm": 0.19392679244276614, "learning_rate": 2.7097989485159137e-05, "loss": 0.429, "step": 2550 }, { "epoch": 1.5, "grad_norm": 0.16787805427224112, "learning_rate": 2.703367712307804e-05, "loss": 0.3644, "step": 2551 }, { "epoch": 1.5, "grad_norm": 0.20648943423458793, "learning_rate": 2.696942923865915e-05, "loss": 0.377, "step": 2552 }, { "epoch": 1.5, "grad_norm": 0.19572324346821007, "learning_rate": 2.6905245888676012e-05, "loss": 0.409, "step": 2553 }, { "epoch": 1.5, "grad_norm": 0.6184991261797215, "learning_rate": 2.68411271298451e-05, "loss": 0.4096, "step": 2554 }, { "epoch": 1.5, "grad_norm": 0.18889426170403964, "learning_rate": 2.6777073018825772e-05, "loss": 0.4038, "step": 2555 }, { "epoch": 1.51, "grad_norm": 0.18850468933905212, "learning_rate": 2.6713083612220314e-05, "loss": 0.3848, "step": 2556 }, { "epoch": 1.51, "grad_norm": 0.18923419740292627, "learning_rate": 2.6649158966573817e-05, "loss": 0.377, "step": 2557 }, { "epoch": 1.51, "grad_norm": 0.20595259493204982, "learning_rate": 2.6585299138374143e-05, "loss": 0.3942, "step": 2558 }, { "epoch": 1.51, "grad_norm": 0.20110742094369213, "learning_rate": 2.6521504184051892e-05, "loss": 0.4843, "step": 2559 }, { "epoch": 1.51, "grad_norm": 0.19875067878685518, "learning_rate": 2.6457774159980307e-05, "loss": 0.3911, "step": 2560 }, { "epoch": 1.51, "grad_norm": 0.18607601583404562, "learning_rate": 2.639410912247531e-05, "loss": 0.402, "step": 2561 }, { "epoch": 1.51, "grad_norm": 0.19804026395342145, "learning_rate": 2.63305091277953e-05, "loss": 0.3701, "step": 2562 }, { "epoch": 1.51, "grad_norm": 0.1851894040551206, "learning_rate": 2.6266974232141285e-05, "loss": 0.4279, "step": 2563 }, { "epoch": 1.51, "grad_norm": 0.20060869681257895, "learning_rate": 2.620350449165676e-05, "loss": 0.3891, "step": 2564 }, { "epoch": 1.51, "grad_norm": 0.17879505755560798, "learning_rate": 2.6140099962427533e-05, "loss": 0.3618, "step": 2565 }, { "epoch": 1.51, "grad_norm": 0.20607338545873524, "learning_rate": 2.6076760700481893e-05, "loss": 0.4254, "step": 2566 }, { "epoch": 1.51, "grad_norm": 0.18787894153970464, "learning_rate": 2.6013486761790427e-05, "loss": 0.3721, "step": 2567 }, { "epoch": 1.51, "grad_norm": 0.18901310393891138, "learning_rate": 2.5950278202265997e-05, "loss": 0.383, "step": 2568 }, { "epoch": 1.51, "grad_norm": 0.20353516701869728, "learning_rate": 2.588713507776368e-05, "loss": 0.3866, "step": 2569 }, { "epoch": 1.51, "grad_norm": 0.1959203531342037, "learning_rate": 2.582405744408076e-05, "loss": 0.3676, "step": 2570 }, { "epoch": 1.51, "grad_norm": 0.1728488366034335, "learning_rate": 2.5761045356956593e-05, "loss": 0.3619, "step": 2571 }, { "epoch": 1.52, "grad_norm": 0.17762490554986549, "learning_rate": 2.5698098872072652e-05, "loss": 0.3712, "step": 2572 }, { "epoch": 1.52, "grad_norm": 0.1943025267948541, "learning_rate": 2.5635218045052477e-05, "loss": 0.4136, "step": 2573 }, { "epoch": 1.52, "grad_norm": 0.17330976704038878, "learning_rate": 2.5572402931461493e-05, "loss": 0.393, "step": 2574 }, { "epoch": 1.52, "grad_norm": 0.18585903169270396, "learning_rate": 2.5509653586807125e-05, "loss": 0.3699, "step": 2575 }, { "epoch": 1.52, "grad_norm": 0.17889504516326968, "learning_rate": 2.5446970066538656e-05, "loss": 0.3894, "step": 2576 }, { "epoch": 1.52, "grad_norm": 0.19130109369636317, "learning_rate": 2.53843524260473e-05, "loss": 0.4073, "step": 2577 }, { "epoch": 1.52, "grad_norm": 0.19114939911466414, "learning_rate": 2.5321800720665856e-05, "loss": 0.4145, "step": 2578 }, { "epoch": 1.52, "grad_norm": 0.17470921774314174, "learning_rate": 2.5259315005669027e-05, "loss": 0.3453, "step": 2579 }, { "epoch": 1.52, "grad_norm": 0.18014253092191793, "learning_rate": 2.5196895336273163e-05, "loss": 0.3948, "step": 2580 }, { "epoch": 1.52, "grad_norm": 0.19376171678954113, "learning_rate": 2.513454176763618e-05, "loss": 0.3612, "step": 2581 }, { "epoch": 1.52, "grad_norm": 0.193545613814365, "learning_rate": 2.507225435485766e-05, "loss": 0.4536, "step": 2582 }, { "epoch": 1.52, "grad_norm": 0.1816454621102984, "learning_rate": 2.501003315297875e-05, "loss": 0.4009, "step": 2583 }, { "epoch": 1.52, "grad_norm": 0.18568315961597195, "learning_rate": 2.4947878216981945e-05, "loss": 0.3758, "step": 2584 }, { "epoch": 1.52, "grad_norm": 0.20625822474781916, "learning_rate": 2.4885789601791364e-05, "loss": 0.4358, "step": 2585 }, { "epoch": 1.52, "grad_norm": 0.19717438783756117, "learning_rate": 2.4823767362272455e-05, "loss": 0.3703, "step": 2586 }, { "epoch": 1.52, "grad_norm": 0.1857784997413179, "learning_rate": 2.4761811553231916e-05, "loss": 0.3822, "step": 2587 }, { "epoch": 1.52, "grad_norm": 0.20221940136029923, "learning_rate": 2.469992222941787e-05, "loss": 0.401, "step": 2588 }, { "epoch": 1.53, "grad_norm": 0.1933568592568705, "learning_rate": 2.4638099445519636e-05, "loss": 0.4101, "step": 2589 }, { "epoch": 1.53, "grad_norm": 0.17721112883493306, "learning_rate": 2.4576343256167766e-05, "loss": 0.3718, "step": 2590 }, { "epoch": 1.53, "grad_norm": 0.20592471637280918, "learning_rate": 2.4514653715933876e-05, "loss": 0.4132, "step": 2591 }, { "epoch": 1.53, "grad_norm": 0.19568943375143438, "learning_rate": 2.4453030879330784e-05, "loss": 0.4079, "step": 2592 }, { "epoch": 1.53, "grad_norm": 0.19080954173046305, "learning_rate": 2.4391474800812332e-05, "loss": 0.3834, "step": 2593 }, { "epoch": 1.53, "grad_norm": 0.17854767229872442, "learning_rate": 2.4329985534773358e-05, "loss": 0.4174, "step": 2594 }, { "epoch": 1.53, "grad_norm": 0.19135862577144352, "learning_rate": 2.4268563135549682e-05, "loss": 0.3876, "step": 2595 }, { "epoch": 1.53, "grad_norm": 0.18577643412648345, "learning_rate": 2.4207207657418042e-05, "loss": 0.3792, "step": 2596 }, { "epoch": 1.53, "grad_norm": 0.1829599821421232, "learning_rate": 2.4145919154595975e-05, "loss": 0.3881, "step": 2597 }, { "epoch": 1.53, "grad_norm": 0.2054197464506424, "learning_rate": 2.4084697681241906e-05, "loss": 0.399, "step": 2598 }, { "epoch": 1.53, "grad_norm": 0.1789713792446189, "learning_rate": 2.402354329145504e-05, "loss": 0.3907, "step": 2599 }, { "epoch": 1.53, "grad_norm": 0.1880784992003119, "learning_rate": 2.3962456039275206e-05, "loss": 0.4189, "step": 2600 }, { "epoch": 1.53, "grad_norm": 0.1954510623029123, "learning_rate": 2.3901435978682986e-05, "loss": 0.4032, "step": 2601 }, { "epoch": 1.53, "grad_norm": 0.18452853553069323, "learning_rate": 2.3840483163599582e-05, "loss": 0.3895, "step": 2602 }, { "epoch": 1.53, "grad_norm": 0.19828511642616103, "learning_rate": 2.3779597647886753e-05, "loss": 0.3709, "step": 2603 }, { "epoch": 1.53, "grad_norm": 0.17953056383513943, "learning_rate": 2.37187794853468e-05, "loss": 0.3992, "step": 2604 }, { "epoch": 1.53, "grad_norm": 0.2031943276255951, "learning_rate": 2.3658028729722502e-05, "loss": 0.437, "step": 2605 }, { "epoch": 1.54, "grad_norm": 0.19738049305029495, "learning_rate": 2.3597345434697093e-05, "loss": 0.393, "step": 2606 }, { "epoch": 1.54, "grad_norm": 0.1849596059995837, "learning_rate": 2.3536729653894118e-05, "loss": 0.3803, "step": 2607 }, { "epoch": 1.54, "grad_norm": 0.20204811834390532, "learning_rate": 2.3476181440877564e-05, "loss": 0.4105, "step": 2608 }, { "epoch": 1.54, "grad_norm": 0.24667469022404487, "learning_rate": 2.341570084915168e-05, "loss": 0.4185, "step": 2609 }, { "epoch": 1.54, "grad_norm": 0.20269213137231093, "learning_rate": 2.3355287932160884e-05, "loss": 0.4583, "step": 2610 }, { "epoch": 1.54, "grad_norm": 0.19877434103859673, "learning_rate": 2.329494274328988e-05, "loss": 0.4421, "step": 2611 }, { "epoch": 1.54, "grad_norm": 0.18687566792579868, "learning_rate": 2.3234665335863526e-05, "loss": 0.3901, "step": 2612 }, { "epoch": 1.54, "grad_norm": 0.19940981465040128, "learning_rate": 2.3174455763146717e-05, "loss": 0.4099, "step": 2613 }, { "epoch": 1.54, "grad_norm": 0.1823677638592108, "learning_rate": 2.3114314078344478e-05, "loss": 0.3668, "step": 2614 }, { "epoch": 1.54, "grad_norm": 0.19369581454319543, "learning_rate": 2.3054240334601805e-05, "loss": 0.3986, "step": 2615 }, { "epoch": 1.54, "grad_norm": 0.19415035095905245, "learning_rate": 2.2994234585003638e-05, "loss": 0.4157, "step": 2616 }, { "epoch": 1.54, "grad_norm": 0.1957312431961838, "learning_rate": 2.2934296882574847e-05, "loss": 0.4211, "step": 2617 }, { "epoch": 1.54, "grad_norm": 0.22121291068701435, "learning_rate": 2.287442728028021e-05, "loss": 0.4166, "step": 2618 }, { "epoch": 1.54, "grad_norm": 0.17201804281961042, "learning_rate": 2.2814625831024318e-05, "loss": 0.3583, "step": 2619 }, { "epoch": 1.54, "grad_norm": 0.18485977615841773, "learning_rate": 2.2754892587651434e-05, "loss": 0.3821, "step": 2620 }, { "epoch": 1.54, "grad_norm": 0.17947272170945186, "learning_rate": 2.2695227602945702e-05, "loss": 0.3834, "step": 2621 }, { "epoch": 1.54, "grad_norm": 0.19177743756919843, "learning_rate": 2.2635630929630904e-05, "loss": 0.392, "step": 2622 }, { "epoch": 1.55, "grad_norm": 0.18192144036137656, "learning_rate": 2.2576102620370364e-05, "loss": 0.3574, "step": 2623 }, { "epoch": 1.55, "grad_norm": 0.21560141298971364, "learning_rate": 2.251664272776709e-05, "loss": 0.4083, "step": 2624 }, { "epoch": 1.55, "grad_norm": 0.18320008573103155, "learning_rate": 2.2457251304363646e-05, "loss": 0.4345, "step": 2625 }, { "epoch": 1.55, "grad_norm": 0.17861793380088878, "learning_rate": 2.2397928402641988e-05, "loss": 0.3803, "step": 2626 }, { "epoch": 1.55, "grad_norm": 0.2066675002043351, "learning_rate": 2.2338674075023615e-05, "loss": 0.4367, "step": 2627 }, { "epoch": 1.55, "grad_norm": 0.18303979352935243, "learning_rate": 2.2279488373869416e-05, "loss": 0.3914, "step": 2628 }, { "epoch": 1.55, "grad_norm": 0.18526974271163213, "learning_rate": 2.2220371351479607e-05, "loss": 0.3683, "step": 2629 }, { "epoch": 1.55, "grad_norm": 0.19396331959195012, "learning_rate": 2.2161323060093742e-05, "loss": 0.432, "step": 2630 }, { "epoch": 1.55, "grad_norm": 0.17699796388171363, "learning_rate": 2.2102343551890627e-05, "loss": 0.332, "step": 2631 }, { "epoch": 1.55, "grad_norm": 0.1840940308979838, "learning_rate": 2.2043432878988313e-05, "loss": 0.3735, "step": 2632 }, { "epoch": 1.55, "grad_norm": 0.1833386406759228, "learning_rate": 2.198459109344395e-05, "loss": 0.3718, "step": 2633 }, { "epoch": 1.55, "grad_norm": 0.20277352966061787, "learning_rate": 2.1925818247253893e-05, "loss": 0.389, "step": 2634 }, { "epoch": 1.55, "grad_norm": 0.19414136777804827, "learning_rate": 2.186711439235356e-05, "loss": 0.3913, "step": 2635 }, { "epoch": 1.55, "grad_norm": 0.19538391282392667, "learning_rate": 2.180847958061737e-05, "loss": 0.4474, "step": 2636 }, { "epoch": 1.55, "grad_norm": 0.17521825056206217, "learning_rate": 2.174991386385876e-05, "loss": 0.3753, "step": 2637 }, { "epoch": 1.55, "grad_norm": 0.20633732426596804, "learning_rate": 2.169141729383011e-05, "loss": 0.393, "step": 2638 }, { "epoch": 1.56, "grad_norm": 0.1837522109909096, "learning_rate": 2.163298992222269e-05, "loss": 0.4211, "step": 2639 }, { "epoch": 1.56, "grad_norm": 0.1916084114095356, "learning_rate": 2.1574631800666635e-05, "loss": 0.3848, "step": 2640 }, { "epoch": 1.56, "grad_norm": 0.19292067652803527, "learning_rate": 2.1516342980730885e-05, "loss": 0.4343, "step": 2641 }, { "epoch": 1.56, "grad_norm": 0.18559141434436843, "learning_rate": 2.145812351392309e-05, "loss": 0.3903, "step": 2642 }, { "epoch": 1.56, "grad_norm": 0.18681580264278325, "learning_rate": 2.1399973451689682e-05, "loss": 0.4171, "step": 2643 }, { "epoch": 1.56, "grad_norm": 0.183518946539565, "learning_rate": 2.1341892845415766e-05, "loss": 0.4043, "step": 2644 }, { "epoch": 1.56, "grad_norm": 0.18031957343078933, "learning_rate": 2.1283881746424982e-05, "loss": 0.3932, "step": 2645 }, { "epoch": 1.56, "grad_norm": 0.19277340308330332, "learning_rate": 2.1225940205979645e-05, "loss": 0.3709, "step": 2646 }, { "epoch": 1.56, "grad_norm": 0.1793216787197076, "learning_rate": 2.1168068275280562e-05, "loss": 0.3646, "step": 2647 }, { "epoch": 1.56, "grad_norm": 0.18616507644272884, "learning_rate": 2.111026600546704e-05, "loss": 0.3917, "step": 2648 }, { "epoch": 1.56, "grad_norm": 0.20656052185088922, "learning_rate": 2.1052533447616817e-05, "loss": 0.4195, "step": 2649 }, { "epoch": 1.56, "grad_norm": 0.1871278424966707, "learning_rate": 2.0994870652746045e-05, "loss": 0.3503, "step": 2650 }, { "epoch": 1.56, "grad_norm": 0.17727320184190487, "learning_rate": 2.093727767180923e-05, "loss": 0.3923, "step": 2651 }, { "epoch": 1.56, "grad_norm": 0.20761253440270935, "learning_rate": 2.087975455569915e-05, "loss": 0.4767, "step": 2652 }, { "epoch": 1.56, "grad_norm": 0.18825241614671864, "learning_rate": 2.0822301355246877e-05, "loss": 0.3972, "step": 2653 }, { "epoch": 1.56, "grad_norm": 0.20401644685323356, "learning_rate": 2.0764918121221722e-05, "loss": 0.4012, "step": 2654 }, { "epoch": 1.56, "grad_norm": 0.17507114404738788, "learning_rate": 2.0707604904331103e-05, "loss": 0.3586, "step": 2655 }, { "epoch": 1.57, "grad_norm": 0.18781443778443094, "learning_rate": 2.0650361755220625e-05, "loss": 0.3892, "step": 2656 }, { "epoch": 1.57, "grad_norm": 0.1709386563523762, "learning_rate": 2.0593188724473956e-05, "loss": 0.344, "step": 2657 }, { "epoch": 1.57, "grad_norm": 0.1889048966807141, "learning_rate": 2.053608586261282e-05, "loss": 0.375, "step": 2658 }, { "epoch": 1.57, "grad_norm": 0.18455808123424355, "learning_rate": 2.04790532200969e-05, "loss": 0.3713, "step": 2659 }, { "epoch": 1.57, "grad_norm": 0.1754654094920607, "learning_rate": 2.042209084732387e-05, "loss": 0.3658, "step": 2660 }, { "epoch": 1.57, "grad_norm": 0.18022772531742068, "learning_rate": 2.0365198794629303e-05, "loss": 0.3729, "step": 2661 }, { "epoch": 1.57, "grad_norm": 0.19023673110705866, "learning_rate": 2.030837711228657e-05, "loss": 0.4075, "step": 2662 }, { "epoch": 1.57, "grad_norm": 0.19573207608030416, "learning_rate": 2.0251625850506927e-05, "loss": 0.4093, "step": 2663 }, { "epoch": 1.57, "grad_norm": 0.19666613310850647, "learning_rate": 2.0194945059439417e-05, "loss": 0.4187, "step": 2664 }, { "epoch": 1.57, "grad_norm": 0.19091841391757786, "learning_rate": 2.01383347891707e-05, "loss": 0.3908, "step": 2665 }, { "epoch": 1.57, "grad_norm": 0.20185751082030062, "learning_rate": 2.0081795089725253e-05, "loss": 0.4204, "step": 2666 }, { "epoch": 1.57, "grad_norm": 0.19302561622447217, "learning_rate": 2.0025326011065148e-05, "loss": 0.4143, "step": 2667 }, { "epoch": 1.57, "grad_norm": 0.18613507094571158, "learning_rate": 1.996892760308998e-05, "loss": 0.3884, "step": 2668 }, { "epoch": 1.57, "grad_norm": 0.18617443797375527, "learning_rate": 1.9912599915637e-05, "loss": 0.3809, "step": 2669 }, { "epoch": 1.57, "grad_norm": 0.19085092433993356, "learning_rate": 1.9856342998480913e-05, "loss": 0.3633, "step": 2670 }, { "epoch": 1.57, "grad_norm": 0.18701970725839678, "learning_rate": 1.9800156901333855e-05, "loss": 0.4068, "step": 2671 }, { "epoch": 1.57, "grad_norm": 0.1974135928674328, "learning_rate": 1.9744041673845448e-05, "loss": 0.4193, "step": 2672 }, { "epoch": 1.58, "grad_norm": 0.19094778297421405, "learning_rate": 1.9687997365602663e-05, "loss": 0.4151, "step": 2673 }, { "epoch": 1.58, "grad_norm": 0.19765550896862327, "learning_rate": 1.963202402612978e-05, "loss": 0.4001, "step": 2674 }, { "epoch": 1.58, "grad_norm": 0.1781031913824185, "learning_rate": 1.9576121704888396e-05, "loss": 0.3829, "step": 2675 }, { "epoch": 1.58, "grad_norm": 0.19762765974127958, "learning_rate": 1.9520290451277358e-05, "loss": 0.3953, "step": 2676 }, { "epoch": 1.58, "grad_norm": 0.18030388133141192, "learning_rate": 1.946453031463269e-05, "loss": 0.3794, "step": 2677 }, { "epoch": 1.58, "grad_norm": 0.19733771540385325, "learning_rate": 1.9408841344227547e-05, "loss": 0.426, "step": 2678 }, { "epoch": 1.58, "grad_norm": 0.18984191947112966, "learning_rate": 1.9353223589272252e-05, "loss": 0.3906, "step": 2679 }, { "epoch": 1.58, "grad_norm": 0.1901537855371303, "learning_rate": 1.92976770989142e-05, "loss": 0.3756, "step": 2680 }, { "epoch": 1.58, "grad_norm": 0.16848529460481557, "learning_rate": 1.9242201922237736e-05, "loss": 0.3277, "step": 2681 }, { "epoch": 1.58, "grad_norm": 0.18134548951153853, "learning_rate": 1.918679810826427e-05, "loss": 0.3688, "step": 2682 }, { "epoch": 1.58, "grad_norm": 0.18559056941559682, "learning_rate": 1.91314657059521e-05, "loss": 0.3886, "step": 2683 }, { "epoch": 1.58, "grad_norm": 0.1786840191849677, "learning_rate": 1.907620476419645e-05, "loss": 0.3788, "step": 2684 }, { "epoch": 1.58, "grad_norm": 0.1994322658620708, "learning_rate": 1.9021015331829396e-05, "loss": 0.3893, "step": 2685 }, { "epoch": 1.58, "grad_norm": 0.1817868235867725, "learning_rate": 1.896589745761982e-05, "loss": 0.3957, "step": 2686 }, { "epoch": 1.58, "grad_norm": 0.17602823761099434, "learning_rate": 1.891085119027334e-05, "loss": 0.3554, "step": 2687 }, { "epoch": 1.58, "grad_norm": 0.19267039852977308, "learning_rate": 1.885587657843232e-05, "loss": 0.4044, "step": 2688 }, { "epoch": 1.58, "grad_norm": 0.18301072528946535, "learning_rate": 1.8800973670675827e-05, "loss": 0.358, "step": 2689 }, { "epoch": 1.59, "grad_norm": 0.19200649093878824, "learning_rate": 1.874614251551957e-05, "loss": 0.3852, "step": 2690 }, { "epoch": 1.59, "grad_norm": 0.17449134464711386, "learning_rate": 1.8691383161415764e-05, "loss": 0.3737, "step": 2691 }, { "epoch": 1.59, "grad_norm": 0.19645308745000037, "learning_rate": 1.8636695656753278e-05, "loss": 0.4098, "step": 2692 }, { "epoch": 1.59, "grad_norm": 0.18024402249202065, "learning_rate": 1.8582080049857465e-05, "loss": 0.3609, "step": 2693 }, { "epoch": 1.59, "grad_norm": 0.1908219232395505, "learning_rate": 1.8527536388990106e-05, "loss": 0.4077, "step": 2694 }, { "epoch": 1.59, "grad_norm": 0.18326557695226708, "learning_rate": 1.8473064722349453e-05, "loss": 0.3833, "step": 2695 }, { "epoch": 1.59, "grad_norm": 0.18452653482256087, "learning_rate": 1.841866509807013e-05, "loss": 0.3728, "step": 2696 }, { "epoch": 1.59, "grad_norm": 0.17820866873765373, "learning_rate": 1.8364337564223057e-05, "loss": 0.3632, "step": 2697 }, { "epoch": 1.59, "grad_norm": 0.18998043544259907, "learning_rate": 1.831008216881548e-05, "loss": 0.4227, "step": 2698 }, { "epoch": 1.59, "grad_norm": 0.19715562552177654, "learning_rate": 1.8255898959790953e-05, "loss": 0.4085, "step": 2699 }, { "epoch": 1.59, "grad_norm": 0.18232780274953944, "learning_rate": 1.820178798502913e-05, "loss": 0.3419, "step": 2700 }, { "epoch": 1.59, "grad_norm": 0.17608648085623013, "learning_rate": 1.8147749292345917e-05, "loss": 0.344, "step": 2701 }, { "epoch": 1.59, "grad_norm": 0.1773025116918539, "learning_rate": 1.809378292949333e-05, "loss": 0.3661, "step": 2702 }, { "epoch": 1.59, "grad_norm": 0.20513926640304037, "learning_rate": 1.8039888944159444e-05, "loss": 0.421, "step": 2703 }, { "epoch": 1.59, "grad_norm": 0.18068919279336962, "learning_rate": 1.798606738396843e-05, "loss": 0.3815, "step": 2704 }, { "epoch": 1.59, "grad_norm": 0.17767261806592177, "learning_rate": 1.79323182964804e-05, "loss": 0.3626, "step": 2705 }, { "epoch": 1.6, "grad_norm": 0.18267825701494264, "learning_rate": 1.787864172919147e-05, "loss": 0.3943, "step": 2706 }, { "epoch": 1.6, "grad_norm": 0.19565515278454887, "learning_rate": 1.7825037729533632e-05, "loss": 0.3804, "step": 2707 }, { "epoch": 1.6, "grad_norm": 0.1985124018586052, "learning_rate": 1.7771506344874778e-05, "loss": 0.387, "step": 2708 }, { "epoch": 1.6, "grad_norm": 0.18707453082209516, "learning_rate": 1.7718047622518652e-05, "loss": 0.3809, "step": 2709 }, { "epoch": 1.6, "grad_norm": 0.18965578493204102, "learning_rate": 1.7664661609704704e-05, "loss": 0.3993, "step": 2710 }, { "epoch": 1.6, "grad_norm": 0.1963544372024192, "learning_rate": 1.761134835360826e-05, "loss": 0.4351, "step": 2711 }, { "epoch": 1.6, "grad_norm": 0.1988795587027005, "learning_rate": 1.755810790134029e-05, "loss": 0.4362, "step": 2712 }, { "epoch": 1.6, "grad_norm": 0.1842729514301608, "learning_rate": 1.750494029994737e-05, "loss": 0.356, "step": 2713 }, { "epoch": 1.6, "grad_norm": 0.19009932018603531, "learning_rate": 1.745184559641181e-05, "loss": 0.4163, "step": 2714 }, { "epoch": 1.6, "grad_norm": 0.1759133002009477, "learning_rate": 1.7398823837651447e-05, "loss": 0.354, "step": 2715 }, { "epoch": 1.6, "grad_norm": 0.18882431184786572, "learning_rate": 1.7345875070519624e-05, "loss": 0.3412, "step": 2716 }, { "epoch": 1.6, "grad_norm": 0.19579173000957384, "learning_rate": 1.729299934180525e-05, "loss": 0.3995, "step": 2717 }, { "epoch": 1.6, "grad_norm": 0.19977064472735956, "learning_rate": 1.7240196698232657e-05, "loss": 0.4335, "step": 2718 }, { "epoch": 1.6, "grad_norm": 0.18971473912529013, "learning_rate": 1.7187467186461626e-05, "loss": 0.406, "step": 2719 }, { "epoch": 1.6, "grad_norm": 0.17812152081411728, "learning_rate": 1.713481085308728e-05, "loss": 0.3564, "step": 2720 }, { "epoch": 1.6, "grad_norm": 0.2017249184456053, "learning_rate": 1.708222774464008e-05, "loss": 0.3942, "step": 2721 }, { "epoch": 1.6, "grad_norm": 0.19519933753526805, "learning_rate": 1.702971790758582e-05, "loss": 0.3991, "step": 2722 }, { "epoch": 1.61, "grad_norm": 0.18644811213473475, "learning_rate": 1.6977281388325472e-05, "loss": 0.4005, "step": 2723 }, { "epoch": 1.61, "grad_norm": 0.18195473908167434, "learning_rate": 1.6924918233195286e-05, "loss": 0.376, "step": 2724 }, { "epoch": 1.61, "grad_norm": 0.1965395495642495, "learning_rate": 1.687262848846668e-05, "loss": 0.4223, "step": 2725 }, { "epoch": 1.61, "grad_norm": 0.1778406839404565, "learning_rate": 1.6820412200346147e-05, "loss": 0.3916, "step": 2726 }, { "epoch": 1.61, "grad_norm": 0.19717405460574805, "learning_rate": 1.6768269414975314e-05, "loss": 0.4256, "step": 2727 }, { "epoch": 1.61, "grad_norm": 0.17888651432068586, "learning_rate": 1.6716200178430852e-05, "loss": 0.3389, "step": 2728 }, { "epoch": 1.61, "grad_norm": 0.2047795225110605, "learning_rate": 1.6664204536724436e-05, "loss": 0.3886, "step": 2729 }, { "epoch": 1.61, "grad_norm": 0.18488557166519895, "learning_rate": 1.6612282535802716e-05, "loss": 0.3952, "step": 2730 }, { "epoch": 1.61, "grad_norm": 0.19387533792584713, "learning_rate": 1.656043422154725e-05, "loss": 0.4038, "step": 2731 }, { "epoch": 1.61, "grad_norm": 0.19855401567331812, "learning_rate": 1.6508659639774503e-05, "loss": 0.3573, "step": 2732 }, { "epoch": 1.61, "grad_norm": 0.1910322360488063, "learning_rate": 1.6456958836235747e-05, "loss": 0.3986, "step": 2733 }, { "epoch": 1.61, "grad_norm": 0.17910459908780496, "learning_rate": 1.64053318566171e-05, "loss": 0.3815, "step": 2734 }, { "epoch": 1.61, "grad_norm": 0.1791557436007829, "learning_rate": 1.635377874653945e-05, "loss": 0.3654, "step": 2735 }, { "epoch": 1.61, "grad_norm": 0.19730991105935478, "learning_rate": 1.6302299551558353e-05, "loss": 0.3962, "step": 2736 }, { "epoch": 1.61, "grad_norm": 0.19454703085067263, "learning_rate": 1.6250894317164088e-05, "loss": 0.3852, "step": 2737 }, { "epoch": 1.61, "grad_norm": 0.18825033606033495, "learning_rate": 1.6199563088781588e-05, "loss": 0.4103, "step": 2738 }, { "epoch": 1.61, "grad_norm": 0.17855372979030418, "learning_rate": 1.6148305911770377e-05, "loss": 0.3717, "step": 2739 }, { "epoch": 1.62, "grad_norm": 0.17946602161319378, "learning_rate": 1.6097122831424538e-05, "loss": 0.3583, "step": 2740 }, { "epoch": 1.62, "grad_norm": 0.18081197865723575, "learning_rate": 1.604601389297271e-05, "loss": 0.3771, "step": 2741 }, { "epoch": 1.62, "grad_norm": 0.2597749579645915, "learning_rate": 1.5994979141577936e-05, "loss": 0.4466, "step": 2742 }, { "epoch": 1.62, "grad_norm": 0.1819143066308158, "learning_rate": 1.5944018622337764e-05, "loss": 0.3431, "step": 2743 }, { "epoch": 1.62, "grad_norm": 0.18034997883963755, "learning_rate": 1.5893132380284183e-05, "loss": 0.3839, "step": 2744 }, { "epoch": 1.62, "grad_norm": 0.196898440986257, "learning_rate": 1.584232046038343e-05, "loss": 0.4312, "step": 2745 }, { "epoch": 1.62, "grad_norm": 0.18466489312815543, "learning_rate": 1.5791582907536152e-05, "loss": 0.3577, "step": 2746 }, { "epoch": 1.62, "grad_norm": 0.19649178998426725, "learning_rate": 1.5740919766577288e-05, "loss": 0.417, "step": 2747 }, { "epoch": 1.62, "grad_norm": 0.22045623088641314, "learning_rate": 1.5690331082276023e-05, "loss": 0.3905, "step": 2748 }, { "epoch": 1.62, "grad_norm": 0.19000235214368424, "learning_rate": 1.5639816899335645e-05, "loss": 0.4092, "step": 2749 }, { "epoch": 1.62, "grad_norm": 0.1902486275954528, "learning_rate": 1.5589377262393735e-05, "loss": 0.3654, "step": 2750 }, { "epoch": 1.62, "grad_norm": 0.19778769806066843, "learning_rate": 1.5539012216021954e-05, "loss": 0.3993, "step": 2751 }, { "epoch": 1.62, "grad_norm": 0.3270150637834885, "learning_rate": 1.5488721804726003e-05, "loss": 0.3809, "step": 2752 }, { "epoch": 1.62, "grad_norm": 0.18103395554668633, "learning_rate": 1.5438506072945703e-05, "loss": 0.3828, "step": 2753 }, { "epoch": 1.62, "grad_norm": 0.18629332641729748, "learning_rate": 1.5388365065054845e-05, "loss": 0.365, "step": 2754 }, { "epoch": 1.62, "grad_norm": 0.1912439937088145, "learning_rate": 1.533829882536121e-05, "loss": 0.4163, "step": 2755 }, { "epoch": 1.62, "grad_norm": 0.2031011683329402, "learning_rate": 1.5288307398106484e-05, "loss": 0.4449, "step": 2756 }, { "epoch": 1.63, "grad_norm": 0.18874978222699812, "learning_rate": 1.5238390827466287e-05, "loss": 0.3684, "step": 2757 }, { "epoch": 1.63, "grad_norm": 0.18504340910662923, "learning_rate": 1.5188549157550013e-05, "loss": 0.3854, "step": 2758 }, { "epoch": 1.63, "grad_norm": 0.19077975170750153, "learning_rate": 1.5138782432400943e-05, "loss": 0.3871, "step": 2759 }, { "epoch": 1.63, "grad_norm": 0.21232319504204727, "learning_rate": 1.50890906959961e-05, "loss": 0.459, "step": 2760 }, { "epoch": 1.63, "grad_norm": 0.18086892611822036, "learning_rate": 1.5039473992246278e-05, "loss": 0.3655, "step": 2761 }, { "epoch": 1.63, "grad_norm": 0.1973248762783153, "learning_rate": 1.4989932364995873e-05, "loss": 0.4215, "step": 2762 }, { "epoch": 1.63, "grad_norm": 0.2036105383458429, "learning_rate": 1.4940465858023055e-05, "loss": 0.4068, "step": 2763 }, { "epoch": 1.63, "grad_norm": 0.18528333902490204, "learning_rate": 1.4891074515039548e-05, "loss": 0.3973, "step": 2764 }, { "epoch": 1.63, "grad_norm": 0.19995603824690625, "learning_rate": 1.4841758379690663e-05, "loss": 0.4255, "step": 2765 }, { "epoch": 1.63, "grad_norm": 0.17787180969113067, "learning_rate": 1.479251749555527e-05, "loss": 0.3738, "step": 2766 }, { "epoch": 1.63, "grad_norm": 0.1831308365493359, "learning_rate": 1.4743351906145741e-05, "loss": 0.3912, "step": 2767 }, { "epoch": 1.63, "grad_norm": 0.20430223291165012, "learning_rate": 1.4694261654907881e-05, "loss": 0.4028, "step": 2768 }, { "epoch": 1.63, "grad_norm": 0.19756371486336502, "learning_rate": 1.4645246785220934e-05, "loss": 0.3881, "step": 2769 }, { "epoch": 1.63, "grad_norm": 0.21343700040883345, "learning_rate": 1.4596307340397597e-05, "loss": 0.4174, "step": 2770 }, { "epoch": 1.63, "grad_norm": 0.18574834851208513, "learning_rate": 1.45474433636838e-05, "loss": 0.3787, "step": 2771 }, { "epoch": 1.63, "grad_norm": 0.186713913712865, "learning_rate": 1.4498654898258857e-05, "loss": 0.4028, "step": 2772 }, { "epoch": 1.63, "grad_norm": 0.17862363269208836, "learning_rate": 1.4449941987235371e-05, "loss": 0.3366, "step": 2773 }, { "epoch": 1.64, "grad_norm": 0.17991971311499375, "learning_rate": 1.4401304673659143e-05, "loss": 0.385, "step": 2774 }, { "epoch": 1.64, "grad_norm": 0.18266258486092432, "learning_rate": 1.4352743000509172e-05, "loss": 0.3758, "step": 2775 }, { "epoch": 1.64, "grad_norm": 0.19932881267671296, "learning_rate": 1.4304257010697642e-05, "loss": 0.4001, "step": 2776 }, { "epoch": 1.64, "grad_norm": 0.18660263876456015, "learning_rate": 1.4255846747069857e-05, "loss": 0.3784, "step": 2777 }, { "epoch": 1.64, "grad_norm": 0.18446968617714987, "learning_rate": 1.4207512252404143e-05, "loss": 0.3762, "step": 2778 }, { "epoch": 1.64, "grad_norm": 0.18536201861937218, "learning_rate": 1.4159253569411956e-05, "loss": 0.3468, "step": 2779 }, { "epoch": 1.64, "grad_norm": 0.17535615409314131, "learning_rate": 1.4111070740737731e-05, "loss": 0.3398, "step": 2780 }, { "epoch": 1.64, "grad_norm": 0.17771981382314816, "learning_rate": 1.406296380895883e-05, "loss": 0.3851, "step": 2781 }, { "epoch": 1.64, "grad_norm": 0.19105270450058937, "learning_rate": 1.4014932816585602e-05, "loss": 0.4133, "step": 2782 }, { "epoch": 1.64, "grad_norm": 0.1885733896714408, "learning_rate": 1.3966977806061277e-05, "loss": 0.3945, "step": 2783 }, { "epoch": 1.64, "grad_norm": 0.18694162099039605, "learning_rate": 1.3919098819761922e-05, "loss": 0.4086, "step": 2784 }, { "epoch": 1.64, "grad_norm": 0.19829924594841047, "learning_rate": 1.387129589999646e-05, "loss": 0.4231, "step": 2785 }, { "epoch": 1.64, "grad_norm": 0.19892181327078298, "learning_rate": 1.3823569089006604e-05, "loss": 0.4179, "step": 2786 }, { "epoch": 1.64, "grad_norm": 0.19725907955910138, "learning_rate": 1.3775918428966716e-05, "loss": 0.4141, "step": 2787 }, { "epoch": 1.64, "grad_norm": 0.18349490528053952, "learning_rate": 1.372834396198397e-05, "loss": 0.3965, "step": 2788 }, { "epoch": 1.64, "grad_norm": 0.19253183211826913, "learning_rate": 1.3680845730098191e-05, "loss": 0.3932, "step": 2789 }, { "epoch": 1.65, "grad_norm": 0.19353375854479396, "learning_rate": 1.3633423775281816e-05, "loss": 0.364, "step": 2790 }, { "epoch": 1.65, "grad_norm": 0.18771329690536745, "learning_rate": 1.3586078139439851e-05, "loss": 0.4034, "step": 2791 }, { "epoch": 1.65, "grad_norm": 0.192207218124855, "learning_rate": 1.3538808864409947e-05, "loss": 0.3893, "step": 2792 }, { "epoch": 1.65, "grad_norm": 0.18658210647129536, "learning_rate": 1.3491615991962225e-05, "loss": 0.3801, "step": 2793 }, { "epoch": 1.65, "grad_norm": 0.19794402673622702, "learning_rate": 1.3444499563799262e-05, "loss": 0.4126, "step": 2794 }, { "epoch": 1.65, "grad_norm": 0.19194076122619697, "learning_rate": 1.339745962155613e-05, "loss": 0.4238, "step": 2795 }, { "epoch": 1.65, "grad_norm": 0.19500823355157348, "learning_rate": 1.3350496206800334e-05, "loss": 0.4001, "step": 2796 }, { "epoch": 1.65, "grad_norm": 0.1834243107917586, "learning_rate": 1.3303609361031655e-05, "loss": 0.3846, "step": 2797 }, { "epoch": 1.65, "grad_norm": 0.1829659299315814, "learning_rate": 1.3256799125682317e-05, "loss": 0.3965, "step": 2798 }, { "epoch": 1.65, "grad_norm": 0.18958779724287603, "learning_rate": 1.3210065542116812e-05, "loss": 0.387, "step": 2799 }, { "epoch": 1.65, "grad_norm": 0.19480397222419207, "learning_rate": 1.316340865163188e-05, "loss": 0.3961, "step": 2800 }, { "epoch": 1.65, "grad_norm": 0.1814371475093744, "learning_rate": 1.31168284954565e-05, "loss": 0.3711, "step": 2801 }, { "epoch": 1.65, "grad_norm": 0.1899328058081375, "learning_rate": 1.3070325114751881e-05, "loss": 0.3997, "step": 2802 }, { "epoch": 1.65, "grad_norm": 0.19259551695163712, "learning_rate": 1.3023898550611313e-05, "loss": 0.4041, "step": 2803 }, { "epoch": 1.65, "grad_norm": 0.1825667974418399, "learning_rate": 1.297754884406025e-05, "loss": 0.3843, "step": 2804 }, { "epoch": 1.65, "grad_norm": 0.18195055678377903, "learning_rate": 1.293127603605625e-05, "loss": 0.3875, "step": 2805 }, { "epoch": 1.65, "grad_norm": 0.18700493318379097, "learning_rate": 1.2885080167488905e-05, "loss": 0.3876, "step": 2806 }, { "epoch": 1.66, "grad_norm": 0.18830079032859265, "learning_rate": 1.2838961279179762e-05, "loss": 0.3746, "step": 2807 }, { "epoch": 1.66, "grad_norm": 0.1735496795197628, "learning_rate": 1.279291941188241e-05, "loss": 0.3856, "step": 2808 }, { "epoch": 1.66, "grad_norm": 0.192993836978478, "learning_rate": 1.274695460628238e-05, "loss": 0.4133, "step": 2809 }, { "epoch": 1.66, "grad_norm": 0.1840059334442372, "learning_rate": 1.2701066902997061e-05, "loss": 0.3622, "step": 2810 }, { "epoch": 1.66, "grad_norm": 0.1823341646749537, "learning_rate": 1.2655256342575738e-05, "loss": 0.36, "step": 2811 }, { "epoch": 1.66, "grad_norm": 0.1817364403500079, "learning_rate": 1.2609522965499553e-05, "loss": 0.3747, "step": 2812 }, { "epoch": 1.66, "grad_norm": 0.17394778008068928, "learning_rate": 1.2563866812181357e-05, "loss": 0.391, "step": 2813 }, { "epoch": 1.66, "grad_norm": 0.18598102335942115, "learning_rate": 1.2518287922965854e-05, "loss": 0.3709, "step": 2814 }, { "epoch": 1.66, "grad_norm": 0.19552125120914168, "learning_rate": 1.2472786338129439e-05, "loss": 0.4518, "step": 2815 }, { "epoch": 1.66, "grad_norm": 0.19033443568738462, "learning_rate": 1.2427362097880168e-05, "loss": 0.4108, "step": 2816 }, { "epoch": 1.66, "grad_norm": 0.18646893524050182, "learning_rate": 1.238201524235778e-05, "loss": 0.3519, "step": 2817 }, { "epoch": 1.66, "grad_norm": 0.17270013189629863, "learning_rate": 1.2336745811633643e-05, "loss": 0.3615, "step": 2818 }, { "epoch": 1.66, "grad_norm": 0.1891145725571965, "learning_rate": 1.229155384571069e-05, "loss": 0.3945, "step": 2819 }, { "epoch": 1.66, "grad_norm": 0.18665124633114744, "learning_rate": 1.224643938452339e-05, "loss": 0.3618, "step": 2820 }, { "epoch": 1.66, "grad_norm": 0.20704796005105727, "learning_rate": 1.220140246793775e-05, "loss": 0.3846, "step": 2821 }, { "epoch": 1.66, "grad_norm": 0.19746824679309988, "learning_rate": 1.215644313575126e-05, "loss": 0.4023, "step": 2822 }, { "epoch": 1.66, "grad_norm": 0.1866997380267449, "learning_rate": 1.2111561427692786e-05, "loss": 0.4096, "step": 2823 }, { "epoch": 1.67, "grad_norm": 0.19478625576917388, "learning_rate": 1.2066757383422667e-05, "loss": 0.4302, "step": 2824 }, { "epoch": 1.67, "grad_norm": 0.2001333286875977, "learning_rate": 1.2022031042532612e-05, "loss": 0.3908, "step": 2825 }, { "epoch": 1.67, "grad_norm": 0.18803738328817945, "learning_rate": 1.1977382444545616e-05, "loss": 0.4035, "step": 2826 }, { "epoch": 1.67, "grad_norm": 0.18664571940089442, "learning_rate": 1.1932811628915996e-05, "loss": 0.4117, "step": 2827 }, { "epoch": 1.67, "grad_norm": 0.18372861978548846, "learning_rate": 1.1888318635029417e-05, "loss": 0.3907, "step": 2828 }, { "epoch": 1.67, "grad_norm": 0.18380904565923992, "learning_rate": 1.1843903502202636e-05, "loss": 0.3882, "step": 2829 }, { "epoch": 1.67, "grad_norm": 0.19358317009559659, "learning_rate": 1.1799566269683693e-05, "loss": 0.3826, "step": 2830 }, { "epoch": 1.67, "grad_norm": 0.17867740875213367, "learning_rate": 1.1755306976651793e-05, "loss": 0.3682, "step": 2831 }, { "epoch": 1.67, "grad_norm": 0.21554588740187808, "learning_rate": 1.1711125662217248e-05, "loss": 0.3772, "step": 2832 }, { "epoch": 1.67, "grad_norm": 0.20421582491112336, "learning_rate": 1.1667022365421432e-05, "loss": 0.3831, "step": 2833 }, { "epoch": 1.67, "grad_norm": 0.17962040426030446, "learning_rate": 1.1622997125236834e-05, "loss": 0.3695, "step": 2834 }, { "epoch": 1.67, "grad_norm": 0.18661938671499903, "learning_rate": 1.1579049980566947e-05, "loss": 0.3953, "step": 2835 }, { "epoch": 1.67, "grad_norm": 0.18399903543308602, "learning_rate": 1.153518097024624e-05, "loss": 0.3529, "step": 2836 }, { "epoch": 1.67, "grad_norm": 0.20792985146502785, "learning_rate": 1.1491390133040147e-05, "loss": 0.4302, "step": 2837 }, { "epoch": 1.67, "grad_norm": 0.18321892919222757, "learning_rate": 1.1447677507645049e-05, "loss": 0.4273, "step": 2838 }, { "epoch": 1.67, "grad_norm": 0.21343818023745764, "learning_rate": 1.1404043132688157e-05, "loss": 0.4064, "step": 2839 }, { "epoch": 1.67, "grad_norm": 0.18841704880001608, "learning_rate": 1.1360487046727576e-05, "loss": 0.4103, "step": 2840 }, { "epoch": 1.68, "grad_norm": 0.17912270675750835, "learning_rate": 1.1317009288252234e-05, "loss": 0.3582, "step": 2841 }, { "epoch": 1.68, "grad_norm": 0.19542193773343466, "learning_rate": 1.1273609895681813e-05, "loss": 0.4183, "step": 2842 }, { "epoch": 1.68, "grad_norm": 0.18945513945796663, "learning_rate": 1.1230288907366759e-05, "loss": 0.3921, "step": 2843 }, { "epoch": 1.68, "grad_norm": 0.17852007031481354, "learning_rate": 1.118704636158826e-05, "loss": 0.3624, "step": 2844 }, { "epoch": 1.68, "grad_norm": 0.18904710881927972, "learning_rate": 1.1143882296558162e-05, "loss": 0.3832, "step": 2845 }, { "epoch": 1.68, "grad_norm": 0.20283633611433424, "learning_rate": 1.1100796750418963e-05, "loss": 0.4434, "step": 2846 }, { "epoch": 1.68, "grad_norm": 0.19302674167647627, "learning_rate": 1.1057789761243776e-05, "loss": 0.3669, "step": 2847 }, { "epoch": 1.68, "grad_norm": 0.20066283789878311, "learning_rate": 1.1014861367036322e-05, "loss": 0.4058, "step": 2848 }, { "epoch": 1.68, "grad_norm": 0.18205872386879324, "learning_rate": 1.0972011605730814e-05, "loss": 0.3502, "step": 2849 }, { "epoch": 1.68, "grad_norm": 0.18209351538178542, "learning_rate": 1.0929240515192018e-05, "loss": 0.3612, "step": 2850 }, { "epoch": 1.68, "grad_norm": 0.1799463748541795, "learning_rate": 1.0886548133215212e-05, "loss": 0.3901, "step": 2851 }, { "epoch": 1.68, "grad_norm": 0.18457553645487926, "learning_rate": 1.0843934497526043e-05, "loss": 0.4013, "step": 2852 }, { "epoch": 1.68, "grad_norm": 0.18027717433183602, "learning_rate": 1.0801399645780642e-05, "loss": 0.3787, "step": 2853 }, { "epoch": 1.68, "grad_norm": 0.1788257136658512, "learning_rate": 1.0758943615565486e-05, "loss": 0.3503, "step": 2854 }, { "epoch": 1.68, "grad_norm": 0.17764028591316763, "learning_rate": 1.0716566444397425e-05, "loss": 0.3475, "step": 2855 }, { "epoch": 1.68, "grad_norm": 0.18189866888371164, "learning_rate": 1.067426816972359e-05, "loss": 0.3567, "step": 2856 }, { "epoch": 1.69, "grad_norm": 0.18276929910949236, "learning_rate": 1.0632048828921459e-05, "loss": 0.3738, "step": 2857 }, { "epoch": 1.69, "grad_norm": 0.18985574998616447, "learning_rate": 1.0589908459298659e-05, "loss": 0.3892, "step": 2858 }, { "epoch": 1.69, "grad_norm": 0.19425071062777888, "learning_rate": 1.05478470980931e-05, "loss": 0.4057, "step": 2859 }, { "epoch": 1.69, "grad_norm": 0.18272515913706877, "learning_rate": 1.0505864782472886e-05, "loss": 0.3598, "step": 2860 }, { "epoch": 1.69, "grad_norm": 0.19848730153196067, "learning_rate": 1.046396154953626e-05, "loss": 0.4218, "step": 2861 }, { "epoch": 1.69, "grad_norm": 0.20197995519491221, "learning_rate": 1.042213743631153e-05, "loss": 0.3715, "step": 2862 }, { "epoch": 1.69, "grad_norm": 0.18568926557727763, "learning_rate": 1.038039247975714e-05, "loss": 0.3887, "step": 2863 }, { "epoch": 1.69, "grad_norm": 0.19679607011383277, "learning_rate": 1.0338726716761593e-05, "loss": 0.4043, "step": 2864 }, { "epoch": 1.69, "grad_norm": 0.20749038036636144, "learning_rate": 1.0297140184143383e-05, "loss": 0.3977, "step": 2865 }, { "epoch": 1.69, "grad_norm": 0.18887797359179764, "learning_rate": 1.0255632918651014e-05, "loss": 0.3484, "step": 2866 }, { "epoch": 1.69, "grad_norm": 0.21288749940839216, "learning_rate": 1.0214204956962947e-05, "loss": 0.4228, "step": 2867 }, { "epoch": 1.69, "grad_norm": 0.19545998387534477, "learning_rate": 1.0172856335687509e-05, "loss": 0.364, "step": 2868 }, { "epoch": 1.69, "grad_norm": 0.19067129616360895, "learning_rate": 1.0131587091362982e-05, "loss": 0.4257, "step": 2869 }, { "epoch": 1.69, "grad_norm": 0.19945048332024073, "learning_rate": 1.0090397260457508e-05, "loss": 0.3868, "step": 2870 }, { "epoch": 1.69, "grad_norm": 0.17970331087008132, "learning_rate": 1.004928687936898e-05, "loss": 0.3876, "step": 2871 }, { "epoch": 1.69, "grad_norm": 0.17901897921602997, "learning_rate": 1.0008255984425141e-05, "loss": 0.3608, "step": 2872 }, { "epoch": 1.69, "grad_norm": 0.2074985731201921, "learning_rate": 9.967304611883543e-06, "loss": 0.4021, "step": 2873 }, { "epoch": 1.7, "grad_norm": 0.18134001903506683, "learning_rate": 9.926432797931351e-06, "loss": 0.3543, "step": 2874 }, { "epoch": 1.7, "grad_norm": 0.18272241073998963, "learning_rate": 9.885640578685518e-06, "loss": 0.3707, "step": 2875 }, { "epoch": 1.7, "grad_norm": 0.19096229524187255, "learning_rate": 9.84492799019261e-06, "loss": 0.4044, "step": 2876 }, { "epoch": 1.7, "grad_norm": 0.19114097422640478, "learning_rate": 9.804295068428881e-06, "loss": 0.4191, "step": 2877 }, { "epoch": 1.7, "grad_norm": 0.1927940842818163, "learning_rate": 9.763741849300124e-06, "loss": 0.4368, "step": 2878 }, { "epoch": 1.7, "grad_norm": 0.1932222722751544, "learning_rate": 9.723268368641735e-06, "loss": 0.367, "step": 2879 }, { "epoch": 1.7, "grad_norm": 0.18876583554332269, "learning_rate": 9.682874662218644e-06, "loss": 0.3693, "step": 2880 }, { "epoch": 1.7, "grad_norm": 0.19350688957925188, "learning_rate": 9.642560765725295e-06, "loss": 0.3935, "step": 2881 }, { "epoch": 1.7, "grad_norm": 0.19395538454804856, "learning_rate": 9.602326714785592e-06, "loss": 0.4062, "step": 2882 }, { "epoch": 1.7, "grad_norm": 0.18687737837536744, "learning_rate": 9.562172544952907e-06, "loss": 0.4056, "step": 2883 }, { "epoch": 1.7, "grad_norm": 0.17094450733612235, "learning_rate": 9.52209829170998e-06, "loss": 0.3433, "step": 2884 }, { "epoch": 1.7, "grad_norm": 0.19198528742425536, "learning_rate": 9.482103990468971e-06, "loss": 0.3576, "step": 2885 }, { "epoch": 1.7, "grad_norm": 0.191964636104055, "learning_rate": 9.44218967657139e-06, "loss": 0.4226, "step": 2886 }, { "epoch": 1.7, "grad_norm": 0.18284789434021473, "learning_rate": 9.40235538528802e-06, "loss": 0.3675, "step": 2887 }, { "epoch": 1.7, "grad_norm": 0.1905504505631898, "learning_rate": 9.362601151818984e-06, "loss": 0.387, "step": 2888 }, { "epoch": 1.7, "grad_norm": 0.2004227156859444, "learning_rate": 9.322927011293637e-06, "loss": 0.4, "step": 2889 }, { "epoch": 1.7, "grad_norm": 0.18446499947016987, "learning_rate": 9.283332998770555e-06, "loss": 0.3589, "step": 2890 }, { "epoch": 1.71, "grad_norm": 0.1792298555119123, "learning_rate": 9.24381914923752e-06, "loss": 0.3492, "step": 2891 }, { "epoch": 1.71, "grad_norm": 0.2002209834519155, "learning_rate": 9.204385497611467e-06, "loss": 0.4051, "step": 2892 }, { "epoch": 1.71, "grad_norm": 0.20471621996640477, "learning_rate": 9.165032078738489e-06, "loss": 0.4161, "step": 2893 }, { "epoch": 1.71, "grad_norm": 0.18319314415608173, "learning_rate": 9.125758927393724e-06, "loss": 0.3779, "step": 2894 }, { "epoch": 1.71, "grad_norm": 0.18673289511587204, "learning_rate": 9.086566078281434e-06, "loss": 0.3594, "step": 2895 }, { "epoch": 1.71, "grad_norm": 0.19224444681640385, "learning_rate": 9.047453566034914e-06, "loss": 0.3768, "step": 2896 }, { "epoch": 1.71, "grad_norm": 0.19306598799560065, "learning_rate": 9.008421425216417e-06, "loss": 0.4097, "step": 2897 }, { "epoch": 1.71, "grad_norm": 0.19161756654149822, "learning_rate": 8.969469690317245e-06, "loss": 0.3973, "step": 2898 }, { "epoch": 1.71, "grad_norm": 0.2012867849626023, "learning_rate": 8.930598395757595e-06, "loss": 0.4146, "step": 2899 }, { "epoch": 1.71, "grad_norm": 0.1741533788328624, "learning_rate": 8.891807575886624e-06, "loss": 0.3594, "step": 2900 }, { "epoch": 1.71, "grad_norm": 0.19231361448939627, "learning_rate": 8.853097264982357e-06, "loss": 0.356, "step": 2901 }, { "epoch": 1.71, "grad_norm": 0.20533186644938736, "learning_rate": 8.814467497251677e-06, "loss": 0.3739, "step": 2902 }, { "epoch": 1.71, "grad_norm": 0.18698584808737279, "learning_rate": 8.775918306830266e-06, "loss": 0.4046, "step": 2903 }, { "epoch": 1.71, "grad_norm": 0.2104814318764746, "learning_rate": 8.737449727782642e-06, "loss": 0.4079, "step": 2904 }, { "epoch": 1.71, "grad_norm": 0.20098428618573008, "learning_rate": 8.699061794102093e-06, "loss": 0.4048, "step": 2905 }, { "epoch": 1.71, "grad_norm": 0.18507212012634458, "learning_rate": 8.660754539710625e-06, "loss": 0.3818, "step": 2906 }, { "epoch": 1.71, "grad_norm": 0.2014523497453203, "learning_rate": 8.62252799845893e-06, "loss": 0.4183, "step": 2907 }, { "epoch": 1.72, "grad_norm": 0.18290858907170823, "learning_rate": 8.584382204126385e-06, "loss": 0.3671, "step": 2908 }, { "epoch": 1.72, "grad_norm": 0.18983843999716807, "learning_rate": 8.546317190421106e-06, "loss": 0.3863, "step": 2909 }, { "epoch": 1.72, "grad_norm": 0.19999120835019119, "learning_rate": 8.508332990979673e-06, "loss": 0.3815, "step": 2910 }, { "epoch": 1.72, "grad_norm": 0.1751899801069206, "learning_rate": 8.47042963936736e-06, "loss": 0.3701, "step": 2911 }, { "epoch": 1.72, "grad_norm": 0.19502109628725386, "learning_rate": 8.432607169077977e-06, "loss": 0.3755, "step": 2912 }, { "epoch": 1.72, "grad_norm": 0.19565978172851314, "learning_rate": 8.394865613533832e-06, "loss": 0.3673, "step": 2913 }, { "epoch": 1.72, "grad_norm": 0.19759320259101212, "learning_rate": 8.357205006085756e-06, "loss": 0.4318, "step": 2914 }, { "epoch": 1.72, "grad_norm": 0.1859624988201233, "learning_rate": 8.319625380013074e-06, "loss": 0.4083, "step": 2915 }, { "epoch": 1.72, "grad_norm": 0.19061883218141462, "learning_rate": 8.282126768523468e-06, "loss": 0.4053, "step": 2916 }, { "epoch": 1.72, "grad_norm": 0.18514292413007089, "learning_rate": 8.24470920475312e-06, "loss": 0.3414, "step": 2917 }, { "epoch": 1.72, "grad_norm": 0.19488059960726256, "learning_rate": 8.207372721766572e-06, "loss": 0.3973, "step": 2918 }, { "epoch": 1.72, "grad_norm": 0.17718229513021916, "learning_rate": 8.170117352556695e-06, "loss": 0.3763, "step": 2919 }, { "epoch": 1.72, "grad_norm": 0.20246941633114438, "learning_rate": 8.132943130044667e-06, "loss": 0.3813, "step": 2920 }, { "epoch": 1.72, "grad_norm": 0.19085709706361953, "learning_rate": 8.095850087080015e-06, "loss": 0.4044, "step": 2921 }, { "epoch": 1.72, "grad_norm": 0.2038351893649488, "learning_rate": 8.058838256440492e-06, "loss": 0.3962, "step": 2922 }, { "epoch": 1.72, "grad_norm": 0.21825503424463988, "learning_rate": 8.021907670832074e-06, "loss": 0.3541, "step": 2923 }, { "epoch": 1.73, "grad_norm": 0.180708799816823, "learning_rate": 7.985058362888975e-06, "loss": 0.4023, "step": 2924 }, { "epoch": 1.73, "grad_norm": 0.20126430165940656, "learning_rate": 7.948290365173584e-06, "loss": 0.4045, "step": 2925 }, { "epoch": 1.73, "grad_norm": 0.18866439227980727, "learning_rate": 7.91160371017644e-06, "loss": 0.3817, "step": 2926 }, { "epoch": 1.73, "grad_norm": 0.19487415886389572, "learning_rate": 7.874998430316172e-06, "loss": 0.407, "step": 2927 }, { "epoch": 1.73, "grad_norm": 0.18579899661300278, "learning_rate": 7.838474557939545e-06, "loss": 0.3885, "step": 2928 }, { "epoch": 1.73, "grad_norm": 0.18411710692418848, "learning_rate": 7.802032125321345e-06, "loss": 0.3622, "step": 2929 }, { "epoch": 1.73, "grad_norm": 0.20281391997383588, "learning_rate": 7.765671164664423e-06, "loss": 0.4359, "step": 2930 }, { "epoch": 1.73, "grad_norm": 0.18810742810049738, "learning_rate": 7.72939170809962e-06, "loss": 0.3533, "step": 2931 }, { "epoch": 1.73, "grad_norm": 0.1967944384539896, "learning_rate": 7.693193787685782e-06, "loss": 0.3973, "step": 2932 }, { "epoch": 1.73, "grad_norm": 0.17731253391948779, "learning_rate": 7.657077435409643e-06, "loss": 0.3552, "step": 2933 }, { "epoch": 1.73, "grad_norm": 0.1816288509685908, "learning_rate": 7.621042683185931e-06, "loss": 0.3718, "step": 2934 }, { "epoch": 1.73, "grad_norm": 0.20764843361130358, "learning_rate": 7.585089562857217e-06, "loss": 0.4214, "step": 2935 }, { "epoch": 1.73, "grad_norm": 0.19453851257013952, "learning_rate": 7.549218106193967e-06, "loss": 0.411, "step": 2936 }, { "epoch": 1.73, "grad_norm": 0.21279984261284868, "learning_rate": 7.513428344894458e-06, "loss": 0.3401, "step": 2937 }, { "epoch": 1.73, "grad_norm": 0.18343133868234535, "learning_rate": 7.4777203105848125e-06, "loss": 0.3941, "step": 2938 }, { "epoch": 1.73, "grad_norm": 0.21052952203623176, "learning_rate": 7.4420940348188806e-06, "loss": 0.4359, "step": 2939 }, { "epoch": 1.73, "grad_norm": 0.2265859274846711, "learning_rate": 7.406549549078312e-06, "loss": 0.4307, "step": 2940 }, { "epoch": 1.74, "grad_norm": 0.18545526211162194, "learning_rate": 7.371086884772471e-06, "loss": 0.3763, "step": 2941 }, { "epoch": 1.74, "grad_norm": 0.21364237947548975, "learning_rate": 7.335706073238391e-06, "loss": 0.4378, "step": 2942 }, { "epoch": 1.74, "grad_norm": 0.1893654587422108, "learning_rate": 7.300407145740806e-06, "loss": 0.3961, "step": 2943 }, { "epoch": 1.74, "grad_norm": 0.19588854106498704, "learning_rate": 7.265190133472089e-06, "loss": 0.3607, "step": 2944 }, { "epoch": 1.74, "grad_norm": 0.2191471293218054, "learning_rate": 7.230055067552211e-06, "loss": 0.4441, "step": 2945 }, { "epoch": 1.74, "grad_norm": 0.19082213620990843, "learning_rate": 7.1950019790287485e-06, "loss": 0.3766, "step": 2946 }, { "epoch": 1.74, "grad_norm": 0.19297814729942536, "learning_rate": 7.160030898876835e-06, "loss": 0.406, "step": 2947 }, { "epoch": 1.74, "grad_norm": 0.19765593940930495, "learning_rate": 7.12514185799914e-06, "loss": 0.4105, "step": 2948 }, { "epoch": 1.74, "grad_norm": 0.18194885663930477, "learning_rate": 7.090334887225792e-06, "loss": 0.3886, "step": 2949 }, { "epoch": 1.74, "grad_norm": 0.18021052671227944, "learning_rate": 7.055610017314463e-06, "loss": 0.3517, "step": 2950 }, { "epoch": 1.74, "grad_norm": 0.19181467145717865, "learning_rate": 7.020967278950253e-06, "loss": 0.3959, "step": 2951 }, { "epoch": 1.74, "grad_norm": 0.18275022534861474, "learning_rate": 6.98640670274564e-06, "loss": 0.3728, "step": 2952 }, { "epoch": 1.74, "grad_norm": 0.19157406478323333, "learning_rate": 6.9519283192405525e-06, "loss": 0.3882, "step": 2953 }, { "epoch": 1.74, "grad_norm": 0.18949485590643386, "learning_rate": 6.917532158902318e-06, "loss": 0.3978, "step": 2954 }, { "epoch": 1.74, "grad_norm": 0.1989258914761517, "learning_rate": 6.883218252125512e-06, "loss": 0.4186, "step": 2955 }, { "epoch": 1.74, "grad_norm": 0.18665299033631447, "learning_rate": 6.848986629232079e-06, "loss": 0.3755, "step": 2956 }, { "epoch": 1.74, "grad_norm": 0.19251540869035402, "learning_rate": 6.814837320471279e-06, "loss": 0.4082, "step": 2957 }, { "epoch": 1.75, "grad_norm": 0.18901165432013456, "learning_rate": 6.780770356019561e-06, "loss": 0.3702, "step": 2958 }, { "epoch": 1.75, "grad_norm": 0.19203320591549375, "learning_rate": 6.746785765980679e-06, "loss": 0.3636, "step": 2959 }, { "epoch": 1.75, "grad_norm": 0.22235792675190075, "learning_rate": 6.712883580385554e-06, "loss": 0.4052, "step": 2960 }, { "epoch": 1.75, "grad_norm": 0.18708099149404153, "learning_rate": 6.679063829192311e-06, "loss": 0.3926, "step": 2961 }, { "epoch": 1.75, "grad_norm": 0.18494080916307828, "learning_rate": 6.645326542286223e-06, "loss": 0.3751, "step": 2962 }, { "epoch": 1.75, "grad_norm": 0.18946623107272353, "learning_rate": 6.611671749479697e-06, "loss": 0.3647, "step": 2963 }, { "epoch": 1.75, "grad_norm": 0.18705353389459098, "learning_rate": 6.578099480512256e-06, "loss": 0.3778, "step": 2964 }, { "epoch": 1.75, "grad_norm": 0.1916670849042115, "learning_rate": 6.5446097650504355e-06, "loss": 0.3752, "step": 2965 }, { "epoch": 1.75, "grad_norm": 0.21603562470418328, "learning_rate": 6.5112026326878965e-06, "loss": 0.4069, "step": 2966 }, { "epoch": 1.75, "grad_norm": 0.19485054525464968, "learning_rate": 6.477878112945301e-06, "loss": 0.3866, "step": 2967 }, { "epoch": 1.75, "grad_norm": 0.1855274419319167, "learning_rate": 6.444636235270285e-06, "loss": 0.3967, "step": 2968 }, { "epoch": 1.75, "grad_norm": 0.19320539411671053, "learning_rate": 6.411477029037494e-06, "loss": 0.4241, "step": 2969 }, { "epoch": 1.75, "grad_norm": 0.20112976004377922, "learning_rate": 6.378400523548489e-06, "loss": 0.4192, "step": 2970 }, { "epoch": 1.75, "grad_norm": 0.18417473736719064, "learning_rate": 6.345406748031768e-06, "loss": 0.3716, "step": 2971 }, { "epoch": 1.75, "grad_norm": 0.1840100228031329, "learning_rate": 6.312495731642731e-06, "loss": 0.3885, "step": 2972 }, { "epoch": 1.75, "grad_norm": 0.22949045550103406, "learning_rate": 6.279667503463638e-06, "loss": 0.4162, "step": 2973 }, { "epoch": 1.75, "grad_norm": 0.19675031568235163, "learning_rate": 6.24692209250356e-06, "loss": 0.4307, "step": 2974 }, { "epoch": 1.76, "grad_norm": 0.18972862959837244, "learning_rate": 6.21425952769843e-06, "loss": 0.3751, "step": 2975 }, { "epoch": 1.76, "grad_norm": 0.17665794307839597, "learning_rate": 6.18167983791097e-06, "loss": 0.3554, "step": 2976 }, { "epoch": 1.76, "grad_norm": 0.1832311142244817, "learning_rate": 6.149183051930662e-06, "loss": 0.3818, "step": 2977 }, { "epoch": 1.76, "grad_norm": 0.19764654343188384, "learning_rate": 6.116769198473693e-06, "loss": 0.3851, "step": 2978 }, { "epoch": 1.76, "grad_norm": 0.18592846465779592, "learning_rate": 6.084438306183015e-06, "loss": 0.381, "step": 2979 }, { "epoch": 1.76, "grad_norm": 0.18860776190892709, "learning_rate": 6.052190403628244e-06, "loss": 0.3898, "step": 2980 }, { "epoch": 1.76, "grad_norm": 0.1932698353278176, "learning_rate": 6.020025519305672e-06, "loss": 0.4136, "step": 2981 }, { "epoch": 1.76, "grad_norm": 0.19722848524162306, "learning_rate": 5.98794368163823e-06, "loss": 0.4136, "step": 2982 }, { "epoch": 1.76, "grad_norm": 0.18199038244482568, "learning_rate": 5.955944918975476e-06, "loss": 0.36, "step": 2983 }, { "epoch": 1.76, "grad_norm": 0.19911096620484098, "learning_rate": 5.924029259593511e-06, "loss": 0.3826, "step": 2984 }, { "epoch": 1.76, "grad_norm": 0.19078439496902141, "learning_rate": 5.892196731695043e-06, "loss": 0.4054, "step": 2985 }, { "epoch": 1.76, "grad_norm": 0.19821026684553988, "learning_rate": 5.860447363409327e-06, "loss": 0.4049, "step": 2986 }, { "epoch": 1.76, "grad_norm": 0.20139870437661192, "learning_rate": 5.8287811827920865e-06, "loss": 0.4185, "step": 2987 }, { "epoch": 1.76, "grad_norm": 0.18318901706477717, "learning_rate": 5.7971982178255835e-06, "loss": 0.3834, "step": 2988 }, { "epoch": 1.76, "grad_norm": 0.20668888021525425, "learning_rate": 5.765698496418515e-06, "loss": 0.4023, "step": 2989 }, { "epoch": 1.76, "grad_norm": 0.1896019068304225, "learning_rate": 5.734282046406025e-06, "loss": 0.3881, "step": 2990 }, { "epoch": 1.77, "grad_norm": 0.1954645183614734, "learning_rate": 5.702948895549698e-06, "loss": 0.406, "step": 2991 }, { "epoch": 1.77, "grad_norm": 0.19180659148700469, "learning_rate": 5.671699071537473e-06, "loss": 0.4175, "step": 2992 }, { "epoch": 1.77, "grad_norm": 0.20921814241950143, "learning_rate": 5.6405326019836835e-06, "loss": 0.4527, "step": 2993 }, { "epoch": 1.77, "grad_norm": 0.17413574779071542, "learning_rate": 5.609449514428977e-06, "loss": 0.3323, "step": 2994 }, { "epoch": 1.77, "grad_norm": 0.18660960465501725, "learning_rate": 5.5784498363403605e-06, "loss": 0.3812, "step": 2995 }, { "epoch": 1.77, "grad_norm": 0.1925241579835085, "learning_rate": 5.547533595111109e-06, "loss": 0.3914, "step": 2996 }, { "epoch": 1.77, "grad_norm": 0.1841251913859023, "learning_rate": 5.5167008180607385e-06, "loss": 0.3804, "step": 2997 }, { "epoch": 1.77, "grad_norm": 0.18889703992146564, "learning_rate": 5.485951532435063e-06, "loss": 0.3536, "step": 2998 }, { "epoch": 1.77, "grad_norm": 0.1863670029115911, "learning_rate": 5.455285765406126e-06, "loss": 0.3524, "step": 2999 }, { "epoch": 1.77, "grad_norm": 0.1756904397321326, "learning_rate": 5.424703544072107e-06, "loss": 0.3421, "step": 3000 }, { "epoch": 1.77, "grad_norm": 0.2085191353918397, "learning_rate": 5.3942048954574e-06, "loss": 0.3811, "step": 3001 }, { "epoch": 1.77, "grad_norm": 0.19480406801885236, "learning_rate": 5.363789846512546e-06, "loss": 0.4007, "step": 3002 }, { "epoch": 1.77, "grad_norm": 0.18354644009378338, "learning_rate": 5.333458424114202e-06, "loss": 0.3883, "step": 3003 }, { "epoch": 1.77, "grad_norm": 0.2027662999476591, "learning_rate": 5.303210655065138e-06, "loss": 0.3775, "step": 3004 }, { "epoch": 1.77, "grad_norm": 0.17580837553023873, "learning_rate": 5.273046566094198e-06, "loss": 0.3537, "step": 3005 }, { "epoch": 1.77, "grad_norm": 0.20765612035318065, "learning_rate": 5.2429661838562684e-06, "loss": 0.465, "step": 3006 }, { "epoch": 1.77, "grad_norm": 0.19278792658868227, "learning_rate": 5.212969534932299e-06, "loss": 0.3653, "step": 3007 }, { "epoch": 1.78, "grad_norm": 0.19513017429795937, "learning_rate": 5.18305664582922e-06, "loss": 0.4025, "step": 3008 }, { "epoch": 1.78, "grad_norm": 0.19127354333428667, "learning_rate": 5.153227542979955e-06, "loss": 0.4056, "step": 3009 }, { "epoch": 1.78, "grad_norm": 0.20131318938926027, "learning_rate": 5.123482252743384e-06, "loss": 0.4293, "step": 3010 }, { "epoch": 1.78, "grad_norm": 0.20410517958580796, "learning_rate": 5.093820801404314e-06, "loss": 0.4065, "step": 3011 }, { "epoch": 1.78, "grad_norm": 0.18704971564363876, "learning_rate": 5.064243215173525e-06, "loss": 0.3946, "step": 3012 }, { "epoch": 1.78, "grad_norm": 0.17567995302964098, "learning_rate": 5.034749520187599e-06, "loss": 0.3911, "step": 3013 }, { "epoch": 1.78, "grad_norm": 0.18580034034263418, "learning_rate": 5.005339742509052e-06, "loss": 0.3824, "step": 3014 }, { "epoch": 1.78, "grad_norm": 0.18865028556075208, "learning_rate": 4.97601390812622e-06, "loss": 0.3644, "step": 3015 }, { "epoch": 1.78, "grad_norm": 0.1780081966739988, "learning_rate": 4.9467720429532626e-06, "loss": 0.3715, "step": 3016 }, { "epoch": 1.78, "grad_norm": 0.19988120677078622, "learning_rate": 4.917614172830165e-06, "loss": 0.4045, "step": 3017 }, { "epoch": 1.78, "grad_norm": 0.18819370470191035, "learning_rate": 4.888540323522639e-06, "loss": 0.3869, "step": 3018 }, { "epoch": 1.78, "grad_norm": 0.1795741313544112, "learning_rate": 4.859550520722212e-06, "loss": 0.3849, "step": 3019 }, { "epoch": 1.78, "grad_norm": 0.19368920456635835, "learning_rate": 4.8306447900460795e-06, "loss": 0.3751, "step": 3020 }, { "epoch": 1.78, "grad_norm": 0.19386497788215812, "learning_rate": 4.8018231570371775e-06, "loss": 0.4061, "step": 3021 }, { "epoch": 1.78, "grad_norm": 0.2043390708744847, "learning_rate": 4.773085647164155e-06, "loss": 0.4794, "step": 3022 }, { "epoch": 1.78, "grad_norm": 0.19633532001180456, "learning_rate": 4.744432285821254e-06, "loss": 0.4068, "step": 3023 }, { "epoch": 1.78, "grad_norm": 0.19373876041409038, "learning_rate": 4.7158630983284106e-06, "loss": 0.3835, "step": 3024 }, { "epoch": 1.79, "grad_norm": 0.17598407117172582, "learning_rate": 4.687378109931184e-06, "loss": 0.3597, "step": 3025 }, { "epoch": 1.79, "grad_norm": 0.19456749065947174, "learning_rate": 4.658977345800697e-06, "loss": 0.3696, "step": 3026 }, { "epoch": 1.79, "grad_norm": 0.20959412480815218, "learning_rate": 4.630660831033673e-06, "loss": 0.4393, "step": 3027 }, { "epoch": 1.79, "grad_norm": 0.18703532104575968, "learning_rate": 4.6024285906523855e-06, "loss": 0.383, "step": 3028 }, { "epoch": 1.79, "grad_norm": 0.20315455063945598, "learning_rate": 4.574280649604601e-06, "loss": 0.3698, "step": 3029 }, { "epoch": 1.79, "grad_norm": 0.18663063418997797, "learning_rate": 4.546217032763645e-06, "loss": 0.3697, "step": 3030 }, { "epoch": 1.79, "grad_norm": 0.19424594577337717, "learning_rate": 4.518237764928301e-06, "loss": 0.3825, "step": 3031 }, { "epoch": 1.79, "grad_norm": 0.19229780394926582, "learning_rate": 4.490342870822828e-06, "loss": 0.4045, "step": 3032 }, { "epoch": 1.79, "grad_norm": 0.19087778057394364, "learning_rate": 4.462532375096895e-06, "loss": 0.4272, "step": 3033 }, { "epoch": 1.79, "grad_norm": 0.1861966013754029, "learning_rate": 4.434806302325634e-06, "loss": 0.3641, "step": 3034 }, { "epoch": 1.79, "grad_norm": 0.2016550514880452, "learning_rate": 4.407164677009568e-06, "loss": 0.4028, "step": 3035 }, { "epoch": 1.79, "grad_norm": 0.18467782571726843, "learning_rate": 4.37960752357458e-06, "loss": 0.365, "step": 3036 }, { "epoch": 1.79, "grad_norm": 0.17953854507068792, "learning_rate": 4.3521348663719045e-06, "loss": 0.3518, "step": 3037 }, { "epoch": 1.79, "grad_norm": 0.18886356823427813, "learning_rate": 4.324746729678142e-06, "loss": 0.3646, "step": 3038 }, { "epoch": 1.79, "grad_norm": 0.2064592201774944, "learning_rate": 4.297443137695156e-06, "loss": 0.412, "step": 3039 }, { "epoch": 1.79, "grad_norm": 0.1944505486212464, "learning_rate": 4.270224114550147e-06, "loss": 0.3692, "step": 3040 }, { "epoch": 1.79, "grad_norm": 0.20673212324986717, "learning_rate": 4.243089684295576e-06, "loss": 0.4313, "step": 3041 }, { "epoch": 1.8, "grad_norm": 0.19246473196675232, "learning_rate": 4.216039870909094e-06, "loss": 0.3982, "step": 3042 }, { "epoch": 1.8, "grad_norm": 0.18922516737788958, "learning_rate": 4.189074698293693e-06, "loss": 0.3655, "step": 3043 }, { "epoch": 1.8, "grad_norm": 0.19078819475088432, "learning_rate": 4.1621941902774906e-06, "loss": 0.385, "step": 3044 }, { "epoch": 1.8, "grad_norm": 0.19181915011403114, "learning_rate": 4.1353983706137745e-06, "loss": 0.393, "step": 3045 }, { "epoch": 1.8, "grad_norm": 0.17059509967536488, "learning_rate": 4.108687262981048e-06, "loss": 0.3286, "step": 3046 }, { "epoch": 1.8, "grad_norm": 0.18191261156882021, "learning_rate": 4.0820608909829416e-06, "loss": 0.4133, "step": 3047 }, { "epoch": 1.8, "grad_norm": 0.18689231031706133, "learning_rate": 4.055519278148201e-06, "loss": 0.3549, "step": 3048 }, { "epoch": 1.8, "grad_norm": 0.19008551894380415, "learning_rate": 4.029062447930665e-06, "loss": 0.3862, "step": 3049 }, { "epoch": 1.8, "grad_norm": 0.17864720895991723, "learning_rate": 4.002690423709277e-06, "loss": 0.3628, "step": 3050 }, { "epoch": 1.8, "grad_norm": 0.20418332555315316, "learning_rate": 3.976403228788017e-06, "loss": 0.3849, "step": 3051 }, { "epoch": 1.8, "grad_norm": 0.22285648605209982, "learning_rate": 3.950200886395916e-06, "loss": 0.3925, "step": 3052 }, { "epoch": 1.8, "grad_norm": 0.19498916492484897, "learning_rate": 3.9240834196870195e-06, "loss": 0.3784, "step": 3053 }, { "epoch": 1.8, "grad_norm": 0.18714101236614172, "learning_rate": 3.898050851740398e-06, "loss": 0.3816, "step": 3054 }, { "epoch": 1.8, "grad_norm": 0.20015720362037948, "learning_rate": 3.872103205560052e-06, "loss": 0.3864, "step": 3055 }, { "epoch": 1.8, "grad_norm": 0.18942369033523968, "learning_rate": 3.846240504074961e-06, "loss": 0.3845, "step": 3056 }, { "epoch": 1.8, "grad_norm": 0.17566746603927816, "learning_rate": 3.820462770139066e-06, "loss": 0.3675, "step": 3057 }, { "epoch": 1.81, "grad_norm": 0.20157919882361208, "learning_rate": 3.7947700265311913e-06, "loss": 0.3932, "step": 3058 }, { "epoch": 1.81, "grad_norm": 0.19956018543699966, "learning_rate": 3.7691622959550754e-06, "loss": 0.4065, "step": 3059 }, { "epoch": 1.81, "grad_norm": 0.18289114123090686, "learning_rate": 3.743639601039317e-06, "loss": 0.3618, "step": 3060 }, { "epoch": 1.81, "grad_norm": 0.1937354421427421, "learning_rate": 3.718201964337409e-06, "loss": 0.4134, "step": 3061 }, { "epoch": 1.81, "grad_norm": 0.22282895589554233, "learning_rate": 3.6928494083276367e-06, "loss": 0.442, "step": 3062 }, { "epoch": 1.81, "grad_norm": 0.19567047724989875, "learning_rate": 3.6675819554131464e-06, "loss": 0.3947, "step": 3063 }, { "epoch": 1.81, "grad_norm": 0.2048875031927258, "learning_rate": 3.6423996279218442e-06, "loss": 0.41, "step": 3064 }, { "epoch": 1.81, "grad_norm": 0.20308999667135372, "learning_rate": 3.6173024481064187e-06, "loss": 0.4615, "step": 3065 }, { "epoch": 1.81, "grad_norm": 0.19228712713285978, "learning_rate": 3.5922904381443413e-06, "loss": 0.427, "step": 3066 }, { "epoch": 1.81, "grad_norm": 0.19252248368996067, "learning_rate": 3.5673636201378204e-06, "loss": 0.3685, "step": 3067 }, { "epoch": 1.81, "grad_norm": 0.20530020014607442, "learning_rate": 3.5425220161137474e-06, "loss": 0.4255, "step": 3068 }, { "epoch": 1.81, "grad_norm": 0.18145381029299965, "learning_rate": 3.51776564802373e-06, "loss": 0.3678, "step": 3069 }, { "epoch": 1.81, "grad_norm": 0.19310693625619313, "learning_rate": 3.4930945377440795e-06, "loss": 0.3899, "step": 3070 }, { "epoch": 1.81, "grad_norm": 0.1907414474503299, "learning_rate": 3.468508707075757e-06, "loss": 0.3819, "step": 3071 }, { "epoch": 1.81, "grad_norm": 0.1965869704282478, "learning_rate": 3.44400817774434e-06, "loss": 0.3789, "step": 3072 }, { "epoch": 1.81, "grad_norm": 0.21310167998529464, "learning_rate": 3.4195929714000654e-06, "loss": 0.4106, "step": 3073 }, { "epoch": 1.81, "grad_norm": 0.2014074107287134, "learning_rate": 3.3952631096177414e-06, "loss": 0.4039, "step": 3074 }, { "epoch": 1.82, "grad_norm": 0.18533783622734315, "learning_rate": 3.3710186138967704e-06, "loss": 0.3545, "step": 3075 }, { "epoch": 1.82, "grad_norm": 0.17866153267060225, "learning_rate": 3.3468595056611372e-06, "loss": 0.3836, "step": 3076 }, { "epoch": 1.82, "grad_norm": 0.1853015550309895, "learning_rate": 3.3227858062593765e-06, "loss": 0.4297, "step": 3077 }, { "epoch": 1.82, "grad_norm": 0.20097830071097617, "learning_rate": 3.2987975369644817e-06, "loss": 0.3992, "step": 3078 }, { "epoch": 1.82, "grad_norm": 0.189571690369022, "learning_rate": 3.274894718974031e-06, "loss": 0.3582, "step": 3079 }, { "epoch": 1.82, "grad_norm": 0.2347129525703833, "learning_rate": 3.251077373410105e-06, "loss": 0.372, "step": 3080 }, { "epoch": 1.82, "grad_norm": 0.18771010609318178, "learning_rate": 3.2273455213191585e-06, "loss": 0.3945, "step": 3081 }, { "epoch": 1.82, "grad_norm": 0.19502603116959955, "learning_rate": 3.203699183672193e-06, "loss": 0.4045, "step": 3082 }, { "epoch": 1.82, "grad_norm": 0.18252504680025133, "learning_rate": 3.180138381364606e-06, "loss": 0.3735, "step": 3083 }, { "epoch": 1.82, "grad_norm": 0.1796587965575206, "learning_rate": 3.156663135216209e-06, "loss": 0.3793, "step": 3084 }, { "epoch": 1.82, "grad_norm": 0.1980068683285886, "learning_rate": 3.133273465971209e-06, "loss": 0.4311, "step": 3085 }, { "epoch": 1.82, "grad_norm": 0.1984199641786852, "learning_rate": 3.109969394298218e-06, "loss": 0.4052, "step": 3086 }, { "epoch": 1.82, "grad_norm": 0.1969664531299117, "learning_rate": 3.0867509407901506e-06, "loss": 0.4003, "step": 3087 }, { "epoch": 1.82, "grad_norm": 0.1897604334827666, "learning_rate": 3.0636181259643514e-06, "loss": 0.3682, "step": 3088 }, { "epoch": 1.82, "grad_norm": 0.19750841186189524, "learning_rate": 3.040570970262402e-06, "loss": 0.3792, "step": 3089 }, { "epoch": 1.82, "grad_norm": 0.18310374879465044, "learning_rate": 3.0176094940502664e-06, "loss": 0.3848, "step": 3090 }, { "epoch": 1.82, "grad_norm": 0.20034511217222092, "learning_rate": 2.9947337176181144e-06, "loss": 0.4018, "step": 3091 }, { "epoch": 1.83, "grad_norm": 0.20004670284240827, "learning_rate": 2.971943661180465e-06, "loss": 0.4593, "step": 3092 }, { "epoch": 1.83, "grad_norm": 0.1814133588800584, "learning_rate": 2.9492393448760426e-06, "loss": 0.3799, "step": 3093 }, { "epoch": 1.83, "grad_norm": 0.2200272555047984, "learning_rate": 2.9266207887678088e-06, "loss": 0.4298, "step": 3094 }, { "epoch": 1.83, "grad_norm": 0.19018317992268535, "learning_rate": 2.9040880128429536e-06, "loss": 0.3931, "step": 3095 }, { "epoch": 1.83, "grad_norm": 0.20000109367751695, "learning_rate": 2.881641037012872e-06, "loss": 0.4114, "step": 3096 }, { "epoch": 1.83, "grad_norm": 0.19633599630494716, "learning_rate": 2.8592798811131416e-06, "loss": 0.4446, "step": 3097 }, { "epoch": 1.83, "grad_norm": 0.19319921014436206, "learning_rate": 2.837004564903478e-06, "loss": 0.4133, "step": 3098 }, { "epoch": 1.83, "grad_norm": 0.18140944372475734, "learning_rate": 2.8148151080677807e-06, "loss": 0.3665, "step": 3099 }, { "epoch": 1.83, "grad_norm": 0.17640229722588313, "learning_rate": 2.7927115302140317e-06, "loss": 0.3658, "step": 3100 }, { "epoch": 1.83, "grad_norm": 0.19508199702272827, "learning_rate": 2.770693850874373e-06, "loss": 0.3871, "step": 3101 }, { "epoch": 1.83, "grad_norm": 0.20731154915272468, "learning_rate": 2.748762089505019e-06, "loss": 0.4425, "step": 3102 }, { "epoch": 1.83, "grad_norm": 0.18689359827339425, "learning_rate": 2.7269162654862457e-06, "loss": 0.3844, "step": 3103 }, { "epoch": 1.83, "grad_norm": 0.20120781400415283, "learning_rate": 2.7051563981224216e-06, "loss": 0.3981, "step": 3104 }, { "epoch": 1.83, "grad_norm": 0.18363501072484445, "learning_rate": 2.683482506641932e-06, "loss": 0.3486, "step": 3105 }, { "epoch": 1.83, "grad_norm": 0.19875829323639832, "learning_rate": 2.661894610197213e-06, "loss": 0.4043, "step": 3106 }, { "epoch": 1.83, "grad_norm": 0.18863380125947674, "learning_rate": 2.6403927278646823e-06, "loss": 0.3701, "step": 3107 }, { "epoch": 1.83, "grad_norm": 0.19337787981726978, "learning_rate": 2.618976878644774e-06, "loss": 0.3881, "step": 3108 }, { "epoch": 1.84, "grad_norm": 0.1881175788170023, "learning_rate": 2.597647081461896e-06, "loss": 0.3917, "step": 3109 }, { "epoch": 1.84, "grad_norm": 0.18521342222222642, "learning_rate": 2.5764033551643917e-06, "loss": 0.364, "step": 3110 }, { "epoch": 1.84, "grad_norm": 0.1973306468994064, "learning_rate": 2.555245718524568e-06, "loss": 0.4051, "step": 3111 }, { "epoch": 1.84, "grad_norm": 0.17802966644073898, "learning_rate": 2.5341741902386583e-06, "loss": 0.3847, "step": 3112 }, { "epoch": 1.84, "grad_norm": 0.1868961775702084, "learning_rate": 2.5131887889267793e-06, "loss": 0.3714, "step": 3113 }, { "epoch": 1.84, "grad_norm": 0.19173975866165188, "learning_rate": 2.4922895331329743e-06, "loss": 0.406, "step": 3114 }, { "epoch": 1.84, "grad_norm": 0.2019551359323475, "learning_rate": 2.4714764413251602e-06, "loss": 0.378, "step": 3115 }, { "epoch": 1.84, "grad_norm": 0.1841730445013379, "learning_rate": 2.4507495318950804e-06, "loss": 0.3742, "step": 3116 }, { "epoch": 1.84, "grad_norm": 0.18756082157918272, "learning_rate": 2.4301088231583615e-06, "loss": 0.3568, "step": 3117 }, { "epoch": 1.84, "grad_norm": 0.19710909361068985, "learning_rate": 2.4095543333544244e-06, "loss": 0.4231, "step": 3118 }, { "epoch": 1.84, "grad_norm": 0.1843604977757972, "learning_rate": 2.389086080646541e-06, "loss": 0.3987, "step": 3119 }, { "epoch": 1.84, "grad_norm": 0.1917810789555228, "learning_rate": 2.368704083121731e-06, "loss": 0.3421, "step": 3120 }, { "epoch": 1.84, "grad_norm": 0.18416664870208654, "learning_rate": 2.348408358790832e-06, "loss": 0.3513, "step": 3121 }, { "epoch": 1.84, "grad_norm": 0.19071739974081112, "learning_rate": 2.328198925588432e-06, "loss": 0.3708, "step": 3122 }, { "epoch": 1.84, "grad_norm": 0.178307974776429, "learning_rate": 2.308075801372844e-06, "loss": 0.3498, "step": 3123 }, { "epoch": 1.84, "grad_norm": 0.19784957845065634, "learning_rate": 2.288039003926157e-06, "loss": 0.4153, "step": 3124 }, { "epoch": 1.85, "grad_norm": 0.18320277935679874, "learning_rate": 2.2680885509541616e-06, "loss": 0.4105, "step": 3125 }, { "epoch": 1.85, "grad_norm": 0.1936425061613634, "learning_rate": 2.2482244600863124e-06, "loss": 0.346, "step": 3126 }, { "epoch": 1.85, "grad_norm": 0.18912902067810736, "learning_rate": 2.22844674887579e-06, "loss": 0.3941, "step": 3127 }, { "epoch": 1.85, "grad_norm": 0.18476315688660908, "learning_rate": 2.2087554347994477e-06, "loss": 0.3699, "step": 3128 }, { "epoch": 1.85, "grad_norm": 0.19961368109564082, "learning_rate": 2.189150535257756e-06, "loss": 0.3892, "step": 3129 }, { "epoch": 1.85, "grad_norm": 0.17429966573414088, "learning_rate": 2.1696320675748447e-06, "loss": 0.3555, "step": 3130 }, { "epoch": 1.85, "grad_norm": 0.17919440916619483, "learning_rate": 2.1502000489984853e-06, "loss": 0.3526, "step": 3131 }, { "epoch": 1.85, "grad_norm": 0.2078267776964174, "learning_rate": 2.13085449670003e-06, "loss": 0.4291, "step": 3132 }, { "epoch": 1.85, "grad_norm": 0.19063798081819075, "learning_rate": 2.1115954277744264e-06, "loss": 0.4074, "step": 3133 }, { "epoch": 1.85, "grad_norm": 0.19081641574100974, "learning_rate": 2.0924228592402174e-06, "loss": 0.3689, "step": 3134 }, { "epoch": 1.85, "grad_norm": 0.1881455252475656, "learning_rate": 2.0733368080395054e-06, "loss": 0.3477, "step": 3135 }, { "epoch": 1.85, "grad_norm": 0.194375006291378, "learning_rate": 2.054337291037911e-06, "loss": 0.4095, "step": 3136 }, { "epoch": 1.85, "grad_norm": 0.19705529909778877, "learning_rate": 2.0354243250246263e-06, "loss": 0.3673, "step": 3137 }, { "epoch": 1.85, "grad_norm": 0.1845700451116757, "learning_rate": 2.016597926712349e-06, "loss": 0.4007, "step": 3138 }, { "epoch": 1.85, "grad_norm": 0.19741016984428209, "learning_rate": 1.997858112737261e-06, "loss": 0.3952, "step": 3139 }, { "epoch": 1.85, "grad_norm": 0.17693779240080532, "learning_rate": 1.9792048996590595e-06, "loss": 0.3786, "step": 3140 }, { "epoch": 1.85, "grad_norm": 0.19772498598639746, "learning_rate": 1.9606383039609043e-06, "loss": 0.3906, "step": 3141 }, { "epoch": 1.86, "grad_norm": 0.1891658783526746, "learning_rate": 1.942158342049405e-06, "loss": 0.4, "step": 3142 }, { "epoch": 1.86, "grad_norm": 0.2047918931876484, "learning_rate": 1.9237650302546318e-06, "loss": 0.4152, "step": 3143 }, { "epoch": 1.86, "grad_norm": 0.19761414033249414, "learning_rate": 1.9054583848300944e-06, "loss": 0.3712, "step": 3144 }, { "epoch": 1.86, "grad_norm": 0.19782434959629427, "learning_rate": 1.8872384219526752e-06, "loss": 0.3769, "step": 3145 }, { "epoch": 1.86, "grad_norm": 0.18935966535034518, "learning_rate": 1.8691051577227059e-06, "loss": 0.4046, "step": 3146 }, { "epoch": 1.86, "grad_norm": 0.2152916894984238, "learning_rate": 1.8510586081638803e-06, "loss": 0.4048, "step": 3147 }, { "epoch": 1.86, "grad_norm": 0.1932939479942431, "learning_rate": 1.8330987892232864e-06, "loss": 0.407, "step": 3148 }, { "epoch": 1.86, "grad_norm": 0.18254601435770682, "learning_rate": 1.8152257167713293e-06, "loss": 0.3636, "step": 3149 }, { "epoch": 1.86, "grad_norm": 0.202958366945704, "learning_rate": 1.7974394066018086e-06, "loss": 0.4151, "step": 3150 }, { "epoch": 1.86, "grad_norm": 0.19035842521279056, "learning_rate": 1.7797398744318294e-06, "loss": 0.424, "step": 3151 }, { "epoch": 1.86, "grad_norm": 0.19953066152325982, "learning_rate": 1.7621271359018143e-06, "loss": 0.4353, "step": 3152 }, { "epoch": 1.86, "grad_norm": 0.21452092387256122, "learning_rate": 1.7446012065755025e-06, "loss": 0.4264, "step": 3153 }, { "epoch": 1.86, "grad_norm": 0.18855721006001971, "learning_rate": 1.7271621019399165e-06, "loss": 0.3607, "step": 3154 }, { "epoch": 1.86, "grad_norm": 0.17457659478034973, "learning_rate": 1.7098098374053295e-06, "loss": 0.3699, "step": 3155 }, { "epoch": 1.86, "grad_norm": 0.19528073478928745, "learning_rate": 1.6925444283053316e-06, "loss": 0.3971, "step": 3156 }, { "epoch": 1.86, "grad_norm": 0.19440070159663722, "learning_rate": 1.6753658898967073e-06, "loss": 0.3982, "step": 3157 }, { "epoch": 1.86, "grad_norm": 0.20508767460402985, "learning_rate": 1.658274237359514e-06, "loss": 0.3942, "step": 3158 }, { "epoch": 1.87, "grad_norm": 0.1979654844075504, "learning_rate": 1.6412694857970256e-06, "loss": 0.3798, "step": 3159 }, { "epoch": 1.87, "grad_norm": 0.18833758719322716, "learning_rate": 1.6243516502356782e-06, "loss": 0.4258, "step": 3160 }, { "epoch": 1.87, "grad_norm": 0.19086125074224217, "learning_rate": 1.6075207456252016e-06, "loss": 0.3993, "step": 3161 }, { "epoch": 1.87, "grad_norm": 0.2028805157065144, "learning_rate": 1.5907767868383993e-06, "loss": 0.411, "step": 3162 }, { "epoch": 1.87, "grad_norm": 0.18946762384982935, "learning_rate": 1.5741197886713243e-06, "loss": 0.3613, "step": 3163 }, { "epoch": 1.87, "grad_norm": 0.1956602195587916, "learning_rate": 1.5575497658431359e-06, "loss": 0.4189, "step": 3164 }, { "epoch": 1.87, "grad_norm": 0.17588601530400924, "learning_rate": 1.541066732996166e-06, "loss": 0.3462, "step": 3165 }, { "epoch": 1.87, "grad_norm": 0.20371484183429103, "learning_rate": 1.524670704695852e-06, "loss": 0.3799, "step": 3166 }, { "epoch": 1.87, "grad_norm": 0.19179485563072282, "learning_rate": 1.5083616954307822e-06, "loss": 0.3779, "step": 3167 }, { "epoch": 1.87, "grad_norm": 0.1863133844938148, "learning_rate": 1.4921397196126063e-06, "loss": 0.3821, "step": 3168 }, { "epoch": 1.87, "grad_norm": 0.184740343430799, "learning_rate": 1.476004791576102e-06, "loss": 0.4011, "step": 3169 }, { "epoch": 1.87, "grad_norm": 0.1773568646290839, "learning_rate": 1.4599569255791312e-06, "loss": 0.3644, "step": 3170 }, { "epoch": 1.87, "grad_norm": 0.18876985980744976, "learning_rate": 1.443996135802572e-06, "loss": 0.3784, "step": 3171 }, { "epoch": 1.87, "grad_norm": 0.17067634819510838, "learning_rate": 1.4281224363504097e-06, "loss": 0.3303, "step": 3172 }, { "epoch": 1.87, "grad_norm": 0.19437246035111175, "learning_rate": 1.4123358412496457e-06, "loss": 0.3731, "step": 3173 }, { "epoch": 1.87, "grad_norm": 0.17734466145601968, "learning_rate": 1.396636364450299e-06, "loss": 0.3728, "step": 3174 }, { "epoch": 1.87, "grad_norm": 0.19118429102334034, "learning_rate": 1.38102401982545e-06, "loss": 0.3816, "step": 3175 }, { "epoch": 1.88, "grad_norm": 0.20028692342632082, "learning_rate": 1.3654988211711294e-06, "loss": 0.4221, "step": 3176 }, { "epoch": 1.88, "grad_norm": 0.19781252858163023, "learning_rate": 1.3500607822063972e-06, "loss": 0.3898, "step": 3177 }, { "epoch": 1.88, "grad_norm": 0.18896273683703604, "learning_rate": 1.3347099165732735e-06, "loss": 0.3712, "step": 3178 }, { "epoch": 1.88, "grad_norm": 0.20055040703887, "learning_rate": 1.3194462378367745e-06, "loss": 0.4106, "step": 3179 }, { "epoch": 1.88, "grad_norm": 0.20074952718463962, "learning_rate": 1.304269759484844e-06, "loss": 0.4222, "step": 3180 }, { "epoch": 1.88, "grad_norm": 0.19612121742334224, "learning_rate": 1.289180494928366e-06, "loss": 0.4322, "step": 3181 }, { "epoch": 1.88, "grad_norm": 0.2123229180067534, "learning_rate": 1.2741784575011738e-06, "loss": 0.3584, "step": 3182 }, { "epoch": 1.88, "grad_norm": 0.1893099685468654, "learning_rate": 1.2592636604600416e-06, "loss": 0.3594, "step": 3183 }, { "epoch": 1.88, "grad_norm": 0.18648008252336526, "learning_rate": 1.2444361169846043e-06, "loss": 0.398, "step": 3184 }, { "epoch": 1.88, "grad_norm": 0.18085502760305805, "learning_rate": 1.2296958401774138e-06, "loss": 0.3983, "step": 3185 }, { "epoch": 1.88, "grad_norm": 0.19751076143375046, "learning_rate": 1.2150428430639293e-06, "loss": 0.408, "step": 3186 }, { "epoch": 1.88, "grad_norm": 0.19590341226599803, "learning_rate": 1.2004771385924486e-06, "loss": 0.3865, "step": 3187 }, { "epoch": 1.88, "grad_norm": 0.20073541743544435, "learning_rate": 1.1859987396341754e-06, "loss": 0.3919, "step": 3188 }, { "epoch": 1.88, "grad_norm": 0.1794635256970725, "learning_rate": 1.1716076589831094e-06, "loss": 0.3932, "step": 3189 }, { "epoch": 1.88, "grad_norm": 0.18117681473041028, "learning_rate": 1.1573039093561556e-06, "loss": 0.3766, "step": 3190 }, { "epoch": 1.88, "grad_norm": 0.1974848791585883, "learning_rate": 1.1430875033929811e-06, "loss": 0.3996, "step": 3191 }, { "epoch": 1.88, "grad_norm": 0.19703745616617743, "learning_rate": 1.1289584536561148e-06, "loss": 0.326, "step": 3192 }, { "epoch": 1.89, "grad_norm": 0.2036115421172062, "learning_rate": 1.1149167726308807e-06, "loss": 0.3973, "step": 3193 }, { "epoch": 1.89, "grad_norm": 0.20659352979896078, "learning_rate": 1.1009624727253975e-06, "loss": 0.4069, "step": 3194 }, { "epoch": 1.89, "grad_norm": 0.18197354367933516, "learning_rate": 1.0870955662705573e-06, "loss": 0.3551, "step": 3195 }, { "epoch": 1.89, "grad_norm": 0.18674423376873953, "learning_rate": 1.073316065520058e-06, "loss": 0.357, "step": 3196 }, { "epoch": 1.89, "grad_norm": 0.19410304667862208, "learning_rate": 1.0596239826503152e-06, "loss": 0.3949, "step": 3197 }, { "epoch": 1.89, "grad_norm": 0.18686461985035407, "learning_rate": 1.0460193297605282e-06, "loss": 0.3947, "step": 3198 }, { "epoch": 1.89, "grad_norm": 0.19537836898327499, "learning_rate": 1.032502118872647e-06, "loss": 0.4062, "step": 3199 }, { "epoch": 1.89, "grad_norm": 0.18594513708780075, "learning_rate": 1.0190723619313169e-06, "loss": 0.3897, "step": 3200 }, { "epoch": 1.89, "grad_norm": 0.1936564842779298, "learning_rate": 1.005730070803912e-06, "loss": 0.388, "step": 3201 }, { "epoch": 1.89, "grad_norm": 0.20740314634305435, "learning_rate": 9.924752572805563e-07, "loss": 0.3689, "step": 3202 }, { "epoch": 1.89, "grad_norm": 0.1960230455699611, "learning_rate": 9.793079330740141e-07, "loss": 0.4205, "step": 3203 }, { "epoch": 1.89, "grad_norm": 0.20388983642637493, "learning_rate": 9.662281098197667e-07, "loss": 0.3772, "step": 3204 }, { "epoch": 1.89, "grad_norm": 0.1944219717447334, "learning_rate": 9.53235799075991e-07, "loss": 0.3784, "step": 3205 }, { "epoch": 1.89, "grad_norm": 0.1847623074417675, "learning_rate": 9.403310123235143e-07, "loss": 0.3827, "step": 3206 }, { "epoch": 1.89, "grad_norm": 0.1873068719937528, "learning_rate": 9.275137609658147e-07, "loss": 0.3805, "step": 3207 }, { "epoch": 1.89, "grad_norm": 0.19110057619780071, "learning_rate": 9.147840563290322e-07, "loss": 0.41, "step": 3208 }, { "epoch": 1.9, "grad_norm": 0.18875847827987116, "learning_rate": 9.021419096619355e-07, "loss": 0.3974, "step": 3209 }, { "epoch": 1.9, "grad_norm": 0.19056356308135405, "learning_rate": 8.895873321359216e-07, "loss": 0.3926, "step": 3210 }, { "epoch": 1.9, "grad_norm": 0.19827362278371313, "learning_rate": 8.771203348450163e-07, "loss": 0.4255, "step": 3211 }, { "epoch": 1.9, "grad_norm": 0.18716344753936573, "learning_rate": 8.647409288058405e-07, "loss": 0.3791, "step": 3212 }, { "epoch": 1.9, "grad_norm": 0.18713603666692794, "learning_rate": 8.524491249576328e-07, "loss": 0.3636, "step": 3213 }, { "epoch": 1.9, "grad_norm": 0.18462853991958883, "learning_rate": 8.402449341622153e-07, "loss": 0.4069, "step": 3214 }, { "epoch": 1.9, "grad_norm": 0.18918002939895678, "learning_rate": 8.281283672039619e-07, "loss": 0.3901, "step": 3215 }, { "epoch": 1.9, "grad_norm": 0.20052989184313472, "learning_rate": 8.16099434789852e-07, "loss": 0.4137, "step": 3216 }, { "epoch": 1.9, "grad_norm": 0.20948288799457052, "learning_rate": 8.041581475493942e-07, "loss": 0.3647, "step": 3217 }, { "epoch": 1.9, "grad_norm": 0.1935257493379553, "learning_rate": 7.92304516034692e-07, "loss": 0.3746, "step": 3218 }, { "epoch": 1.9, "grad_norm": 0.185075419298866, "learning_rate": 7.805385507203555e-07, "loss": 0.3931, "step": 3219 }, { "epoch": 1.9, "grad_norm": 0.1771901008403066, "learning_rate": 7.688602620035346e-07, "loss": 0.3668, "step": 3220 }, { "epoch": 1.9, "grad_norm": 0.20248014707155454, "learning_rate": 7.572696602038965e-07, "loss": 0.4483, "step": 3221 }, { "epoch": 1.9, "grad_norm": 0.20360597931556265, "learning_rate": 7.457667555636371e-07, "loss": 0.3947, "step": 3222 }, { "epoch": 1.9, "grad_norm": 0.2045453901260168, "learning_rate": 7.34351558247448e-07, "loss": 0.4156, "step": 3223 }, { "epoch": 1.9, "grad_norm": 0.21374774038862304, "learning_rate": 7.230240783425379e-07, "loss": 0.4915, "step": 3224 }, { "epoch": 1.9, "grad_norm": 0.1922782340904662, "learning_rate": 7.117843258585666e-07, "loss": 0.378, "step": 3225 }, { "epoch": 1.91, "grad_norm": 0.18629738802882323, "learning_rate": 7.006323107276891e-07, "loss": 0.3636, "step": 3226 }, { "epoch": 1.91, "grad_norm": 0.2028644937530157, "learning_rate": 6.895680428045336e-07, "loss": 0.4367, "step": 3227 }, { "epoch": 1.91, "grad_norm": 0.17539086550958038, "learning_rate": 6.785915318662128e-07, "loss": 0.352, "step": 3228 }, { "epoch": 1.91, "grad_norm": 0.18385558521543932, "learning_rate": 6.677027876122344e-07, "loss": 0.3749, "step": 3229 }, { "epoch": 1.91, "grad_norm": 0.1920429920106685, "learning_rate": 6.569018196645905e-07, "loss": 0.3889, "step": 3230 }, { "epoch": 1.91, "grad_norm": 0.1862841603189568, "learning_rate": 6.461886375677017e-07, "loss": 0.3825, "step": 3231 }, { "epoch": 1.91, "grad_norm": 0.2058739986931926, "learning_rate": 6.355632507884291e-07, "loss": 0.4188, "step": 3232 }, { "epoch": 1.91, "grad_norm": 0.18709350187825954, "learning_rate": 6.250256687160172e-07, "loss": 0.4149, "step": 3233 }, { "epoch": 1.91, "grad_norm": 0.17757746614569134, "learning_rate": 6.145759006621399e-07, "loss": 0.3358, "step": 3234 }, { "epoch": 1.91, "grad_norm": 0.20412227510225445, "learning_rate": 6.042139558608995e-07, "loss": 0.3813, "step": 3235 }, { "epoch": 1.91, "grad_norm": 0.18178197663692106, "learning_rate": 5.939398434687382e-07, "loss": 0.367, "step": 3236 }, { "epoch": 1.91, "grad_norm": 0.18738341111633122, "learning_rate": 5.83753572564516e-07, "loss": 0.3846, "step": 3237 }, { "epoch": 1.91, "grad_norm": 0.2000427655218693, "learning_rate": 5.736551521494881e-07, "loss": 0.4284, "step": 3238 }, { "epoch": 1.91, "grad_norm": 0.19278619160187718, "learning_rate": 5.636445911472276e-07, "loss": 0.3957, "step": 3239 }, { "epoch": 1.91, "grad_norm": 0.20909231431825567, "learning_rate": 5.537218984037251e-07, "loss": 0.4799, "step": 3240 }, { "epoch": 1.91, "grad_norm": 0.19110883915080917, "learning_rate": 5.438870826872777e-07, "loss": 0.4012, "step": 3241 }, { "epoch": 1.91, "grad_norm": 0.19636539580797252, "learning_rate": 5.341401526885781e-07, "loss": 0.3895, "step": 3242 }, { "epoch": 1.92, "grad_norm": 0.19441381825230175, "learning_rate": 5.244811170206143e-07, "loss": 0.3933, "step": 3243 }, { "epoch": 1.92, "grad_norm": 0.2175801342691988, "learning_rate": 5.149099842187254e-07, "loss": 0.3942, "step": 3244 }, { "epoch": 1.92, "grad_norm": 0.18605708598359258, "learning_rate": 5.054267627405574e-07, "loss": 0.3787, "step": 3245 }, { "epoch": 1.92, "grad_norm": 0.21149087709158926, "learning_rate": 4.960314609661065e-07, "loss": 0.4078, "step": 3246 }, { "epoch": 1.92, "grad_norm": 0.19567321474953228, "learning_rate": 4.86724087197643e-07, "loss": 0.4215, "step": 3247 }, { "epoch": 1.92, "grad_norm": 0.18749423037376656, "learning_rate": 4.775046496597546e-07, "loss": 0.398, "step": 3248 }, { "epoch": 1.92, "grad_norm": 0.17597971825430656, "learning_rate": 4.6837315649932435e-07, "loss": 0.3231, "step": 3249 }, { "epoch": 1.92, "grad_norm": 0.1754490734554022, "learning_rate": 4.593296157855087e-07, "loss": 0.3417, "step": 3250 }, { "epoch": 1.92, "grad_norm": 0.17688928271915183, "learning_rate": 4.503740355097597e-07, "loss": 0.3527, "step": 3251 }, { "epoch": 1.92, "grad_norm": 0.17858234662256542, "learning_rate": 4.415064235857913e-07, "loss": 0.3453, "step": 3252 }, { "epoch": 1.92, "grad_norm": 0.1991198742144274, "learning_rate": 4.3272678784959107e-07, "loss": 0.4018, "step": 3253 }, { "epoch": 1.92, "grad_norm": 0.1819706791664033, "learning_rate": 4.240351360593975e-07, "loss": 0.3614, "step": 3254 }, { "epoch": 1.92, "grad_norm": 0.20732741479735994, "learning_rate": 4.154314758957001e-07, "loss": 0.4432, "step": 3255 }, { "epoch": 1.92, "grad_norm": 0.19273612751874944, "learning_rate": 4.0691581496125063e-07, "loss": 0.3775, "step": 3256 }, { "epoch": 1.92, "grad_norm": 0.18697618780067982, "learning_rate": 3.984881607810187e-07, "loss": 0.378, "step": 3257 }, { "epoch": 1.92, "grad_norm": 0.1999416093998594, "learning_rate": 3.9014852080220263e-07, "loss": 0.395, "step": 3258 }, { "epoch": 1.92, "grad_norm": 0.19999565370341477, "learning_rate": 3.81896902394252e-07, "loss": 0.4032, "step": 3259 }, { "epoch": 1.93, "grad_norm": 0.17929856887972828, "learning_rate": 3.7373331284881187e-07, "loss": 0.3574, "step": 3260 }, { "epoch": 1.93, "grad_norm": 0.17548502665274135, "learning_rate": 3.656577593797561e-07, "loss": 0.3425, "step": 3261 }, { "epoch": 1.93, "grad_norm": 0.19698553973937918, "learning_rate": 3.576702491231432e-07, "loss": 0.386, "step": 3262 }, { "epoch": 1.93, "grad_norm": 0.2033553308059875, "learning_rate": 3.497707891372382e-07, "loss": 0.4253, "step": 3263 }, { "epoch": 1.93, "grad_norm": 0.20003502614490035, "learning_rate": 3.419593864025239e-07, "loss": 0.3575, "step": 3264 }, { "epoch": 1.93, "grad_norm": 0.19362347610384475, "learning_rate": 3.342360478216344e-07, "loss": 0.336, "step": 3265 }, { "epoch": 1.93, "grad_norm": 0.17962439452049042, "learning_rate": 3.2660078021941044e-07, "loss": 0.3472, "step": 3266 }, { "epoch": 1.93, "grad_norm": 0.19842186984852644, "learning_rate": 3.1905359034284377e-07, "loss": 0.4032, "step": 3267 }, { "epoch": 1.93, "grad_norm": 0.18901879026511056, "learning_rate": 3.11594484861133e-07, "loss": 0.3993, "step": 3268 }, { "epoch": 1.93, "grad_norm": 0.1908851117660752, "learning_rate": 3.042234703655833e-07, "loss": 0.4165, "step": 3269 }, { "epoch": 1.93, "grad_norm": 0.19852331588996355, "learning_rate": 2.969405533697178e-07, "loss": 0.4133, "step": 3270 }, { "epoch": 1.93, "grad_norm": 0.19497530192734594, "learning_rate": 2.8974574030917747e-07, "loss": 0.4006, "step": 3271 }, { "epoch": 1.93, "grad_norm": 0.1973867553405961, "learning_rate": 2.8263903754174316e-07, "loss": 0.4122, "step": 3272 }, { "epoch": 1.93, "grad_norm": 0.21082580950152388, "learning_rate": 2.756204513473581e-07, "loss": 0.4206, "step": 3273 }, { "epoch": 1.93, "grad_norm": 0.189060516990307, "learning_rate": 2.6868998792808353e-07, "loss": 0.4018, "step": 3274 }, { "epoch": 1.93, "grad_norm": 0.19300160359755386, "learning_rate": 2.6184765340810936e-07, "loss": 0.4001, "step": 3275 }, { "epoch": 1.94, "grad_norm": 0.17830842913917283, "learning_rate": 2.550934538337768e-07, "loss": 0.3481, "step": 3276 }, { "epoch": 1.94, "grad_norm": 0.18772421717848126, "learning_rate": 2.4842739517350055e-07, "loss": 0.3792, "step": 3277 }, { "epoch": 1.94, "grad_norm": 0.18819704260435957, "learning_rate": 2.418494833178464e-07, "loss": 0.3672, "step": 3278 }, { "epoch": 1.94, "grad_norm": 0.20097846173843922, "learning_rate": 2.3535972407947582e-07, "loss": 0.3814, "step": 3279 }, { "epoch": 1.94, "grad_norm": 0.18030992920333294, "learning_rate": 2.2895812319313482e-07, "loss": 0.3622, "step": 3280 }, { "epoch": 1.94, "grad_norm": 0.1789641489632021, "learning_rate": 2.2264468631569836e-07, "loss": 0.369, "step": 3281 }, { "epoch": 1.94, "grad_norm": 0.19429333891323297, "learning_rate": 2.1641941902611483e-07, "loss": 0.3832, "step": 3282 }, { "epoch": 1.94, "grad_norm": 0.2094149050617977, "learning_rate": 2.1028232682542836e-07, "loss": 0.4049, "step": 3283 }, { "epoch": 1.94, "grad_norm": 0.1981759250210741, "learning_rate": 2.0423341513675643e-07, "loss": 0.4095, "step": 3284 }, { "epoch": 1.94, "grad_norm": 0.19249954341521933, "learning_rate": 1.9827268930530106e-07, "loss": 0.3773, "step": 3285 }, { "epoch": 1.94, "grad_norm": 0.2164196991626135, "learning_rate": 1.9240015459832671e-07, "loss": 0.4315, "step": 3286 }, { "epoch": 1.94, "grad_norm": 0.1941807089426153, "learning_rate": 1.8661581620519342e-07, "loss": 0.3948, "step": 3287 }, { "epoch": 1.94, "grad_norm": 0.19712036540070402, "learning_rate": 1.8091967923731246e-07, "loss": 0.3887, "step": 3288 }, { "epoch": 1.94, "grad_norm": 0.19252783894527362, "learning_rate": 1.7531174872813526e-07, "loss": 0.3564, "step": 3289 }, { "epoch": 1.94, "grad_norm": 0.19529647080333504, "learning_rate": 1.6979202963318674e-07, "loss": 0.3806, "step": 3290 }, { "epoch": 1.94, "grad_norm": 0.18334403219059206, "learning_rate": 1.64360526830043e-07, "loss": 0.3665, "step": 3291 }, { "epoch": 1.94, "grad_norm": 0.18263929788467487, "learning_rate": 1.590172451183314e-07, "loss": 0.3843, "step": 3292 }, { "epoch": 1.95, "grad_norm": 0.17630257086085244, "learning_rate": 1.537621892197083e-07, "loss": 0.3321, "step": 3293 }, { "epoch": 1.95, "grad_norm": 0.2045431522577309, "learning_rate": 1.4859536377785922e-07, "loss": 0.4665, "step": 3294 }, { "epoch": 1.95, "grad_norm": 0.20490350736046714, "learning_rate": 1.4351677335854296e-07, "loss": 0.4077, "step": 3295 }, { "epoch": 1.95, "grad_norm": 0.18550515027639283, "learning_rate": 1.385264224495142e-07, "loss": 0.3445, "step": 3296 }, { "epoch": 1.95, "grad_norm": 0.19548530453578192, "learning_rate": 1.3362431546055655e-07, "loss": 0.3709, "step": 3297 }, { "epoch": 1.95, "grad_norm": 0.19785000290190885, "learning_rate": 1.288104567234827e-07, "loss": 0.3882, "step": 3298 }, { "epoch": 1.95, "grad_norm": 0.1998704411519638, "learning_rate": 1.2408485049213438e-07, "loss": 0.4262, "step": 3299 }, { "epoch": 1.95, "grad_norm": 0.18703231547572702, "learning_rate": 1.1944750094234902e-07, "loss": 0.376, "step": 3300 }, { "epoch": 1.95, "grad_norm": 0.1891653601869006, "learning_rate": 1.1489841217197094e-07, "loss": 0.3859, "step": 3301 }, { "epoch": 1.95, "grad_norm": 0.19006481114825025, "learning_rate": 1.1043758820088457e-07, "loss": 0.3897, "step": 3302 }, { "epoch": 1.95, "grad_norm": 0.20352257802392035, "learning_rate": 1.0606503297094783e-07, "loss": 0.4363, "step": 3303 }, { "epoch": 1.95, "grad_norm": 0.1961056459668248, "learning_rate": 1.0178075034601442e-07, "loss": 0.3704, "step": 3304 }, { "epoch": 1.95, "grad_norm": 0.17970682190325277, "learning_rate": 9.758474411196706e-08, "loss": 0.3542, "step": 3305 }, { "epoch": 1.95, "grad_norm": 0.19172082350713635, "learning_rate": 9.347701797665087e-08, "loss": 0.3978, "step": 3306 }, { "epoch": 1.95, "grad_norm": 0.18947042563958155, "learning_rate": 8.945757556991785e-08, "loss": 0.3871, "step": 3307 }, { "epoch": 1.95, "grad_norm": 0.18833139446786673, "learning_rate": 8.55264204436046e-08, "loss": 0.3766, "step": 3308 }, { "epoch": 1.95, "grad_norm": 0.1946344551939453, "learning_rate": 8.168355607152122e-08, "loss": 0.4039, "step": 3309 }, { "epoch": 1.96, "grad_norm": 0.19530497465373736, "learning_rate": 7.792898584946251e-08, "loss": 0.4009, "step": 3310 }, { "epoch": 1.96, "grad_norm": 0.19579497930103548, "learning_rate": 7.426271309520783e-08, "loss": 0.4132, "step": 3311 }, { "epoch": 1.96, "grad_norm": 0.18688070611383054, "learning_rate": 7.068474104852118e-08, "loss": 0.3821, "step": 3312 }, { "epoch": 1.96, "grad_norm": 0.19621019324623737, "learning_rate": 6.719507287110682e-08, "loss": 0.4158, "step": 3313 }, { "epoch": 1.96, "grad_norm": 0.18754582228914488, "learning_rate": 6.37937116466758e-08, "loss": 0.3592, "step": 3314 }, { "epoch": 1.96, "grad_norm": 0.200554814568333, "learning_rate": 6.048066038086831e-08, "loss": 0.4315, "step": 3315 }, { "epoch": 1.96, "grad_norm": 0.18446685190761059, "learning_rate": 5.725592200130914e-08, "loss": 0.3737, "step": 3316 }, { "epoch": 1.96, "grad_norm": 0.20120474113690331, "learning_rate": 5.4119499357585533e-08, "loss": 0.3744, "step": 3317 }, { "epoch": 1.96, "grad_norm": 0.20872833620428924, "learning_rate": 5.107139522123605e-08, "loss": 0.4176, "step": 3318 }, { "epoch": 1.96, "grad_norm": 0.18038584535807978, "learning_rate": 4.811161228576166e-08, "loss": 0.383, "step": 3319 }, { "epoch": 1.96, "grad_norm": 0.194073111066338, "learning_rate": 4.524015316662577e-08, "loss": 0.3758, "step": 3320 }, { "epoch": 1.96, "grad_norm": 0.20421077426568254, "learning_rate": 4.24570204011987e-08, "loss": 0.4027, "step": 3321 }, { "epoch": 1.96, "grad_norm": 0.18616814783372337, "learning_rate": 3.9762216448868684e-08, "loss": 0.3928, "step": 3322 }, { "epoch": 1.96, "grad_norm": 0.18864241678818086, "learning_rate": 3.715574369090869e-08, "loss": 0.3676, "step": 3323 }, { "epoch": 1.96, "grad_norm": 0.21193430089157575, "learning_rate": 3.463760443057629e-08, "loss": 0.4087, "step": 3324 }, { "epoch": 1.96, "grad_norm": 0.18544078878476022, "learning_rate": 3.2207800893069296e-08, "loss": 0.3837, "step": 3325 }, { "epoch": 1.96, "grad_norm": 0.18432018287612154, "learning_rate": 2.9866335225481324e-08, "loss": 0.3882, "step": 3326 }, { "epoch": 1.97, "grad_norm": 0.20191828889615152, "learning_rate": 2.7613209496912816e-08, "loss": 0.4403, "step": 3327 }, { "epoch": 1.97, "grad_norm": 0.18871253627610182, "learning_rate": 2.5448425698360034e-08, "loss": 0.4026, "step": 3328 }, { "epoch": 1.97, "grad_norm": 0.19537379284916057, "learning_rate": 2.337198574274835e-08, "loss": 0.419, "step": 3329 }, { "epoch": 1.97, "grad_norm": 0.18290952042136385, "learning_rate": 2.1383891464965554e-08, "loss": 0.4221, "step": 3330 }, { "epoch": 1.97, "grad_norm": 0.19543074561388615, "learning_rate": 1.948414462181747e-08, "loss": 0.3414, "step": 3331 }, { "epoch": 1.97, "grad_norm": 0.18511344425764348, "learning_rate": 1.7672746892039014e-08, "loss": 0.3627, "step": 3332 }, { "epoch": 1.97, "grad_norm": 0.19153383455638, "learning_rate": 1.5949699876294244e-08, "loss": 0.3863, "step": 3333 }, { "epoch": 1.97, "grad_norm": 0.1892980675269156, "learning_rate": 1.4315005097176314e-08, "loss": 0.409, "step": 3334 }, { "epoch": 1.97, "grad_norm": 0.18954529984839885, "learning_rate": 1.2768663999207509e-08, "loss": 0.3934, "step": 3335 }, { "epoch": 1.97, "grad_norm": 0.19572839344458184, "learning_rate": 1.1310677948839221e-08, "loss": 0.4073, "step": 3336 }, { "epoch": 1.97, "grad_norm": 0.19384437473620314, "learning_rate": 9.94104823441866e-09, "loss": 0.3871, "step": 3337 }, { "epoch": 1.97, "grad_norm": 0.21376883962625398, "learning_rate": 8.659776066255454e-09, "loss": 0.3812, "step": 3338 }, { "epoch": 1.97, "grad_norm": 0.18417626940971296, "learning_rate": 7.466862576555045e-09, "loss": 0.3715, "step": 3339 }, { "epoch": 1.97, "grad_norm": 0.20914923552989872, "learning_rate": 6.362308819451989e-09, "loss": 0.416, "step": 3340 }, { "epoch": 1.97, "grad_norm": 0.19634958034455302, "learning_rate": 5.346115771009963e-09, "loss": 0.3908, "step": 3341 }, { "epoch": 1.97, "grad_norm": 0.19227965875344546, "learning_rate": 4.418284329188449e-09, "loss": 0.361, "step": 3342 }, { "epoch": 1.98, "grad_norm": 0.19660676906902444, "learning_rate": 3.5788153138871515e-09, "loss": 0.3805, "step": 3343 }, { "epoch": 1.98, "grad_norm": 0.20634066208383187, "learning_rate": 2.8277094669126868e-09, "loss": 0.4423, "step": 3344 }, { "epoch": 1.98, "grad_norm": 0.18674281963839826, "learning_rate": 2.1649674519785836e-09, "loss": 0.3768, "step": 3345 }, { "epoch": 1.98, "grad_norm": 0.19693858756590374, "learning_rate": 1.590589854749691e-09, "loss": 0.404, "step": 3346 }, { "epoch": 1.98, "grad_norm": 0.17745029377636273, "learning_rate": 1.1045771827533635e-09, "loss": 0.391, "step": 3347 }, { "epoch": 1.98, "grad_norm": 0.21627082820922133, "learning_rate": 7.069298654793777e-10, "loss": 0.4235, "step": 3348 }, { "epoch": 1.98, "grad_norm": 0.19713330297784887, "learning_rate": 3.9764825430221866e-10, "loss": 0.4102, "step": 3349 }, { "epoch": 1.98, "grad_norm": 0.1868699746011865, "learning_rate": 1.767326225365906e-10, "loss": 0.4033, "step": 3350 }, { "epoch": 1.98, "grad_norm": 0.20307699817299302, "learning_rate": 4.4183165393008044e-11, "loss": 0.3578, "step": 3351 }, { "epoch": 1.98, "grad_norm": 0.19399685831381294, "learning_rate": 0.0, "loss": 0.3825, "step": 3352 } ], "logging_steps": 1, "max_steps": 3352, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 838, "total_flos": 3318986116694016.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }