diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4325610316580605, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004325610316580605, + "grad_norm": 10.074225425720215, + "learning_rate": 8.000000000000001e-07, + "loss": 3.4502, + "step": 1 + }, + { + "epoch": 0.000865122063316121, + "grad_norm": 10.193795204162598, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.4309, + "step": 2 + }, + { + "epoch": 0.0012976830949741815, + "grad_norm": 9.579278945922852, + "learning_rate": 2.4000000000000003e-06, + "loss": 3.4075, + "step": 3 + }, + { + "epoch": 0.001730244126632242, + "grad_norm": 9.713829040527344, + "learning_rate": 3.2000000000000003e-06, + "loss": 3.4737, + "step": 4 + }, + { + "epoch": 0.0021628051582903026, + "grad_norm": 9.49286937713623, + "learning_rate": 4.000000000000001e-06, + "loss": 3.4149, + "step": 5 + }, + { + "epoch": 0.002595366189948363, + "grad_norm": 10.680686950683594, + "learning_rate": 4.800000000000001e-06, + "loss": 3.4793, + "step": 6 + }, + { + "epoch": 0.0030279272216064235, + "grad_norm": 10.119192123413086, + "learning_rate": 5.600000000000001e-06, + "loss": 3.3707, + "step": 7 + }, + { + "epoch": 0.003460488253264484, + "grad_norm": 10.217940330505371, + "learning_rate": 6.4000000000000006e-06, + "loss": 3.4339, + "step": 8 + }, + { + "epoch": 0.0038930492849225447, + "grad_norm": 10.2111234664917, + "learning_rate": 7.2000000000000005e-06, + "loss": 3.3944, + "step": 9 + }, + { + "epoch": 0.004325610316580605, + "grad_norm": 9.485647201538086, + "learning_rate": 8.000000000000001e-06, + "loss": 3.3145, + "step": 10 + }, + { + "epoch": 0.004758171348238666, + "grad_norm": 10.536123275756836, + "learning_rate": 8.8e-06, + "loss": 3.37, + "step": 11 + }, + { + "epoch": 0.005190732379896726, + "grad_norm": 10.602726936340332, + "learning_rate": 9.600000000000001e-06, + "loss": 3.4304, + "step": 12 + }, + { + "epoch": 0.0056232934115547865, + "grad_norm": 10.237251281738281, + "learning_rate": 1.04e-05, + "loss": 3.3628, + "step": 13 + }, + { + "epoch": 0.006055854443212847, + "grad_norm": 10.820761680603027, + "learning_rate": 1.1200000000000001e-05, + "loss": 3.4281, + "step": 14 + }, + { + "epoch": 0.006488415474870907, + "grad_norm": 10.799651145935059, + "learning_rate": 1.2e-05, + "loss": 3.4524, + "step": 15 + }, + { + "epoch": 0.006920976506528968, + "grad_norm": 11.567938804626465, + "learning_rate": 1.2800000000000001e-05, + "loss": 3.4891, + "step": 16 + }, + { + "epoch": 0.007353537538187028, + "grad_norm": 11.040711402893066, + "learning_rate": 1.3600000000000002e-05, + "loss": 3.3775, + "step": 17 + }, + { + "epoch": 0.0077860985698450895, + "grad_norm": 10.833939552307129, + "learning_rate": 1.4400000000000001e-05, + "loss": 3.4044, + "step": 18 + }, + { + "epoch": 0.008218659601503149, + "grad_norm": 11.41971206665039, + "learning_rate": 1.5200000000000002e-05, + "loss": 3.4054, + "step": 19 + }, + { + "epoch": 0.00865122063316121, + "grad_norm": 11.41285514831543, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.2957, + "step": 20 + }, + { + "epoch": 0.00908378166481927, + "grad_norm": 11.056841850280762, + "learning_rate": 1.6800000000000002e-05, + "loss": 3.3388, + "step": 21 + }, + { + "epoch": 0.009516342696477331, + "grad_norm": 11.64125919342041, + "learning_rate": 1.76e-05, + "loss": 3.3148, + "step": 22 + }, + { + "epoch": 0.00994890372813539, + "grad_norm": 11.749085426330566, + "learning_rate": 1.8400000000000003e-05, + "loss": 3.3344, + "step": 23 + }, + { + "epoch": 0.010381464759793452, + "grad_norm": 10.946850776672363, + "learning_rate": 1.9200000000000003e-05, + "loss": 3.3136, + "step": 24 + }, + { + "epoch": 0.010814025791451513, + "grad_norm": 11.453843116760254, + "learning_rate": 2e-05, + "loss": 3.2952, + "step": 25 + }, + { + "epoch": 0.011246586823109573, + "grad_norm": 10.720843315124512, + "learning_rate": 1.99912510936133e-05, + "loss": 3.3089, + "step": 26 + }, + { + "epoch": 0.011679147854767634, + "grad_norm": 10.230990409851074, + "learning_rate": 1.9982502187226597e-05, + "loss": 3.1701, + "step": 27 + }, + { + "epoch": 0.012111708886425694, + "grad_norm": 10.450053215026855, + "learning_rate": 1.9973753280839896e-05, + "loss": 3.3022, + "step": 28 + }, + { + "epoch": 0.012544269918083755, + "grad_norm": 9.970443725585938, + "learning_rate": 1.9965004374453195e-05, + "loss": 3.1913, + "step": 29 + }, + { + "epoch": 0.012976830949741815, + "grad_norm": 8.793647766113281, + "learning_rate": 1.9956255468066494e-05, + "loss": 3.1559, + "step": 30 + }, + { + "epoch": 0.013409391981399876, + "grad_norm": 9.256972312927246, + "learning_rate": 1.9947506561679793e-05, + "loss": 3.0502, + "step": 31 + }, + { + "epoch": 0.013841953013057936, + "grad_norm": 9.748438835144043, + "learning_rate": 1.993875765529309e-05, + "loss": 3.1057, + "step": 32 + }, + { + "epoch": 0.014274514044715997, + "grad_norm": 9.952435493469238, + "learning_rate": 1.993000874890639e-05, + "loss": 3.0494, + "step": 33 + }, + { + "epoch": 0.014707075076374056, + "grad_norm": 9.690461158752441, + "learning_rate": 1.9921259842519688e-05, + "loss": 3.0637, + "step": 34 + }, + { + "epoch": 0.015139636108032118, + "grad_norm": 9.847665786743164, + "learning_rate": 1.9912510936132984e-05, + "loss": 3.039, + "step": 35 + }, + { + "epoch": 0.015572197139690179, + "grad_norm": 8.799947738647461, + "learning_rate": 1.9903762029746283e-05, + "loss": 3.0492, + "step": 36 + }, + { + "epoch": 0.01600475817134824, + "grad_norm": 9.004206657409668, + "learning_rate": 1.9895013123359582e-05, + "loss": 2.9891, + "step": 37 + }, + { + "epoch": 0.016437319203006298, + "grad_norm": 9.604561805725098, + "learning_rate": 1.9886264216972878e-05, + "loss": 2.9614, + "step": 38 + }, + { + "epoch": 0.01686988023466436, + "grad_norm": 9.404193878173828, + "learning_rate": 1.9877515310586177e-05, + "loss": 2.9704, + "step": 39 + }, + { + "epoch": 0.01730244126632242, + "grad_norm": 10.134177207946777, + "learning_rate": 1.9868766404199476e-05, + "loss": 2.8793, + "step": 40 + }, + { + "epoch": 0.01773500229798048, + "grad_norm": 8.96322250366211, + "learning_rate": 1.9860017497812775e-05, + "loss": 2.913, + "step": 41 + }, + { + "epoch": 0.01816756332963854, + "grad_norm": 8.704127311706543, + "learning_rate": 1.9851268591426075e-05, + "loss": 2.8886, + "step": 42 + }, + { + "epoch": 0.018600124361296603, + "grad_norm": 9.069153785705566, + "learning_rate": 1.984251968503937e-05, + "loss": 2.8725, + "step": 43 + }, + { + "epoch": 0.019032685392954662, + "grad_norm": 9.031394004821777, + "learning_rate": 1.983377077865267e-05, + "loss": 2.8581, + "step": 44 + }, + { + "epoch": 0.019465246424612722, + "grad_norm": 8.829975128173828, + "learning_rate": 1.982502187226597e-05, + "loss": 2.8044, + "step": 45 + }, + { + "epoch": 0.01989780745627078, + "grad_norm": 8.565601348876953, + "learning_rate": 1.9816272965879265e-05, + "loss": 2.8094, + "step": 46 + }, + { + "epoch": 0.020330368487928845, + "grad_norm": 8.82313060760498, + "learning_rate": 1.9807524059492564e-05, + "loss": 2.7984, + "step": 47 + }, + { + "epoch": 0.020762929519586904, + "grad_norm": 9.0712251663208, + "learning_rate": 1.9798775153105863e-05, + "loss": 2.6778, + "step": 48 + }, + { + "epoch": 0.021195490551244964, + "grad_norm": 9.187145233154297, + "learning_rate": 1.9790026246719162e-05, + "loss": 2.7568, + "step": 49 + }, + { + "epoch": 0.021628051582903027, + "grad_norm": 9.249820709228516, + "learning_rate": 1.978127734033246e-05, + "loss": 2.7787, + "step": 50 + }, + { + "epoch": 0.022060612614561086, + "grad_norm": 8.687273025512695, + "learning_rate": 1.977252843394576e-05, + "loss": 2.7557, + "step": 51 + }, + { + "epoch": 0.022493173646219146, + "grad_norm": 9.046874046325684, + "learning_rate": 1.9763779527559057e-05, + "loss": 2.6655, + "step": 52 + }, + { + "epoch": 0.022925734677877205, + "grad_norm": 9.147759437561035, + "learning_rate": 1.9755030621172356e-05, + "loss": 2.6384, + "step": 53 + }, + { + "epoch": 0.02335829570953527, + "grad_norm": 8.494421005249023, + "learning_rate": 1.9746281714785655e-05, + "loss": 2.668, + "step": 54 + }, + { + "epoch": 0.023790856741193328, + "grad_norm": 9.623828887939453, + "learning_rate": 1.973753280839895e-05, + "loss": 2.5583, + "step": 55 + }, + { + "epoch": 0.024223417772851388, + "grad_norm": 8.837956428527832, + "learning_rate": 1.972878390201225e-05, + "loss": 2.657, + "step": 56 + }, + { + "epoch": 0.024655978804509447, + "grad_norm": 9.294939041137695, + "learning_rate": 1.972003499562555e-05, + "loss": 2.5761, + "step": 57 + }, + { + "epoch": 0.02508853983616751, + "grad_norm": 9.436785697937012, + "learning_rate": 1.9711286089238845e-05, + "loss": 2.5929, + "step": 58 + }, + { + "epoch": 0.02552110086782557, + "grad_norm": 8.875762939453125, + "learning_rate": 1.9702537182852148e-05, + "loss": 2.6005, + "step": 59 + }, + { + "epoch": 0.02595366189948363, + "grad_norm": 8.302167892456055, + "learning_rate": 1.9693788276465443e-05, + "loss": 2.5536, + "step": 60 + }, + { + "epoch": 0.026386222931141692, + "grad_norm": 8.504595756530762, + "learning_rate": 1.9685039370078743e-05, + "loss": 2.479, + "step": 61 + }, + { + "epoch": 0.026818783962799752, + "grad_norm": 9.210023880004883, + "learning_rate": 1.9676290463692042e-05, + "loss": 2.4938, + "step": 62 + }, + { + "epoch": 0.02725134499445781, + "grad_norm": 8.98862075805664, + "learning_rate": 1.9667541557305338e-05, + "loss": 2.4786, + "step": 63 + }, + { + "epoch": 0.02768390602611587, + "grad_norm": 8.775471687316895, + "learning_rate": 1.9658792650918637e-05, + "loss": 2.4588, + "step": 64 + }, + { + "epoch": 0.028116467057773934, + "grad_norm": 8.455652236938477, + "learning_rate": 1.9650043744531936e-05, + "loss": 2.5419, + "step": 65 + }, + { + "epoch": 0.028549028089431994, + "grad_norm": 8.13263988494873, + "learning_rate": 1.9641294838145232e-05, + "loss": 2.4738, + "step": 66 + }, + { + "epoch": 0.028981589121090053, + "grad_norm": 8.96052360534668, + "learning_rate": 1.963254593175853e-05, + "loss": 2.3496, + "step": 67 + }, + { + "epoch": 0.029414150152748113, + "grad_norm": 8.10035228729248, + "learning_rate": 1.962379702537183e-05, + "loss": 2.4487, + "step": 68 + }, + { + "epoch": 0.029846711184406176, + "grad_norm": 8.069640159606934, + "learning_rate": 1.961504811898513e-05, + "loss": 2.4545, + "step": 69 + }, + { + "epoch": 0.030279272216064235, + "grad_norm": 8.711186408996582, + "learning_rate": 1.960629921259843e-05, + "loss": 2.3936, + "step": 70 + }, + { + "epoch": 0.030711833247722295, + "grad_norm": 9.187376022338867, + "learning_rate": 1.9597550306211725e-05, + "loss": 2.2613, + "step": 71 + }, + { + "epoch": 0.031144394279380358, + "grad_norm": 8.515965461730957, + "learning_rate": 1.9588801399825024e-05, + "loss": 2.4022, + "step": 72 + }, + { + "epoch": 0.031576955311038414, + "grad_norm": 8.901565551757812, + "learning_rate": 1.9580052493438323e-05, + "loss": 2.3848, + "step": 73 + }, + { + "epoch": 0.03200951634269648, + "grad_norm": 8.479150772094727, + "learning_rate": 1.957130358705162e-05, + "loss": 2.2304, + "step": 74 + }, + { + "epoch": 0.03244207737435454, + "grad_norm": 8.931798934936523, + "learning_rate": 1.9562554680664918e-05, + "loss": 2.3045, + "step": 75 + }, + { + "epoch": 0.032874638406012596, + "grad_norm": 8.757261276245117, + "learning_rate": 1.9553805774278217e-05, + "loss": 2.3887, + "step": 76 + }, + { + "epoch": 0.03330719943767066, + "grad_norm": 8.421159744262695, + "learning_rate": 1.9545056867891513e-05, + "loss": 2.3631, + "step": 77 + }, + { + "epoch": 0.03373976046932872, + "grad_norm": 8.453868865966797, + "learning_rate": 1.9536307961504816e-05, + "loss": 2.1783, + "step": 78 + }, + { + "epoch": 0.03417232150098678, + "grad_norm": 8.190146446228027, + "learning_rate": 1.952755905511811e-05, + "loss": 2.3526, + "step": 79 + }, + { + "epoch": 0.03460488253264484, + "grad_norm": 9.044288635253906, + "learning_rate": 1.951881014873141e-05, + "loss": 2.2243, + "step": 80 + }, + { + "epoch": 0.0350374435643029, + "grad_norm": 7.598982334136963, + "learning_rate": 1.951006124234471e-05, + "loss": 2.3016, + "step": 81 + }, + { + "epoch": 0.03547000459596096, + "grad_norm": 8.639673233032227, + "learning_rate": 1.9501312335958006e-05, + "loss": 2.2283, + "step": 82 + }, + { + "epoch": 0.035902565627619024, + "grad_norm": 8.017471313476562, + "learning_rate": 1.9492563429571305e-05, + "loss": 2.2479, + "step": 83 + }, + { + "epoch": 0.03633512665927708, + "grad_norm": 8.051676750183105, + "learning_rate": 1.9483814523184604e-05, + "loss": 2.2333, + "step": 84 + }, + { + "epoch": 0.03676768769093514, + "grad_norm": 9.102787971496582, + "learning_rate": 1.94750656167979e-05, + "loss": 2.1107, + "step": 85 + }, + { + "epoch": 0.037200248722593206, + "grad_norm": 7.457028388977051, + "learning_rate": 1.94663167104112e-05, + "loss": 2.307, + "step": 86 + }, + { + "epoch": 0.03763280975425126, + "grad_norm": 8.147334098815918, + "learning_rate": 1.9457567804024498e-05, + "loss": 2.1986, + "step": 87 + }, + { + "epoch": 0.038065370785909325, + "grad_norm": 8.232810974121094, + "learning_rate": 1.9448818897637797e-05, + "loss": 2.1094, + "step": 88 + }, + { + "epoch": 0.03849793181756739, + "grad_norm": 8.264384269714355, + "learning_rate": 1.9440069991251097e-05, + "loss": 2.2574, + "step": 89 + }, + { + "epoch": 0.038930492849225444, + "grad_norm": 8.394896507263184, + "learning_rate": 1.9431321084864392e-05, + "loss": 2.2311, + "step": 90 + }, + { + "epoch": 0.03936305388088351, + "grad_norm": 8.641265869140625, + "learning_rate": 1.9422572178477692e-05, + "loss": 2.1939, + "step": 91 + }, + { + "epoch": 0.03979561491254156, + "grad_norm": 7.687323570251465, + "learning_rate": 1.941382327209099e-05, + "loss": 2.1942, + "step": 92 + }, + { + "epoch": 0.040228175944199626, + "grad_norm": 7.773819923400879, + "learning_rate": 1.9405074365704287e-05, + "loss": 2.2004, + "step": 93 + }, + { + "epoch": 0.04066073697585769, + "grad_norm": 8.127384185791016, + "learning_rate": 1.9396325459317586e-05, + "loss": 2.1736, + "step": 94 + }, + { + "epoch": 0.041093298007515745, + "grad_norm": 9.069780349731445, + "learning_rate": 1.9387576552930885e-05, + "loss": 2.0747, + "step": 95 + }, + { + "epoch": 0.04152585903917381, + "grad_norm": 7.772279739379883, + "learning_rate": 1.9378827646544184e-05, + "loss": 2.1285, + "step": 96 + }, + { + "epoch": 0.04195842007083187, + "grad_norm": 8.430010795593262, + "learning_rate": 1.937007874015748e-05, + "loss": 2.2845, + "step": 97 + }, + { + "epoch": 0.04239098110248993, + "grad_norm": 7.560873508453369, + "learning_rate": 1.9361329833770783e-05, + "loss": 2.1905, + "step": 98 + }, + { + "epoch": 0.04282354213414799, + "grad_norm": 9.867399215698242, + "learning_rate": 1.935258092738408e-05, + "loss": 2.1375, + "step": 99 + }, + { + "epoch": 0.043256103165806054, + "grad_norm": 9.414137840270996, + "learning_rate": 1.9343832020997378e-05, + "loss": 2.059, + "step": 100 + }, + { + "epoch": 0.04368866419746411, + "grad_norm": 8.068355560302734, + "learning_rate": 1.9335083114610677e-05, + "loss": 2.1939, + "step": 101 + }, + { + "epoch": 0.04412122522912217, + "grad_norm": 7.778262615203857, + "learning_rate": 1.9326334208223973e-05, + "loss": 2.2688, + "step": 102 + }, + { + "epoch": 0.04455378626078023, + "grad_norm": 8.01554012298584, + "learning_rate": 1.9317585301837272e-05, + "loss": 2.1017, + "step": 103 + }, + { + "epoch": 0.04498634729243829, + "grad_norm": 7.774407386779785, + "learning_rate": 1.930883639545057e-05, + "loss": 2.0725, + "step": 104 + }, + { + "epoch": 0.045418908324096355, + "grad_norm": 8.284996032714844, + "learning_rate": 1.9300087489063867e-05, + "loss": 2.0856, + "step": 105 + }, + { + "epoch": 0.04585146935575441, + "grad_norm": 10.201037406921387, + "learning_rate": 1.9291338582677166e-05, + "loss": 2.3246, + "step": 106 + }, + { + "epoch": 0.046284030387412474, + "grad_norm": 7.818156719207764, + "learning_rate": 1.9282589676290465e-05, + "loss": 2.041, + "step": 107 + }, + { + "epoch": 0.04671659141907054, + "grad_norm": 10.078817367553711, + "learning_rate": 1.9273840769903765e-05, + "loss": 2.0881, + "step": 108 + }, + { + "epoch": 0.04714915245072859, + "grad_norm": 7.91151762008667, + "learning_rate": 1.9265091863517064e-05, + "loss": 2.0693, + "step": 109 + }, + { + "epoch": 0.047581713482386656, + "grad_norm": 8.433507919311523, + "learning_rate": 1.925634295713036e-05, + "loss": 2.1478, + "step": 110 + }, + { + "epoch": 0.04801427451404472, + "grad_norm": 8.348559379577637, + "learning_rate": 1.924759405074366e-05, + "loss": 2.0982, + "step": 111 + }, + { + "epoch": 0.048446835545702775, + "grad_norm": 8.86949348449707, + "learning_rate": 1.9238845144356958e-05, + "loss": 2.0297, + "step": 112 + }, + { + "epoch": 0.04887939657736084, + "grad_norm": 8.434800148010254, + "learning_rate": 1.9230096237970254e-05, + "loss": 2.1681, + "step": 113 + }, + { + "epoch": 0.049311957609018894, + "grad_norm": 9.036213874816895, + "learning_rate": 1.9221347331583553e-05, + "loss": 2.047, + "step": 114 + }, + { + "epoch": 0.04974451864067696, + "grad_norm": 9.136820793151855, + "learning_rate": 1.9212598425196852e-05, + "loss": 2.0121, + "step": 115 + }, + { + "epoch": 0.05017707967233502, + "grad_norm": 10.170919418334961, + "learning_rate": 1.9203849518810148e-05, + "loss": 2.1075, + "step": 116 + }, + { + "epoch": 0.050609640703993077, + "grad_norm": 8.642443656921387, + "learning_rate": 1.919510061242345e-05, + "loss": 2.1603, + "step": 117 + }, + { + "epoch": 0.05104220173565114, + "grad_norm": 8.400376319885254, + "learning_rate": 1.9186351706036747e-05, + "loss": 2.0528, + "step": 118 + }, + { + "epoch": 0.0514747627673092, + "grad_norm": 10.212697982788086, + "learning_rate": 1.9177602799650046e-05, + "loss": 2.0214, + "step": 119 + }, + { + "epoch": 0.05190732379896726, + "grad_norm": 8.315373420715332, + "learning_rate": 1.9168853893263345e-05, + "loss": 2.0902, + "step": 120 + }, + { + "epoch": 0.05233988483062532, + "grad_norm": 9.261432647705078, + "learning_rate": 1.916010498687664e-05, + "loss": 1.9828, + "step": 121 + }, + { + "epoch": 0.052772445862283385, + "grad_norm": 9.281742095947266, + "learning_rate": 1.915135608048994e-05, + "loss": 2.0072, + "step": 122 + }, + { + "epoch": 0.05320500689394144, + "grad_norm": 9.036360740661621, + "learning_rate": 1.914260717410324e-05, + "loss": 2.0366, + "step": 123 + }, + { + "epoch": 0.053637567925599504, + "grad_norm": 9.62345027923584, + "learning_rate": 1.9133858267716535e-05, + "loss": 2.0936, + "step": 124 + }, + { + "epoch": 0.05407012895725756, + "grad_norm": 8.963865280151367, + "learning_rate": 1.9125109361329834e-05, + "loss": 1.9454, + "step": 125 + }, + { + "epoch": 0.05450268998891562, + "grad_norm": 9.449174880981445, + "learning_rate": 1.9116360454943133e-05, + "loss": 2.0623, + "step": 126 + }, + { + "epoch": 0.054935251020573686, + "grad_norm": 9.598356246948242, + "learning_rate": 1.9107611548556433e-05, + "loss": 2.0225, + "step": 127 + }, + { + "epoch": 0.05536781205223174, + "grad_norm": 8.742138862609863, + "learning_rate": 1.9098862642169732e-05, + "loss": 2.0985, + "step": 128 + }, + { + "epoch": 0.055800373083889805, + "grad_norm": 9.00941276550293, + "learning_rate": 1.9090113735783028e-05, + "loss": 1.9905, + "step": 129 + }, + { + "epoch": 0.05623293411554787, + "grad_norm": 10.214828491210938, + "learning_rate": 1.9081364829396327e-05, + "loss": 2.0573, + "step": 130 + }, + { + "epoch": 0.056665495147205924, + "grad_norm": 8.646235466003418, + "learning_rate": 1.9072615923009626e-05, + "loss": 1.9669, + "step": 131 + }, + { + "epoch": 0.05709805617886399, + "grad_norm": 9.02608585357666, + "learning_rate": 1.9063867016622922e-05, + "loss": 1.9952, + "step": 132 + }, + { + "epoch": 0.05753061721052205, + "grad_norm": 9.741292953491211, + "learning_rate": 1.905511811023622e-05, + "loss": 2.0118, + "step": 133 + }, + { + "epoch": 0.057963178242180106, + "grad_norm": 9.34339427947998, + "learning_rate": 1.904636920384952e-05, + "loss": 2.0352, + "step": 134 + }, + { + "epoch": 0.05839573927383817, + "grad_norm": 9.542576789855957, + "learning_rate": 1.9037620297462816e-05, + "loss": 2.0067, + "step": 135 + }, + { + "epoch": 0.058828300305496226, + "grad_norm": 9.324414253234863, + "learning_rate": 1.902887139107612e-05, + "loss": 2.0344, + "step": 136 + }, + { + "epoch": 0.05926086133715429, + "grad_norm": 10.529446601867676, + "learning_rate": 1.9020122484689415e-05, + "loss": 1.905, + "step": 137 + }, + { + "epoch": 0.05969342236881235, + "grad_norm": 9.371881484985352, + "learning_rate": 1.9011373578302714e-05, + "loss": 1.9717, + "step": 138 + }, + { + "epoch": 0.06012598340047041, + "grad_norm": 10.534412384033203, + "learning_rate": 1.9002624671916013e-05, + "loss": 2.1448, + "step": 139 + }, + { + "epoch": 0.06055854443212847, + "grad_norm": 9.028956413269043, + "learning_rate": 1.899387576552931e-05, + "loss": 2.0132, + "step": 140 + }, + { + "epoch": 0.060991105463786534, + "grad_norm": 9.384864807128906, + "learning_rate": 1.8985126859142608e-05, + "loss": 1.9173, + "step": 141 + }, + { + "epoch": 0.06142366649544459, + "grad_norm": 9.299654960632324, + "learning_rate": 1.8976377952755907e-05, + "loss": 1.9669, + "step": 142 + }, + { + "epoch": 0.06185622752710265, + "grad_norm": 10.067913055419922, + "learning_rate": 1.8967629046369206e-05, + "loss": 1.9765, + "step": 143 + }, + { + "epoch": 0.062288788558760716, + "grad_norm": 10.266395568847656, + "learning_rate": 1.8958880139982502e-05, + "loss": 1.9275, + "step": 144 + }, + { + "epoch": 0.06272134959041878, + "grad_norm": 9.555045127868652, + "learning_rate": 1.89501312335958e-05, + "loss": 1.9977, + "step": 145 + }, + { + "epoch": 0.06315391062207683, + "grad_norm": 9.367684364318848, + "learning_rate": 1.89413823272091e-05, + "loss": 1.9813, + "step": 146 + }, + { + "epoch": 0.06358647165373489, + "grad_norm": 9.195287704467773, + "learning_rate": 1.89326334208224e-05, + "loss": 1.982, + "step": 147 + }, + { + "epoch": 0.06401903268539295, + "grad_norm": 11.219182014465332, + "learning_rate": 1.89238845144357e-05, + "loss": 2.0398, + "step": 148 + }, + { + "epoch": 0.06445159371705102, + "grad_norm": 10.429877281188965, + "learning_rate": 1.8915135608048995e-05, + "loss": 1.9312, + "step": 149 + }, + { + "epoch": 0.06488415474870908, + "grad_norm": 10.788617134094238, + "learning_rate": 1.8906386701662294e-05, + "loss": 1.9711, + "step": 150 + }, + { + "epoch": 0.06531671578036713, + "grad_norm": 11.090737342834473, + "learning_rate": 1.8897637795275593e-05, + "loss": 2.0233, + "step": 151 + }, + { + "epoch": 0.06574927681202519, + "grad_norm": 10.588007926940918, + "learning_rate": 1.888888888888889e-05, + "loss": 2.0727, + "step": 152 + }, + { + "epoch": 0.06618183784368326, + "grad_norm": 10.38557243347168, + "learning_rate": 1.888013998250219e-05, + "loss": 1.9655, + "step": 153 + }, + { + "epoch": 0.06661439887534132, + "grad_norm": 9.603656768798828, + "learning_rate": 1.8871391076115488e-05, + "loss": 1.9975, + "step": 154 + }, + { + "epoch": 0.06704695990699938, + "grad_norm": 9.86426830291748, + "learning_rate": 1.8862642169728787e-05, + "loss": 1.9229, + "step": 155 + }, + { + "epoch": 0.06747952093865744, + "grad_norm": 9.31710147857666, + "learning_rate": 1.8853893263342086e-05, + "loss": 1.9682, + "step": 156 + }, + { + "epoch": 0.0679120819703155, + "grad_norm": 10.599958419799805, + "learning_rate": 1.8845144356955382e-05, + "loss": 1.9162, + "step": 157 + }, + { + "epoch": 0.06834464300197356, + "grad_norm": 9.712906837463379, + "learning_rate": 1.883639545056868e-05, + "loss": 1.9684, + "step": 158 + }, + { + "epoch": 0.06877720403363162, + "grad_norm": 10.723039627075195, + "learning_rate": 1.882764654418198e-05, + "loss": 1.9253, + "step": 159 + }, + { + "epoch": 0.06920976506528968, + "grad_norm": 9.580291748046875, + "learning_rate": 1.8818897637795276e-05, + "loss": 2.0012, + "step": 160 + }, + { + "epoch": 0.06964232609694775, + "grad_norm": 10.511956214904785, + "learning_rate": 1.8810148731408575e-05, + "loss": 1.9845, + "step": 161 + }, + { + "epoch": 0.0700748871286058, + "grad_norm": 11.413307189941406, + "learning_rate": 1.8801399825021874e-05, + "loss": 1.9733, + "step": 162 + }, + { + "epoch": 0.07050744816026386, + "grad_norm": 9.986128807067871, + "learning_rate": 1.879265091863517e-05, + "loss": 1.8991, + "step": 163 + }, + { + "epoch": 0.07094000919192192, + "grad_norm": 10.067076683044434, + "learning_rate": 1.878390201224847e-05, + "loss": 1.9415, + "step": 164 + }, + { + "epoch": 0.07137257022357998, + "grad_norm": 10.817390441894531, + "learning_rate": 1.877515310586177e-05, + "loss": 1.965, + "step": 165 + }, + { + "epoch": 0.07180513125523805, + "grad_norm": 10.907906532287598, + "learning_rate": 1.8766404199475068e-05, + "loss": 1.8497, + "step": 166 + }, + { + "epoch": 0.07223769228689611, + "grad_norm": 12.694193840026855, + "learning_rate": 1.8757655293088367e-05, + "loss": 1.8588, + "step": 167 + }, + { + "epoch": 0.07267025331855416, + "grad_norm": 11.461287498474121, + "learning_rate": 1.8748906386701663e-05, + "loss": 1.9708, + "step": 168 + }, + { + "epoch": 0.07310281435021222, + "grad_norm": 12.703743934631348, + "learning_rate": 1.8740157480314962e-05, + "loss": 1.9838, + "step": 169 + }, + { + "epoch": 0.07353537538187029, + "grad_norm": 9.77420425415039, + "learning_rate": 1.873140857392826e-05, + "loss": 1.9551, + "step": 170 + }, + { + "epoch": 0.07396793641352835, + "grad_norm": 11.191120147705078, + "learning_rate": 1.8722659667541557e-05, + "loss": 2.001, + "step": 171 + }, + { + "epoch": 0.07440049744518641, + "grad_norm": 9.07374095916748, + "learning_rate": 1.8713910761154856e-05, + "loss": 1.874, + "step": 172 + }, + { + "epoch": 0.07483305847684446, + "grad_norm": 10.750893592834473, + "learning_rate": 1.8705161854768156e-05, + "loss": 1.8673, + "step": 173 + }, + { + "epoch": 0.07526561950850252, + "grad_norm": 10.637674331665039, + "learning_rate": 1.8696412948381455e-05, + "loss": 1.9057, + "step": 174 + }, + { + "epoch": 0.07569818054016059, + "grad_norm": 11.109024047851562, + "learning_rate": 1.8687664041994754e-05, + "loss": 1.9262, + "step": 175 + }, + { + "epoch": 0.07613074157181865, + "grad_norm": 10.31051254272461, + "learning_rate": 1.867891513560805e-05, + "loss": 1.9316, + "step": 176 + }, + { + "epoch": 0.07656330260347671, + "grad_norm": 10.382652282714844, + "learning_rate": 1.867016622922135e-05, + "loss": 1.8514, + "step": 177 + }, + { + "epoch": 0.07699586363513478, + "grad_norm": 11.28693962097168, + "learning_rate": 1.8661417322834648e-05, + "loss": 1.9258, + "step": 178 + }, + { + "epoch": 0.07742842466679282, + "grad_norm": 9.876675605773926, + "learning_rate": 1.8652668416447944e-05, + "loss": 1.935, + "step": 179 + }, + { + "epoch": 0.07786098569845089, + "grad_norm": 10.760346412658691, + "learning_rate": 1.8643919510061243e-05, + "loss": 1.8479, + "step": 180 + }, + { + "epoch": 0.07829354673010895, + "grad_norm": 10.346819877624512, + "learning_rate": 1.8635170603674542e-05, + "loss": 1.8821, + "step": 181 + }, + { + "epoch": 0.07872610776176701, + "grad_norm": 10.750500679016113, + "learning_rate": 1.8626421697287838e-05, + "loss": 1.8996, + "step": 182 + }, + { + "epoch": 0.07915866879342508, + "grad_norm": 11.348615646362305, + "learning_rate": 1.8617672790901137e-05, + "loss": 1.8542, + "step": 183 + }, + { + "epoch": 0.07959122982508313, + "grad_norm": 10.184338569641113, + "learning_rate": 1.8608923884514437e-05, + "loss": 1.9138, + "step": 184 + }, + { + "epoch": 0.08002379085674119, + "grad_norm": 12.307705879211426, + "learning_rate": 1.8600174978127736e-05, + "loss": 1.8653, + "step": 185 + }, + { + "epoch": 0.08045635188839925, + "grad_norm": 9.707239151000977, + "learning_rate": 1.8591426071741035e-05, + "loss": 1.9195, + "step": 186 + }, + { + "epoch": 0.08088891292005732, + "grad_norm": 10.568989753723145, + "learning_rate": 1.858267716535433e-05, + "loss": 1.8921, + "step": 187 + }, + { + "epoch": 0.08132147395171538, + "grad_norm": 10.401839256286621, + "learning_rate": 1.857392825896763e-05, + "loss": 1.9463, + "step": 188 + }, + { + "epoch": 0.08175403498337344, + "grad_norm": 11.29499626159668, + "learning_rate": 1.856517935258093e-05, + "loss": 1.8881, + "step": 189 + }, + { + "epoch": 0.08218659601503149, + "grad_norm": 12.853240013122559, + "learning_rate": 1.855643044619423e-05, + "loss": 1.8063, + "step": 190 + }, + { + "epoch": 0.08261915704668955, + "grad_norm": 12.939081192016602, + "learning_rate": 1.8547681539807524e-05, + "loss": 1.8993, + "step": 191 + }, + { + "epoch": 0.08305171807834762, + "grad_norm": 11.397017478942871, + "learning_rate": 1.8538932633420824e-05, + "loss": 1.9577, + "step": 192 + }, + { + "epoch": 0.08348427911000568, + "grad_norm": 10.588729858398438, + "learning_rate": 1.8530183727034123e-05, + "loss": 1.9725, + "step": 193 + }, + { + "epoch": 0.08391684014166374, + "grad_norm": 10.367941856384277, + "learning_rate": 1.8521434820647422e-05, + "loss": 1.9014, + "step": 194 + }, + { + "epoch": 0.08434940117332179, + "grad_norm": 10.358473777770996, + "learning_rate": 1.851268591426072e-05, + "loss": 1.9505, + "step": 195 + }, + { + "epoch": 0.08478196220497985, + "grad_norm": 10.932351112365723, + "learning_rate": 1.8503937007874017e-05, + "loss": 1.9628, + "step": 196 + }, + { + "epoch": 0.08521452323663792, + "grad_norm": 10.701963424682617, + "learning_rate": 1.8495188101487316e-05, + "loss": 1.8751, + "step": 197 + }, + { + "epoch": 0.08564708426829598, + "grad_norm": 12.282137870788574, + "learning_rate": 1.8486439195100615e-05, + "loss": 1.7744, + "step": 198 + }, + { + "epoch": 0.08607964529995404, + "grad_norm": 11.679487228393555, + "learning_rate": 1.847769028871391e-05, + "loss": 1.8378, + "step": 199 + }, + { + "epoch": 0.08651220633161211, + "grad_norm": 15.917386054992676, + "learning_rate": 1.846894138232721e-05, + "loss": 1.8038, + "step": 200 + }, + { + "epoch": 0.08694476736327016, + "grad_norm": 13.143421173095703, + "learning_rate": 1.846019247594051e-05, + "loss": 1.9285, + "step": 201 + }, + { + "epoch": 0.08737732839492822, + "grad_norm": 12.354574203491211, + "learning_rate": 1.8451443569553805e-05, + "loss": 1.7922, + "step": 202 + }, + { + "epoch": 0.08780988942658628, + "grad_norm": 13.790152549743652, + "learning_rate": 1.8442694663167108e-05, + "loss": 1.9562, + "step": 203 + }, + { + "epoch": 0.08824245045824435, + "grad_norm": 11.429886817932129, + "learning_rate": 1.8433945756780404e-05, + "loss": 1.7733, + "step": 204 + }, + { + "epoch": 0.08867501148990241, + "grad_norm": 11.52943229675293, + "learning_rate": 1.8425196850393703e-05, + "loss": 1.8627, + "step": 205 + }, + { + "epoch": 0.08910757252156046, + "grad_norm": 11.378132820129395, + "learning_rate": 1.8416447944007002e-05, + "loss": 1.7967, + "step": 206 + }, + { + "epoch": 0.08954013355321852, + "grad_norm": 10.589323997497559, + "learning_rate": 1.8407699037620298e-05, + "loss": 1.9194, + "step": 207 + }, + { + "epoch": 0.08997269458487658, + "grad_norm": 10.294939994812012, + "learning_rate": 1.8398950131233597e-05, + "loss": 1.8982, + "step": 208 + }, + { + "epoch": 0.09040525561653465, + "grad_norm": 10.35794448852539, + "learning_rate": 1.8390201224846896e-05, + "loss": 1.9172, + "step": 209 + }, + { + "epoch": 0.09083781664819271, + "grad_norm": 11.790461540222168, + "learning_rate": 1.8381452318460192e-05, + "loss": 1.8459, + "step": 210 + }, + { + "epoch": 0.09127037767985077, + "grad_norm": 12.333897590637207, + "learning_rate": 1.837270341207349e-05, + "loss": 1.7778, + "step": 211 + }, + { + "epoch": 0.09170293871150882, + "grad_norm": 12.425847053527832, + "learning_rate": 1.836395450568679e-05, + "loss": 1.8701, + "step": 212 + }, + { + "epoch": 0.09213549974316688, + "grad_norm": 11.710013389587402, + "learning_rate": 1.835520559930009e-05, + "loss": 1.7047, + "step": 213 + }, + { + "epoch": 0.09256806077482495, + "grad_norm": 11.748705863952637, + "learning_rate": 1.834645669291339e-05, + "loss": 1.8291, + "step": 214 + }, + { + "epoch": 0.09300062180648301, + "grad_norm": 11.944961547851562, + "learning_rate": 1.8337707786526685e-05, + "loss": 1.8243, + "step": 215 + }, + { + "epoch": 0.09343318283814107, + "grad_norm": 11.755900382995605, + "learning_rate": 1.8328958880139984e-05, + "loss": 1.883, + "step": 216 + }, + { + "epoch": 0.09386574386979912, + "grad_norm": 12.01804256439209, + "learning_rate": 1.8320209973753283e-05, + "loss": 1.8377, + "step": 217 + }, + { + "epoch": 0.09429830490145719, + "grad_norm": 11.53456974029541, + "learning_rate": 1.831146106736658e-05, + "loss": 1.7949, + "step": 218 + }, + { + "epoch": 0.09473086593311525, + "grad_norm": 10.804949760437012, + "learning_rate": 1.830271216097988e-05, + "loss": 1.8741, + "step": 219 + }, + { + "epoch": 0.09516342696477331, + "grad_norm": 11.649739265441895, + "learning_rate": 1.8293963254593178e-05, + "loss": 1.8424, + "step": 220 + }, + { + "epoch": 0.09559598799643138, + "grad_norm": 11.372157096862793, + "learning_rate": 1.8285214348206473e-05, + "loss": 1.7853, + "step": 221 + }, + { + "epoch": 0.09602854902808944, + "grad_norm": 10.01382064819336, + "learning_rate": 1.8276465441819776e-05, + "loss": 1.8669, + "step": 222 + }, + { + "epoch": 0.09646111005974749, + "grad_norm": 11.315641403198242, + "learning_rate": 1.8267716535433072e-05, + "loss": 1.7636, + "step": 223 + }, + { + "epoch": 0.09689367109140555, + "grad_norm": 12.253576278686523, + "learning_rate": 1.825896762904637e-05, + "loss": 1.8279, + "step": 224 + }, + { + "epoch": 0.09732623212306361, + "grad_norm": 11.940933227539062, + "learning_rate": 1.825021872265967e-05, + "loss": 1.9796, + "step": 225 + }, + { + "epoch": 0.09775879315472168, + "grad_norm": 11.986372947692871, + "learning_rate": 1.8241469816272966e-05, + "loss": 1.8362, + "step": 226 + }, + { + "epoch": 0.09819135418637974, + "grad_norm": 12.15494155883789, + "learning_rate": 1.8232720909886265e-05, + "loss": 1.8801, + "step": 227 + }, + { + "epoch": 0.09862391521803779, + "grad_norm": 11.447915077209473, + "learning_rate": 1.8223972003499564e-05, + "loss": 1.8247, + "step": 228 + }, + { + "epoch": 0.09905647624969585, + "grad_norm": 11.581623077392578, + "learning_rate": 1.821522309711286e-05, + "loss": 1.8369, + "step": 229 + }, + { + "epoch": 0.09948903728135391, + "grad_norm": 11.14620590209961, + "learning_rate": 1.820647419072616e-05, + "loss": 1.8653, + "step": 230 + }, + { + "epoch": 0.09992159831301198, + "grad_norm": 12.83015251159668, + "learning_rate": 1.819772528433946e-05, + "loss": 1.8845, + "step": 231 + }, + { + "epoch": 0.10035415934467004, + "grad_norm": 11.10083293914795, + "learning_rate": 1.8188976377952758e-05, + "loss": 1.837, + "step": 232 + }, + { + "epoch": 0.1007867203763281, + "grad_norm": 12.611995697021484, + "learning_rate": 1.8180227471566057e-05, + "loss": 1.8799, + "step": 233 + }, + { + "epoch": 0.10121928140798615, + "grad_norm": 14.688155174255371, + "learning_rate": 1.8171478565179353e-05, + "loss": 1.9175, + "step": 234 + }, + { + "epoch": 0.10165184243964422, + "grad_norm": 11.024300575256348, + "learning_rate": 1.8162729658792652e-05, + "loss": 1.7136, + "step": 235 + }, + { + "epoch": 0.10208440347130228, + "grad_norm": 14.889878273010254, + "learning_rate": 1.815398075240595e-05, + "loss": 1.7828, + "step": 236 + }, + { + "epoch": 0.10251696450296034, + "grad_norm": 11.804996490478516, + "learning_rate": 1.814523184601925e-05, + "loss": 1.8391, + "step": 237 + }, + { + "epoch": 0.1029495255346184, + "grad_norm": 11.816452026367188, + "learning_rate": 1.8136482939632546e-05, + "loss": 1.8498, + "step": 238 + }, + { + "epoch": 0.10338208656627645, + "grad_norm": 13.521631240844727, + "learning_rate": 1.8127734033245846e-05, + "loss": 1.7968, + "step": 239 + }, + { + "epoch": 0.10381464759793452, + "grad_norm": 11.820068359375, + "learning_rate": 1.8118985126859145e-05, + "loss": 1.8115, + "step": 240 + }, + { + "epoch": 0.10424720862959258, + "grad_norm": 14.849625587463379, + "learning_rate": 1.811023622047244e-05, + "loss": 1.7471, + "step": 241 + }, + { + "epoch": 0.10467976966125064, + "grad_norm": 11.63190746307373, + "learning_rate": 1.8101487314085743e-05, + "loss": 1.7383, + "step": 242 + }, + { + "epoch": 0.1051123306929087, + "grad_norm": 11.374736785888672, + "learning_rate": 1.809273840769904e-05, + "loss": 1.8412, + "step": 243 + }, + { + "epoch": 0.10554489172456677, + "grad_norm": 14.540604591369629, + "learning_rate": 1.8083989501312338e-05, + "loss": 1.7489, + "step": 244 + }, + { + "epoch": 0.10597745275622482, + "grad_norm": 12.734722137451172, + "learning_rate": 1.8075240594925637e-05, + "loss": 1.8997, + "step": 245 + }, + { + "epoch": 0.10641001378788288, + "grad_norm": 13.632735252380371, + "learning_rate": 1.8066491688538933e-05, + "loss": 1.8283, + "step": 246 + }, + { + "epoch": 0.10684257481954094, + "grad_norm": 13.188791275024414, + "learning_rate": 1.8057742782152232e-05, + "loss": 1.7936, + "step": 247 + }, + { + "epoch": 0.10727513585119901, + "grad_norm": 12.36187744140625, + "learning_rate": 1.804899387576553e-05, + "loss": 1.7985, + "step": 248 + }, + { + "epoch": 0.10770769688285707, + "grad_norm": 13.20405101776123, + "learning_rate": 1.8040244969378827e-05, + "loss": 1.7463, + "step": 249 + }, + { + "epoch": 0.10814025791451512, + "grad_norm": 12.463266372680664, + "learning_rate": 1.8031496062992127e-05, + "loss": 1.7374, + "step": 250 + }, + { + "epoch": 0.10857281894617318, + "grad_norm": 13.607605934143066, + "learning_rate": 1.8022747156605426e-05, + "loss": 1.7598, + "step": 251 + }, + { + "epoch": 0.10900537997783125, + "grad_norm": 13.48353099822998, + "learning_rate": 1.8013998250218725e-05, + "loss": 1.7661, + "step": 252 + }, + { + "epoch": 0.10943794100948931, + "grad_norm": 13.995173454284668, + "learning_rate": 1.8005249343832024e-05, + "loss": 1.7972, + "step": 253 + }, + { + "epoch": 0.10987050204114737, + "grad_norm": 14.007890701293945, + "learning_rate": 1.799650043744532e-05, + "loss": 1.7641, + "step": 254 + }, + { + "epoch": 0.11030306307280544, + "grad_norm": 16.224395751953125, + "learning_rate": 1.798775153105862e-05, + "loss": 1.8158, + "step": 255 + }, + { + "epoch": 0.11073562410446348, + "grad_norm": 15.336459159851074, + "learning_rate": 1.797900262467192e-05, + "loss": 1.7918, + "step": 256 + }, + { + "epoch": 0.11116818513612155, + "grad_norm": 14.77893352508545, + "learning_rate": 1.7970253718285214e-05, + "loss": 1.8369, + "step": 257 + }, + { + "epoch": 0.11160074616777961, + "grad_norm": 12.808744430541992, + "learning_rate": 1.7961504811898514e-05, + "loss": 1.7846, + "step": 258 + }, + { + "epoch": 0.11203330719943767, + "grad_norm": 11.46886920928955, + "learning_rate": 1.7952755905511813e-05, + "loss": 1.8555, + "step": 259 + }, + { + "epoch": 0.11246586823109574, + "grad_norm": 13.576665878295898, + "learning_rate": 1.794400699912511e-05, + "loss": 1.8384, + "step": 260 + }, + { + "epoch": 0.11289842926275379, + "grad_norm": 13.28268814086914, + "learning_rate": 1.793525809273841e-05, + "loss": 1.809, + "step": 261 + }, + { + "epoch": 0.11333099029441185, + "grad_norm": 11.743298530578613, + "learning_rate": 1.7926509186351707e-05, + "loss": 1.7515, + "step": 262 + }, + { + "epoch": 0.11376355132606991, + "grad_norm": 12.474933624267578, + "learning_rate": 1.7917760279965006e-05, + "loss": 1.7624, + "step": 263 + }, + { + "epoch": 0.11419611235772797, + "grad_norm": 13.138879776000977, + "learning_rate": 1.7909011373578305e-05, + "loss": 1.8352, + "step": 264 + }, + { + "epoch": 0.11462867338938604, + "grad_norm": 11.728463172912598, + "learning_rate": 1.79002624671916e-05, + "loss": 1.8015, + "step": 265 + }, + { + "epoch": 0.1150612344210441, + "grad_norm": 11.462135314941406, + "learning_rate": 1.78915135608049e-05, + "loss": 1.7868, + "step": 266 + }, + { + "epoch": 0.11549379545270215, + "grad_norm": 12.589030265808105, + "learning_rate": 1.78827646544182e-05, + "loss": 1.6825, + "step": 267 + }, + { + "epoch": 0.11592635648436021, + "grad_norm": 14.064338684082031, + "learning_rate": 1.7874015748031495e-05, + "loss": 1.7202, + "step": 268 + }, + { + "epoch": 0.11635891751601828, + "grad_norm": 12.65044116973877, + "learning_rate": 1.7865266841644795e-05, + "loss": 1.7865, + "step": 269 + }, + { + "epoch": 0.11679147854767634, + "grad_norm": 13.943512916564941, + "learning_rate": 1.7856517935258094e-05, + "loss": 1.7506, + "step": 270 + }, + { + "epoch": 0.1172240395793344, + "grad_norm": 13.109914779663086, + "learning_rate": 1.7847769028871393e-05, + "loss": 1.7659, + "step": 271 + }, + { + "epoch": 0.11765660061099245, + "grad_norm": 12.937542915344238, + "learning_rate": 1.7839020122484692e-05, + "loss": 1.8309, + "step": 272 + }, + { + "epoch": 0.11808916164265051, + "grad_norm": 12.564200401306152, + "learning_rate": 1.7830271216097988e-05, + "loss": 1.8121, + "step": 273 + }, + { + "epoch": 0.11852172267430858, + "grad_norm": 12.3132905960083, + "learning_rate": 1.7821522309711287e-05, + "loss": 1.8254, + "step": 274 + }, + { + "epoch": 0.11895428370596664, + "grad_norm": 10.902737617492676, + "learning_rate": 1.7812773403324587e-05, + "loss": 1.7647, + "step": 275 + }, + { + "epoch": 0.1193868447376247, + "grad_norm": 11.777158737182617, + "learning_rate": 1.7804024496937882e-05, + "loss": 1.776, + "step": 276 + }, + { + "epoch": 0.11981940576928277, + "grad_norm": 13.244769096374512, + "learning_rate": 1.779527559055118e-05, + "loss": 1.7909, + "step": 277 + }, + { + "epoch": 0.12025196680094082, + "grad_norm": 13.334715843200684, + "learning_rate": 1.778652668416448e-05, + "loss": 1.7326, + "step": 278 + }, + { + "epoch": 0.12068452783259888, + "grad_norm": 11.51339340209961, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.7133, + "step": 279 + }, + { + "epoch": 0.12111708886425694, + "grad_norm": 13.407567024230957, + "learning_rate": 1.776902887139108e-05, + "loss": 1.8828, + "step": 280 + }, + { + "epoch": 0.121549649895915, + "grad_norm": 10.572535514831543, + "learning_rate": 1.7760279965004375e-05, + "loss": 1.8534, + "step": 281 + }, + { + "epoch": 0.12198221092757307, + "grad_norm": 12.543910026550293, + "learning_rate": 1.7751531058617674e-05, + "loss": 1.7876, + "step": 282 + }, + { + "epoch": 0.12241477195923112, + "grad_norm": 12.414093017578125, + "learning_rate": 1.7742782152230973e-05, + "loss": 1.7249, + "step": 283 + }, + { + "epoch": 0.12284733299088918, + "grad_norm": 12.800058364868164, + "learning_rate": 1.7734033245844273e-05, + "loss": 1.8623, + "step": 284 + }, + { + "epoch": 0.12327989402254724, + "grad_norm": 14.119792938232422, + "learning_rate": 1.772528433945757e-05, + "loss": 1.8873, + "step": 285 + }, + { + "epoch": 0.1237124550542053, + "grad_norm": 15.86700439453125, + "learning_rate": 1.7716535433070868e-05, + "loss": 1.6621, + "step": 286 + }, + { + "epoch": 0.12414501608586337, + "grad_norm": 12.095022201538086, + "learning_rate": 1.7707786526684167e-05, + "loss": 1.8166, + "step": 287 + }, + { + "epoch": 0.12457757711752143, + "grad_norm": 14.493571281433105, + "learning_rate": 1.7699037620297463e-05, + "loss": 1.9194, + "step": 288 + }, + { + "epoch": 0.1250101381491795, + "grad_norm": 12.202247619628906, + "learning_rate": 1.7690288713910762e-05, + "loss": 1.7355, + "step": 289 + }, + { + "epoch": 0.12544269918083756, + "grad_norm": 13.065779685974121, + "learning_rate": 1.768153980752406e-05, + "loss": 1.7089, + "step": 290 + }, + { + "epoch": 0.1258752602124956, + "grad_norm": 11.993766784667969, + "learning_rate": 1.767279090113736e-05, + "loss": 1.7966, + "step": 291 + }, + { + "epoch": 0.12630782124415366, + "grad_norm": 12.460329055786133, + "learning_rate": 1.766404199475066e-05, + "loss": 1.8666, + "step": 292 + }, + { + "epoch": 0.12674038227581172, + "grad_norm": 12.044783592224121, + "learning_rate": 1.7655293088363955e-05, + "loss": 1.7071, + "step": 293 + }, + { + "epoch": 0.12717294330746978, + "grad_norm": 14.959409713745117, + "learning_rate": 1.7646544181977255e-05, + "loss": 1.8463, + "step": 294 + }, + { + "epoch": 0.12760550433912785, + "grad_norm": 15.096085548400879, + "learning_rate": 1.7637795275590554e-05, + "loss": 1.8419, + "step": 295 + }, + { + "epoch": 0.1280380653707859, + "grad_norm": 12.444029808044434, + "learning_rate": 1.762904636920385e-05, + "loss": 1.7926, + "step": 296 + }, + { + "epoch": 0.12847062640244397, + "grad_norm": 11.794731140136719, + "learning_rate": 1.762029746281715e-05, + "loss": 1.7099, + "step": 297 + }, + { + "epoch": 0.12890318743410203, + "grad_norm": 12.06802749633789, + "learning_rate": 1.7611548556430448e-05, + "loss": 1.7576, + "step": 298 + }, + { + "epoch": 0.1293357484657601, + "grad_norm": 13.183172225952148, + "learning_rate": 1.7602799650043747e-05, + "loss": 1.7683, + "step": 299 + }, + { + "epoch": 0.12976830949741816, + "grad_norm": 12.662933349609375, + "learning_rate": 1.7594050743657046e-05, + "loss": 1.7571, + "step": 300 + }, + { + "epoch": 0.13020087052907622, + "grad_norm": 13.42810344696045, + "learning_rate": 1.7585301837270342e-05, + "loss": 1.8103, + "step": 301 + }, + { + "epoch": 0.13063343156073426, + "grad_norm": 12.349446296691895, + "learning_rate": 1.757655293088364e-05, + "loss": 1.8039, + "step": 302 + }, + { + "epoch": 0.13106599259239232, + "grad_norm": 12.955143928527832, + "learning_rate": 1.756780402449694e-05, + "loss": 1.7638, + "step": 303 + }, + { + "epoch": 0.13149855362405038, + "grad_norm": 12.840434074401855, + "learning_rate": 1.7559055118110236e-05, + "loss": 1.6824, + "step": 304 + }, + { + "epoch": 0.13193111465570845, + "grad_norm": 12.311904907226562, + "learning_rate": 1.7550306211723536e-05, + "loss": 1.7549, + "step": 305 + }, + { + "epoch": 0.1323636756873665, + "grad_norm": 14.21117877960205, + "learning_rate": 1.7541557305336835e-05, + "loss": 1.8167, + "step": 306 + }, + { + "epoch": 0.13279623671902457, + "grad_norm": 15.326268196105957, + "learning_rate": 1.753280839895013e-05, + "loss": 1.7197, + "step": 307 + }, + { + "epoch": 0.13322879775068264, + "grad_norm": 12.805957794189453, + "learning_rate": 1.752405949256343e-05, + "loss": 1.7673, + "step": 308 + }, + { + "epoch": 0.1336613587823407, + "grad_norm": 14.023848533630371, + "learning_rate": 1.751531058617673e-05, + "loss": 1.684, + "step": 309 + }, + { + "epoch": 0.13409391981399876, + "grad_norm": 14.051582336425781, + "learning_rate": 1.7506561679790028e-05, + "loss": 1.7981, + "step": 310 + }, + { + "epoch": 0.13452648084565683, + "grad_norm": 14.332208633422852, + "learning_rate": 1.7497812773403328e-05, + "loss": 1.7263, + "step": 311 + }, + { + "epoch": 0.1349590418773149, + "grad_norm": 13.481989860534668, + "learning_rate": 1.7489063867016623e-05, + "loss": 1.844, + "step": 312 + }, + { + "epoch": 0.13539160290897292, + "grad_norm": 13.109614372253418, + "learning_rate": 1.7480314960629923e-05, + "loss": 1.7525, + "step": 313 + }, + { + "epoch": 0.135824163940631, + "grad_norm": 12.776349067687988, + "learning_rate": 1.7471566054243222e-05, + "loss": 1.8072, + "step": 314 + }, + { + "epoch": 0.13625672497228905, + "grad_norm": 12.574403762817383, + "learning_rate": 1.7462817147856518e-05, + "loss": 1.7611, + "step": 315 + }, + { + "epoch": 0.1366892860039471, + "grad_norm": 13.381563186645508, + "learning_rate": 1.7454068241469817e-05, + "loss": 1.6969, + "step": 316 + }, + { + "epoch": 0.13712184703560518, + "grad_norm": 12.077308654785156, + "learning_rate": 1.7445319335083116e-05, + "loss": 1.8818, + "step": 317 + }, + { + "epoch": 0.13755440806726324, + "grad_norm": 12.719555854797363, + "learning_rate": 1.7436570428696415e-05, + "loss": 1.7705, + "step": 318 + }, + { + "epoch": 0.1379869690989213, + "grad_norm": 14.66213607788086, + "learning_rate": 1.7427821522309714e-05, + "loss": 1.6998, + "step": 319 + }, + { + "epoch": 0.13841953013057937, + "grad_norm": 13.779396057128906, + "learning_rate": 1.741907261592301e-05, + "loss": 1.7427, + "step": 320 + }, + { + "epoch": 0.13885209116223743, + "grad_norm": 14.436192512512207, + "learning_rate": 1.741032370953631e-05, + "loss": 1.746, + "step": 321 + }, + { + "epoch": 0.1392846521938955, + "grad_norm": 13.372818946838379, + "learning_rate": 1.740157480314961e-05, + "loss": 1.7303, + "step": 322 + }, + { + "epoch": 0.13971721322555355, + "grad_norm": 12.75151538848877, + "learning_rate": 1.7392825896762904e-05, + "loss": 1.7359, + "step": 323 + }, + { + "epoch": 0.1401497742572116, + "grad_norm": 14.832432746887207, + "learning_rate": 1.7384076990376204e-05, + "loss": 1.7595, + "step": 324 + }, + { + "epoch": 0.14058233528886965, + "grad_norm": 12.097084045410156, + "learning_rate": 1.7375328083989503e-05, + "loss": 1.8151, + "step": 325 + }, + { + "epoch": 0.14101489632052772, + "grad_norm": 15.27542781829834, + "learning_rate": 1.73665791776028e-05, + "loss": 1.7799, + "step": 326 + }, + { + "epoch": 0.14144745735218578, + "grad_norm": 13.651289939880371, + "learning_rate": 1.7357830271216098e-05, + "loss": 1.788, + "step": 327 + }, + { + "epoch": 0.14188001838384384, + "grad_norm": 12.324203491210938, + "learning_rate": 1.7349081364829397e-05, + "loss": 1.7426, + "step": 328 + }, + { + "epoch": 0.1423125794155019, + "grad_norm": 13.443849563598633, + "learning_rate": 1.7340332458442696e-05, + "loss": 1.7648, + "step": 329 + }, + { + "epoch": 0.14274514044715997, + "grad_norm": 15.6445894241333, + "learning_rate": 1.7331583552055995e-05, + "loss": 1.819, + "step": 330 + }, + { + "epoch": 0.14317770147881803, + "grad_norm": 14.16102409362793, + "learning_rate": 1.7322834645669295e-05, + "loss": 1.7316, + "step": 331 + }, + { + "epoch": 0.1436102625104761, + "grad_norm": 14.051115989685059, + "learning_rate": 1.731408573928259e-05, + "loss": 1.7395, + "step": 332 + }, + { + "epoch": 0.14404282354213416, + "grad_norm": 13.74497127532959, + "learning_rate": 1.730533683289589e-05, + "loss": 1.7931, + "step": 333 + }, + { + "epoch": 0.14447538457379222, + "grad_norm": 14.392535209655762, + "learning_rate": 1.729658792650919e-05, + "loss": 1.8631, + "step": 334 + }, + { + "epoch": 0.14490794560545026, + "grad_norm": 12.427038192749023, + "learning_rate": 1.7287839020122485e-05, + "loss": 1.7007, + "step": 335 + }, + { + "epoch": 0.14534050663710832, + "grad_norm": 13.627511024475098, + "learning_rate": 1.7279090113735784e-05, + "loss": 1.7324, + "step": 336 + }, + { + "epoch": 0.14577306766876638, + "grad_norm": 13.441803932189941, + "learning_rate": 1.7270341207349083e-05, + "loss": 1.7494, + "step": 337 + }, + { + "epoch": 0.14620562870042444, + "grad_norm": 15.683148384094238, + "learning_rate": 1.7261592300962382e-05, + "loss": 1.6773, + "step": 338 + }, + { + "epoch": 0.1466381897320825, + "grad_norm": 14.39584732055664, + "learning_rate": 1.725284339457568e-05, + "loss": 1.6515, + "step": 339 + }, + { + "epoch": 0.14707075076374057, + "grad_norm": 13.478503227233887, + "learning_rate": 1.7244094488188977e-05, + "loss": 1.7104, + "step": 340 + }, + { + "epoch": 0.14750331179539863, + "grad_norm": 13.521794319152832, + "learning_rate": 1.7235345581802277e-05, + "loss": 1.8039, + "step": 341 + }, + { + "epoch": 0.1479358728270567, + "grad_norm": 13.772107124328613, + "learning_rate": 1.7226596675415576e-05, + "loss": 1.7705, + "step": 342 + }, + { + "epoch": 0.14836843385871476, + "grad_norm": 12.936812400817871, + "learning_rate": 1.721784776902887e-05, + "loss": 1.6975, + "step": 343 + }, + { + "epoch": 0.14880099489037282, + "grad_norm": 13.462157249450684, + "learning_rate": 1.720909886264217e-05, + "loss": 1.7893, + "step": 344 + }, + { + "epoch": 0.14923355592203089, + "grad_norm": 12.636610984802246, + "learning_rate": 1.720034995625547e-05, + "loss": 1.7366, + "step": 345 + }, + { + "epoch": 0.14966611695368892, + "grad_norm": 12.809752464294434, + "learning_rate": 1.7191601049868766e-05, + "loss": 1.7415, + "step": 346 + }, + { + "epoch": 0.15009867798534698, + "grad_norm": 13.976734161376953, + "learning_rate": 1.718285214348207e-05, + "loss": 1.735, + "step": 347 + }, + { + "epoch": 0.15053123901700505, + "grad_norm": 12.980571746826172, + "learning_rate": 1.7174103237095364e-05, + "loss": 1.7759, + "step": 348 + }, + { + "epoch": 0.1509638000486631, + "grad_norm": 15.27466106414795, + "learning_rate": 1.7165354330708663e-05, + "loss": 1.7244, + "step": 349 + }, + { + "epoch": 0.15139636108032117, + "grad_norm": 14.315723419189453, + "learning_rate": 1.7156605424321963e-05, + "loss": 1.7684, + "step": 350 + }, + { + "epoch": 0.15182892211197924, + "grad_norm": 13.90048599243164, + "learning_rate": 1.714785651793526e-05, + "loss": 1.7475, + "step": 351 + }, + { + "epoch": 0.1522614831436373, + "grad_norm": 13.61722469329834, + "learning_rate": 1.7139107611548558e-05, + "loss": 1.6943, + "step": 352 + }, + { + "epoch": 0.15269404417529536, + "grad_norm": 14.208736419677734, + "learning_rate": 1.7130358705161857e-05, + "loss": 1.7269, + "step": 353 + }, + { + "epoch": 0.15312660520695343, + "grad_norm": 12.213624954223633, + "learning_rate": 1.7121609798775153e-05, + "loss": 1.7076, + "step": 354 + }, + { + "epoch": 0.1535591662386115, + "grad_norm": 14.757974624633789, + "learning_rate": 1.7112860892388452e-05, + "loss": 1.714, + "step": 355 + }, + { + "epoch": 0.15399172727026955, + "grad_norm": 13.660282135009766, + "learning_rate": 1.710411198600175e-05, + "loss": 1.7442, + "step": 356 + }, + { + "epoch": 0.1544242883019276, + "grad_norm": 13.772233963012695, + "learning_rate": 1.709536307961505e-05, + "loss": 1.7667, + "step": 357 + }, + { + "epoch": 0.15485684933358565, + "grad_norm": 14.523663520812988, + "learning_rate": 1.708661417322835e-05, + "loss": 1.6005, + "step": 358 + }, + { + "epoch": 0.1552894103652437, + "grad_norm": 14.664237976074219, + "learning_rate": 1.7077865266841645e-05, + "loss": 1.6852, + "step": 359 + }, + { + "epoch": 0.15572197139690178, + "grad_norm": 14.592924118041992, + "learning_rate": 1.7069116360454945e-05, + "loss": 1.7847, + "step": 360 + }, + { + "epoch": 0.15615453242855984, + "grad_norm": 13.490509033203125, + "learning_rate": 1.7060367454068244e-05, + "loss": 1.7272, + "step": 361 + }, + { + "epoch": 0.1565870934602179, + "grad_norm": 14.302976608276367, + "learning_rate": 1.705161854768154e-05, + "loss": 1.6654, + "step": 362 + }, + { + "epoch": 0.15701965449187597, + "grad_norm": 13.850686073303223, + "learning_rate": 1.704286964129484e-05, + "loss": 1.6631, + "step": 363 + }, + { + "epoch": 0.15745221552353403, + "grad_norm": 12.717830657958984, + "learning_rate": 1.7034120734908138e-05, + "loss": 1.7526, + "step": 364 + }, + { + "epoch": 0.1578847765551921, + "grad_norm": 14.529204368591309, + "learning_rate": 1.7025371828521434e-05, + "loss": 1.6707, + "step": 365 + }, + { + "epoch": 0.15831733758685015, + "grad_norm": 13.817510604858398, + "learning_rate": 1.7016622922134736e-05, + "loss": 1.7353, + "step": 366 + }, + { + "epoch": 0.15874989861850822, + "grad_norm": 15.481966018676758, + "learning_rate": 1.7007874015748032e-05, + "loss": 1.6683, + "step": 367 + }, + { + "epoch": 0.15918245965016625, + "grad_norm": 14.049205780029297, + "learning_rate": 1.699912510936133e-05, + "loss": 1.7273, + "step": 368 + }, + { + "epoch": 0.15961502068182432, + "grad_norm": 14.65227222442627, + "learning_rate": 1.699037620297463e-05, + "loss": 1.6432, + "step": 369 + }, + { + "epoch": 0.16004758171348238, + "grad_norm": 14.594015121459961, + "learning_rate": 1.6981627296587927e-05, + "loss": 1.8021, + "step": 370 + }, + { + "epoch": 0.16048014274514044, + "grad_norm": 13.285860061645508, + "learning_rate": 1.6972878390201226e-05, + "loss": 1.6821, + "step": 371 + }, + { + "epoch": 0.1609127037767985, + "grad_norm": 14.19520378112793, + "learning_rate": 1.6964129483814525e-05, + "loss": 1.6678, + "step": 372 + }, + { + "epoch": 0.16134526480845657, + "grad_norm": 14.195395469665527, + "learning_rate": 1.695538057742782e-05, + "loss": 1.7482, + "step": 373 + }, + { + "epoch": 0.16177782584011463, + "grad_norm": 14.017548561096191, + "learning_rate": 1.694663167104112e-05, + "loss": 1.7071, + "step": 374 + }, + { + "epoch": 0.1622103868717727, + "grad_norm": 14.560656547546387, + "learning_rate": 1.693788276465442e-05, + "loss": 1.711, + "step": 375 + }, + { + "epoch": 0.16264294790343076, + "grad_norm": 14.667351722717285, + "learning_rate": 1.692913385826772e-05, + "loss": 1.6147, + "step": 376 + }, + { + "epoch": 0.16307550893508882, + "grad_norm": 15.273613929748535, + "learning_rate": 1.6920384951881018e-05, + "loss": 1.7753, + "step": 377 + }, + { + "epoch": 0.16350806996674688, + "grad_norm": 15.08775806427002, + "learning_rate": 1.6911636045494317e-05, + "loss": 1.8437, + "step": 378 + }, + { + "epoch": 0.16394063099840492, + "grad_norm": 16.35053825378418, + "learning_rate": 1.6902887139107613e-05, + "loss": 1.6818, + "step": 379 + }, + { + "epoch": 0.16437319203006298, + "grad_norm": 14.675270080566406, + "learning_rate": 1.6894138232720912e-05, + "loss": 1.8182, + "step": 380 + }, + { + "epoch": 0.16480575306172104, + "grad_norm": 12.950959205627441, + "learning_rate": 1.688538932633421e-05, + "loss": 1.6691, + "step": 381 + }, + { + "epoch": 0.1652383140933791, + "grad_norm": 15.214534759521484, + "learning_rate": 1.6876640419947507e-05, + "loss": 1.7198, + "step": 382 + }, + { + "epoch": 0.16567087512503717, + "grad_norm": 14.145734786987305, + "learning_rate": 1.6867891513560806e-05, + "loss": 1.8248, + "step": 383 + }, + { + "epoch": 0.16610343615669523, + "grad_norm": 13.487668991088867, + "learning_rate": 1.6859142607174105e-05, + "loss": 1.7785, + "step": 384 + }, + { + "epoch": 0.1665359971883533, + "grad_norm": 15.348024368286133, + "learning_rate": 1.68503937007874e-05, + "loss": 1.6533, + "step": 385 + }, + { + "epoch": 0.16696855822001136, + "grad_norm": 13.304719924926758, + "learning_rate": 1.6841644794400704e-05, + "loss": 1.7462, + "step": 386 + }, + { + "epoch": 0.16740111925166942, + "grad_norm": 15.1539888381958, + "learning_rate": 1.6832895888014e-05, + "loss": 1.7745, + "step": 387 + }, + { + "epoch": 0.16783368028332749, + "grad_norm": 14.753242492675781, + "learning_rate": 1.68241469816273e-05, + "loss": 1.7696, + "step": 388 + }, + { + "epoch": 0.16826624131498555, + "grad_norm": 14.071645736694336, + "learning_rate": 1.6815398075240598e-05, + "loss": 1.7495, + "step": 389 + }, + { + "epoch": 0.16869880234664358, + "grad_norm": 13.17452335357666, + "learning_rate": 1.6806649168853894e-05, + "loss": 1.6654, + "step": 390 + }, + { + "epoch": 0.16913136337830165, + "grad_norm": 13.739713668823242, + "learning_rate": 1.6797900262467193e-05, + "loss": 1.871, + "step": 391 + }, + { + "epoch": 0.1695639244099597, + "grad_norm": 14.904608726501465, + "learning_rate": 1.6789151356080492e-05, + "loss": 1.7308, + "step": 392 + }, + { + "epoch": 0.16999648544161777, + "grad_norm": 13.76181697845459, + "learning_rate": 1.6780402449693788e-05, + "loss": 1.7218, + "step": 393 + }, + { + "epoch": 0.17042904647327584, + "grad_norm": 15.682334899902344, + "learning_rate": 1.6771653543307087e-05, + "loss": 1.7135, + "step": 394 + }, + { + "epoch": 0.1708616075049339, + "grad_norm": 14.770147323608398, + "learning_rate": 1.6762904636920386e-05, + "loss": 1.6647, + "step": 395 + }, + { + "epoch": 0.17129416853659196, + "grad_norm": 14.868924140930176, + "learning_rate": 1.6754155730533686e-05, + "loss": 1.8061, + "step": 396 + }, + { + "epoch": 0.17172672956825002, + "grad_norm": 13.417051315307617, + "learning_rate": 1.6745406824146985e-05, + "loss": 1.7588, + "step": 397 + }, + { + "epoch": 0.1721592905999081, + "grad_norm": 15.186949729919434, + "learning_rate": 1.673665791776028e-05, + "loss": 1.7662, + "step": 398 + }, + { + "epoch": 0.17259185163156615, + "grad_norm": 14.485387802124023, + "learning_rate": 1.672790901137358e-05, + "loss": 1.7135, + "step": 399 + }, + { + "epoch": 0.17302441266322421, + "grad_norm": 14.660444259643555, + "learning_rate": 1.671916010498688e-05, + "loss": 1.6872, + "step": 400 + }, + { + "epoch": 0.17345697369488225, + "grad_norm": 16.875911712646484, + "learning_rate": 1.6710411198600175e-05, + "loss": 1.6135, + "step": 401 + }, + { + "epoch": 0.1738895347265403, + "grad_norm": 15.121524810791016, + "learning_rate": 1.6701662292213474e-05, + "loss": 1.6156, + "step": 402 + }, + { + "epoch": 0.17432209575819838, + "grad_norm": 14.13774585723877, + "learning_rate": 1.6692913385826773e-05, + "loss": 1.7311, + "step": 403 + }, + { + "epoch": 0.17475465678985644, + "grad_norm": 15.118247032165527, + "learning_rate": 1.668416447944007e-05, + "loss": 1.7038, + "step": 404 + }, + { + "epoch": 0.1751872178215145, + "grad_norm": 17.146011352539062, + "learning_rate": 1.667541557305337e-05, + "loss": 1.6302, + "step": 405 + }, + { + "epoch": 0.17561977885317256, + "grad_norm": 13.198725700378418, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.7395, + "step": 406 + }, + { + "epoch": 0.17605233988483063, + "grad_norm": 14.39787483215332, + "learning_rate": 1.6657917760279967e-05, + "loss": 1.7715, + "step": 407 + }, + { + "epoch": 0.1764849009164887, + "grad_norm": 14.981849670410156, + "learning_rate": 1.6649168853893266e-05, + "loss": 1.6916, + "step": 408 + }, + { + "epoch": 0.17691746194814675, + "grad_norm": 14.908326148986816, + "learning_rate": 1.6640419947506562e-05, + "loss": 1.7563, + "step": 409 + }, + { + "epoch": 0.17735002297980482, + "grad_norm": 14.553082466125488, + "learning_rate": 1.663167104111986e-05, + "loss": 1.7202, + "step": 410 + }, + { + "epoch": 0.17778258401146288, + "grad_norm": 13.793340682983398, + "learning_rate": 1.662292213473316e-05, + "loss": 1.7678, + "step": 411 + }, + { + "epoch": 0.17821514504312091, + "grad_norm": 16.31795310974121, + "learning_rate": 1.6614173228346456e-05, + "loss": 1.8089, + "step": 412 + }, + { + "epoch": 0.17864770607477898, + "grad_norm": 17.318607330322266, + "learning_rate": 1.6605424321959755e-05, + "loss": 1.6696, + "step": 413 + }, + { + "epoch": 0.17908026710643704, + "grad_norm": 13.9689302444458, + "learning_rate": 1.6596675415573054e-05, + "loss": 1.7372, + "step": 414 + }, + { + "epoch": 0.1795128281380951, + "grad_norm": 14.510354995727539, + "learning_rate": 1.6587926509186354e-05, + "loss": 1.6597, + "step": 415 + }, + { + "epoch": 0.17994538916975317, + "grad_norm": 15.901954650878906, + "learning_rate": 1.6579177602799653e-05, + "loss": 1.7872, + "step": 416 + }, + { + "epoch": 0.18037795020141123, + "grad_norm": 13.021835327148438, + "learning_rate": 1.657042869641295e-05, + "loss": 1.6739, + "step": 417 + }, + { + "epoch": 0.1808105112330693, + "grad_norm": 14.94437313079834, + "learning_rate": 1.6561679790026248e-05, + "loss": 1.6915, + "step": 418 + }, + { + "epoch": 0.18124307226472736, + "grad_norm": 14.61279296875, + "learning_rate": 1.6552930883639547e-05, + "loss": 1.5963, + "step": 419 + }, + { + "epoch": 0.18167563329638542, + "grad_norm": 16.396753311157227, + "learning_rate": 1.6544181977252843e-05, + "loss": 1.702, + "step": 420 + }, + { + "epoch": 0.18210819432804348, + "grad_norm": 16.830820083618164, + "learning_rate": 1.6535433070866142e-05, + "loss": 1.8108, + "step": 421 + }, + { + "epoch": 0.18254075535970155, + "grad_norm": 15.551237106323242, + "learning_rate": 1.652668416447944e-05, + "loss": 1.7831, + "step": 422 + }, + { + "epoch": 0.18297331639135958, + "grad_norm": 15.525558471679688, + "learning_rate": 1.651793525809274e-05, + "loss": 1.6718, + "step": 423 + }, + { + "epoch": 0.18340587742301764, + "grad_norm": 12.82323169708252, + "learning_rate": 1.650918635170604e-05, + "loss": 1.642, + "step": 424 + }, + { + "epoch": 0.1838384384546757, + "grad_norm": 15.793924331665039, + "learning_rate": 1.650043744531934e-05, + "loss": 1.6871, + "step": 425 + }, + { + "epoch": 0.18427099948633377, + "grad_norm": 14.773061752319336, + "learning_rate": 1.6491688538932635e-05, + "loss": 1.6956, + "step": 426 + }, + { + "epoch": 0.18470356051799183, + "grad_norm": 16.91400909423828, + "learning_rate": 1.6482939632545934e-05, + "loss": 1.6336, + "step": 427 + }, + { + "epoch": 0.1851361215496499, + "grad_norm": 15.226611137390137, + "learning_rate": 1.6474190726159233e-05, + "loss": 1.7333, + "step": 428 + }, + { + "epoch": 0.18556868258130796, + "grad_norm": 16.962453842163086, + "learning_rate": 1.646544181977253e-05, + "loss": 1.6063, + "step": 429 + }, + { + "epoch": 0.18600124361296602, + "grad_norm": 16.48338508605957, + "learning_rate": 1.6456692913385828e-05, + "loss": 1.7043, + "step": 430 + }, + { + "epoch": 0.18643380464462408, + "grad_norm": 14.127233505249023, + "learning_rate": 1.6447944006999127e-05, + "loss": 1.709, + "step": 431 + }, + { + "epoch": 0.18686636567628215, + "grad_norm": 15.949320793151855, + "learning_rate": 1.6439195100612423e-05, + "loss": 1.7161, + "step": 432 + }, + { + "epoch": 0.1872989267079402, + "grad_norm": 15.113992691040039, + "learning_rate": 1.6430446194225722e-05, + "loss": 1.7378, + "step": 433 + }, + { + "epoch": 0.18773148773959825, + "grad_norm": 15.371466636657715, + "learning_rate": 1.642169728783902e-05, + "loss": 1.6978, + "step": 434 + }, + { + "epoch": 0.1881640487712563, + "grad_norm": 15.309712409973145, + "learning_rate": 1.641294838145232e-05, + "loss": 1.7107, + "step": 435 + }, + { + "epoch": 0.18859660980291437, + "grad_norm": 14.199424743652344, + "learning_rate": 1.640419947506562e-05, + "loss": 1.6976, + "step": 436 + }, + { + "epoch": 0.18902917083457244, + "grad_norm": 16.60711097717285, + "learning_rate": 1.6395450568678916e-05, + "loss": 1.5963, + "step": 437 + }, + { + "epoch": 0.1894617318662305, + "grad_norm": 14.320137023925781, + "learning_rate": 1.6386701662292215e-05, + "loss": 1.7274, + "step": 438 + }, + { + "epoch": 0.18989429289788856, + "grad_norm": 15.285809516906738, + "learning_rate": 1.6377952755905514e-05, + "loss": 1.7584, + "step": 439 + }, + { + "epoch": 0.19032685392954662, + "grad_norm": 15.035262107849121, + "learning_rate": 1.636920384951881e-05, + "loss": 1.6926, + "step": 440 + }, + { + "epoch": 0.1907594149612047, + "grad_norm": 15.250046730041504, + "learning_rate": 1.636045494313211e-05, + "loss": 1.7165, + "step": 441 + }, + { + "epoch": 0.19119197599286275, + "grad_norm": 17.97435188293457, + "learning_rate": 1.635170603674541e-05, + "loss": 1.8039, + "step": 442 + }, + { + "epoch": 0.1916245370245208, + "grad_norm": 13.659225463867188, + "learning_rate": 1.6342957130358708e-05, + "loss": 1.6173, + "step": 443 + }, + { + "epoch": 0.19205709805617888, + "grad_norm": 15.350072860717773, + "learning_rate": 1.6334208223972007e-05, + "loss": 1.6974, + "step": 444 + }, + { + "epoch": 0.1924896590878369, + "grad_norm": 16.155649185180664, + "learning_rate": 1.6325459317585303e-05, + "loss": 1.6537, + "step": 445 + }, + { + "epoch": 0.19292222011949497, + "grad_norm": 14.439220428466797, + "learning_rate": 1.6316710411198602e-05, + "loss": 1.7225, + "step": 446 + }, + { + "epoch": 0.19335478115115304, + "grad_norm": 17.609535217285156, + "learning_rate": 1.63079615048119e-05, + "loss": 1.723, + "step": 447 + }, + { + "epoch": 0.1937873421828111, + "grad_norm": 17.415800094604492, + "learning_rate": 1.6299212598425197e-05, + "loss": 1.636, + "step": 448 + }, + { + "epoch": 0.19421990321446916, + "grad_norm": 15.260915756225586, + "learning_rate": 1.6290463692038496e-05, + "loss": 1.6597, + "step": 449 + }, + { + "epoch": 0.19465246424612723, + "grad_norm": 16.445232391357422, + "learning_rate": 1.6281714785651795e-05, + "loss": 1.7469, + "step": 450 + }, + { + "epoch": 0.1950850252777853, + "grad_norm": 16.05836296081543, + "learning_rate": 1.627296587926509e-05, + "loss": 1.651, + "step": 451 + }, + { + "epoch": 0.19551758630944335, + "grad_norm": 14.436004638671875, + "learning_rate": 1.626421697287839e-05, + "loss": 1.6635, + "step": 452 + }, + { + "epoch": 0.19595014734110142, + "grad_norm": 14.610448837280273, + "learning_rate": 1.625546806649169e-05, + "loss": 1.8475, + "step": 453 + }, + { + "epoch": 0.19638270837275948, + "grad_norm": 16.329818725585938, + "learning_rate": 1.624671916010499e-05, + "loss": 1.7341, + "step": 454 + }, + { + "epoch": 0.19681526940441754, + "grad_norm": 15.847041130065918, + "learning_rate": 1.6237970253718288e-05, + "loss": 1.721, + "step": 455 + }, + { + "epoch": 0.19724783043607558, + "grad_norm": 13.708486557006836, + "learning_rate": 1.6229221347331584e-05, + "loss": 1.7186, + "step": 456 + }, + { + "epoch": 0.19768039146773364, + "grad_norm": 15.195647239685059, + "learning_rate": 1.6220472440944883e-05, + "loss": 1.7061, + "step": 457 + }, + { + "epoch": 0.1981129524993917, + "grad_norm": 16.739809036254883, + "learning_rate": 1.6211723534558182e-05, + "loss": 1.6949, + "step": 458 + }, + { + "epoch": 0.19854551353104977, + "grad_norm": 15.860810279846191, + "learning_rate": 1.6202974628171478e-05, + "loss": 1.654, + "step": 459 + }, + { + "epoch": 0.19897807456270783, + "grad_norm": 16.588729858398438, + "learning_rate": 1.6194225721784777e-05, + "loss": 1.6778, + "step": 460 + }, + { + "epoch": 0.1994106355943659, + "grad_norm": 15.871442794799805, + "learning_rate": 1.6185476815398076e-05, + "loss": 1.7289, + "step": 461 + }, + { + "epoch": 0.19984319662602396, + "grad_norm": 15.373008728027344, + "learning_rate": 1.6176727909011372e-05, + "loss": 1.6259, + "step": 462 + }, + { + "epoch": 0.20027575765768202, + "grad_norm": 14.221921920776367, + "learning_rate": 1.6167979002624675e-05, + "loss": 1.6892, + "step": 463 + }, + { + "epoch": 0.20070831868934008, + "grad_norm": 17.429065704345703, + "learning_rate": 1.615923009623797e-05, + "loss": 1.724, + "step": 464 + }, + { + "epoch": 0.20114087972099814, + "grad_norm": 20.687255859375, + "learning_rate": 1.615048118985127e-05, + "loss": 1.6504, + "step": 465 + }, + { + "epoch": 0.2015734407526562, + "grad_norm": 15.03296947479248, + "learning_rate": 1.614173228346457e-05, + "loss": 1.6124, + "step": 466 + }, + { + "epoch": 0.20200600178431424, + "grad_norm": 16.98514175415039, + "learning_rate": 1.6132983377077865e-05, + "loss": 1.78, + "step": 467 + }, + { + "epoch": 0.2024385628159723, + "grad_norm": 14.70289134979248, + "learning_rate": 1.6124234470691164e-05, + "loss": 1.7044, + "step": 468 + }, + { + "epoch": 0.20287112384763037, + "grad_norm": 14.248015403747559, + "learning_rate": 1.6115485564304463e-05, + "loss": 1.7596, + "step": 469 + }, + { + "epoch": 0.20330368487928843, + "grad_norm": 15.52908706665039, + "learning_rate": 1.6106736657917763e-05, + "loss": 1.7105, + "step": 470 + }, + { + "epoch": 0.2037362459109465, + "grad_norm": 18.255674362182617, + "learning_rate": 1.609798775153106e-05, + "loss": 1.7161, + "step": 471 + }, + { + "epoch": 0.20416880694260456, + "grad_norm": 22.3636474609375, + "learning_rate": 1.608923884514436e-05, + "loss": 1.5983, + "step": 472 + }, + { + "epoch": 0.20460136797426262, + "grad_norm": 17.28295135498047, + "learning_rate": 1.6080489938757657e-05, + "loss": 1.6615, + "step": 473 + }, + { + "epoch": 0.20503392900592068, + "grad_norm": 13.663941383361816, + "learning_rate": 1.6071741032370956e-05, + "loss": 1.6207, + "step": 474 + }, + { + "epoch": 0.20546649003757875, + "grad_norm": 14.511791229248047, + "learning_rate": 1.6062992125984255e-05, + "loss": 1.7379, + "step": 475 + }, + { + "epoch": 0.2058990510692368, + "grad_norm": 15.742350578308105, + "learning_rate": 1.605424321959755e-05, + "loss": 1.7579, + "step": 476 + }, + { + "epoch": 0.20633161210089487, + "grad_norm": 15.552555084228516, + "learning_rate": 1.604549431321085e-05, + "loss": 1.641, + "step": 477 + }, + { + "epoch": 0.2067641731325529, + "grad_norm": 21.889766693115234, + "learning_rate": 1.603674540682415e-05, + "loss": 1.7403, + "step": 478 + }, + { + "epoch": 0.20719673416421097, + "grad_norm": 20.306562423706055, + "learning_rate": 1.6027996500437445e-05, + "loss": 1.7009, + "step": 479 + }, + { + "epoch": 0.20762929519586903, + "grad_norm": 18.12723731994629, + "learning_rate": 1.6019247594050744e-05, + "loss": 1.6979, + "step": 480 + }, + { + "epoch": 0.2080618562275271, + "grad_norm": 14.94935417175293, + "learning_rate": 1.6010498687664044e-05, + "loss": 1.6484, + "step": 481 + }, + { + "epoch": 0.20849441725918516, + "grad_norm": 14.743106842041016, + "learning_rate": 1.6001749781277343e-05, + "loss": 1.6963, + "step": 482 + }, + { + "epoch": 0.20892697829084322, + "grad_norm": 14.982324600219727, + "learning_rate": 1.5993000874890642e-05, + "loss": 1.6868, + "step": 483 + }, + { + "epoch": 0.2093595393225013, + "grad_norm": 16.85517692565918, + "learning_rate": 1.5984251968503938e-05, + "loss": 1.5787, + "step": 484 + }, + { + "epoch": 0.20979210035415935, + "grad_norm": 15.209792137145996, + "learning_rate": 1.5975503062117237e-05, + "loss": 1.7086, + "step": 485 + }, + { + "epoch": 0.2102246613858174, + "grad_norm": 14.563628196716309, + "learning_rate": 1.5966754155730536e-05, + "loss": 1.7039, + "step": 486 + }, + { + "epoch": 0.21065722241747548, + "grad_norm": 15.1539945602417, + "learning_rate": 1.5958005249343832e-05, + "loss": 1.6866, + "step": 487 + }, + { + "epoch": 0.21108978344913354, + "grad_norm": 19.112892150878906, + "learning_rate": 1.594925634295713e-05, + "loss": 1.5955, + "step": 488 + }, + { + "epoch": 0.21152234448079157, + "grad_norm": 17.022937774658203, + "learning_rate": 1.594050743657043e-05, + "loss": 1.7138, + "step": 489 + }, + { + "epoch": 0.21195490551244964, + "grad_norm": 18.67705535888672, + "learning_rate": 1.5931758530183726e-05, + "loss": 1.7335, + "step": 490 + }, + { + "epoch": 0.2123874665441077, + "grad_norm": 17.58185577392578, + "learning_rate": 1.592300962379703e-05, + "loss": 1.6535, + "step": 491 + }, + { + "epoch": 0.21282002757576576, + "grad_norm": 16.525270462036133, + "learning_rate": 1.5914260717410325e-05, + "loss": 1.6493, + "step": 492 + }, + { + "epoch": 0.21325258860742383, + "grad_norm": 18.45403289794922, + "learning_rate": 1.5905511811023624e-05, + "loss": 1.6937, + "step": 493 + }, + { + "epoch": 0.2136851496390819, + "grad_norm": 18.30523681640625, + "learning_rate": 1.5896762904636923e-05, + "loss": 1.6143, + "step": 494 + }, + { + "epoch": 0.21411771067073995, + "grad_norm": 17.691091537475586, + "learning_rate": 1.588801399825022e-05, + "loss": 1.6493, + "step": 495 + }, + { + "epoch": 0.21455027170239802, + "grad_norm": 15.201132774353027, + "learning_rate": 1.5879265091863518e-05, + "loss": 1.6753, + "step": 496 + }, + { + "epoch": 0.21498283273405608, + "grad_norm": 17.74352264404297, + "learning_rate": 1.5870516185476817e-05, + "loss": 1.7362, + "step": 497 + }, + { + "epoch": 0.21541539376571414, + "grad_norm": 16.426006317138672, + "learning_rate": 1.5861767279090113e-05, + "loss": 1.5865, + "step": 498 + }, + { + "epoch": 0.2158479547973722, + "grad_norm": 18.481365203857422, + "learning_rate": 1.5853018372703412e-05, + "loss": 1.7536, + "step": 499 + }, + { + "epoch": 0.21628051582903024, + "grad_norm": 16.491756439208984, + "learning_rate": 1.584426946631671e-05, + "loss": 1.7682, + "step": 500 + }, + { + "epoch": 0.2167130768606883, + "grad_norm": 15.862979888916016, + "learning_rate": 1.583552055993001e-05, + "loss": 1.6539, + "step": 501 + }, + { + "epoch": 0.21714563789234637, + "grad_norm": 14.549097061157227, + "learning_rate": 1.582677165354331e-05, + "loss": 1.7378, + "step": 502 + }, + { + "epoch": 0.21757819892400443, + "grad_norm": 14.149446487426758, + "learning_rate": 1.5818022747156606e-05, + "loss": 1.6905, + "step": 503 + }, + { + "epoch": 0.2180107599556625, + "grad_norm": 17.096704483032227, + "learning_rate": 1.5809273840769905e-05, + "loss": 1.5642, + "step": 504 + }, + { + "epoch": 0.21844332098732056, + "grad_norm": 17.932981491088867, + "learning_rate": 1.5800524934383204e-05, + "loss": 1.6426, + "step": 505 + }, + { + "epoch": 0.21887588201897862, + "grad_norm": 17.249406814575195, + "learning_rate": 1.57917760279965e-05, + "loss": 1.6455, + "step": 506 + }, + { + "epoch": 0.21930844305063668, + "grad_norm": 18.000295639038086, + "learning_rate": 1.57830271216098e-05, + "loss": 1.6118, + "step": 507 + }, + { + "epoch": 0.21974100408229474, + "grad_norm": 16.909530639648438, + "learning_rate": 1.57742782152231e-05, + "loss": 1.6894, + "step": 508 + }, + { + "epoch": 0.2201735651139528, + "grad_norm": 17.263492584228516, + "learning_rate": 1.5765529308836394e-05, + "loss": 1.7422, + "step": 509 + }, + { + "epoch": 0.22060612614561087, + "grad_norm": 16.53858184814453, + "learning_rate": 1.5756780402449694e-05, + "loss": 1.5943, + "step": 510 + }, + { + "epoch": 0.2210386871772689, + "grad_norm": 15.582964897155762, + "learning_rate": 1.5748031496062993e-05, + "loss": 1.568, + "step": 511 + }, + { + "epoch": 0.22147124820892697, + "grad_norm": 17.556089401245117, + "learning_rate": 1.5739282589676292e-05, + "loss": 1.7498, + "step": 512 + }, + { + "epoch": 0.22190380924058503, + "grad_norm": 16.844451904296875, + "learning_rate": 1.573053368328959e-05, + "loss": 1.6113, + "step": 513 + }, + { + "epoch": 0.2223363702722431, + "grad_norm": 16.041711807250977, + "learning_rate": 1.5721784776902887e-05, + "loss": 1.7614, + "step": 514 + }, + { + "epoch": 0.22276893130390116, + "grad_norm": 17.016220092773438, + "learning_rate": 1.5713035870516186e-05, + "loss": 1.6991, + "step": 515 + }, + { + "epoch": 0.22320149233555922, + "grad_norm": 16.927995681762695, + "learning_rate": 1.5704286964129485e-05, + "loss": 1.6816, + "step": 516 + }, + { + "epoch": 0.22363405336721728, + "grad_norm": 16.519311904907227, + "learning_rate": 1.5695538057742785e-05, + "loss": 1.5943, + "step": 517 + }, + { + "epoch": 0.22406661439887535, + "grad_norm": 18.04831314086914, + "learning_rate": 1.568678915135608e-05, + "loss": 1.7193, + "step": 518 + }, + { + "epoch": 0.2244991754305334, + "grad_norm": 18.500019073486328, + "learning_rate": 1.567804024496938e-05, + "loss": 1.6316, + "step": 519 + }, + { + "epoch": 0.22493173646219147, + "grad_norm": 18.94154930114746, + "learning_rate": 1.566929133858268e-05, + "loss": 1.6008, + "step": 520 + }, + { + "epoch": 0.22536429749384954, + "grad_norm": 15.78384780883789, + "learning_rate": 1.5660542432195978e-05, + "loss": 1.6177, + "step": 521 + }, + { + "epoch": 0.22579685852550757, + "grad_norm": 18.01935577392578, + "learning_rate": 1.5651793525809277e-05, + "loss": 1.7999, + "step": 522 + }, + { + "epoch": 0.22622941955716563, + "grad_norm": 19.498945236206055, + "learning_rate": 1.5643044619422573e-05, + "loss": 1.6587, + "step": 523 + }, + { + "epoch": 0.2266619805888237, + "grad_norm": 16.238536834716797, + "learning_rate": 1.5634295713035872e-05, + "loss": 1.6589, + "step": 524 + }, + { + "epoch": 0.22709454162048176, + "grad_norm": 14.878324508666992, + "learning_rate": 1.562554680664917e-05, + "loss": 1.692, + "step": 525 + }, + { + "epoch": 0.22752710265213982, + "grad_norm": 15.313729286193848, + "learning_rate": 1.5616797900262467e-05, + "loss": 1.6564, + "step": 526 + }, + { + "epoch": 0.2279596636837979, + "grad_norm": 14.713421821594238, + "learning_rate": 1.5608048993875766e-05, + "loss": 1.6997, + "step": 527 + }, + { + "epoch": 0.22839222471545595, + "grad_norm": 14.811202049255371, + "learning_rate": 1.5599300087489066e-05, + "loss": 1.7147, + "step": 528 + }, + { + "epoch": 0.228824785747114, + "grad_norm": 17.43622589111328, + "learning_rate": 1.559055118110236e-05, + "loss": 1.6092, + "step": 529 + }, + { + "epoch": 0.22925734677877208, + "grad_norm": 16.88450813293457, + "learning_rate": 1.5581802274715664e-05, + "loss": 1.6191, + "step": 530 + }, + { + "epoch": 0.22968990781043014, + "grad_norm": 17.206796646118164, + "learning_rate": 1.557305336832896e-05, + "loss": 1.7351, + "step": 531 + }, + { + "epoch": 0.2301224688420882, + "grad_norm": 18.76306915283203, + "learning_rate": 1.556430446194226e-05, + "loss": 1.5707, + "step": 532 + }, + { + "epoch": 0.23055502987374624, + "grad_norm": 15.88653564453125, + "learning_rate": 1.555555555555556e-05, + "loss": 1.6625, + "step": 533 + }, + { + "epoch": 0.2309875909054043, + "grad_norm": 17.676189422607422, + "learning_rate": 1.5546806649168854e-05, + "loss": 1.6742, + "step": 534 + }, + { + "epoch": 0.23142015193706236, + "grad_norm": 17.690826416015625, + "learning_rate": 1.5538057742782153e-05, + "loss": 1.6381, + "step": 535 + }, + { + "epoch": 0.23185271296872043, + "grad_norm": 16.335132598876953, + "learning_rate": 1.5529308836395453e-05, + "loss": 1.5988, + "step": 536 + }, + { + "epoch": 0.2322852740003785, + "grad_norm": 17.346731185913086, + "learning_rate": 1.552055993000875e-05, + "loss": 1.687, + "step": 537 + }, + { + "epoch": 0.23271783503203655, + "grad_norm": 16.654361724853516, + "learning_rate": 1.5511811023622048e-05, + "loss": 1.6375, + "step": 538 + }, + { + "epoch": 0.23315039606369461, + "grad_norm": 17.761816024780273, + "learning_rate": 1.5503062117235347e-05, + "loss": 1.6785, + "step": 539 + }, + { + "epoch": 0.23358295709535268, + "grad_norm": 16.973031997680664, + "learning_rate": 1.5494313210848646e-05, + "loss": 1.6814, + "step": 540 + }, + { + "epoch": 0.23401551812701074, + "grad_norm": 16.909561157226562, + "learning_rate": 1.5485564304461945e-05, + "loss": 1.6855, + "step": 541 + }, + { + "epoch": 0.2344480791586688, + "grad_norm": 17.03185272216797, + "learning_rate": 1.547681539807524e-05, + "loss": 1.6887, + "step": 542 + }, + { + "epoch": 0.23488064019032687, + "grad_norm": 17.3386173248291, + "learning_rate": 1.546806649168854e-05, + "loss": 1.6587, + "step": 543 + }, + { + "epoch": 0.2353132012219849, + "grad_norm": 16.984390258789062, + "learning_rate": 1.545931758530184e-05, + "loss": 1.5812, + "step": 544 + }, + { + "epoch": 0.23574576225364297, + "grad_norm": 16.079471588134766, + "learning_rate": 1.5450568678915135e-05, + "loss": 1.7236, + "step": 545 + }, + { + "epoch": 0.23617832328530103, + "grad_norm": 16.26591682434082, + "learning_rate": 1.5441819772528434e-05, + "loss": 1.6052, + "step": 546 + }, + { + "epoch": 0.2366108843169591, + "grad_norm": 17.542015075683594, + "learning_rate": 1.5433070866141734e-05, + "loss": 1.5683, + "step": 547 + }, + { + "epoch": 0.23704344534861715, + "grad_norm": 19.87151527404785, + "learning_rate": 1.542432195975503e-05, + "loss": 1.5891, + "step": 548 + }, + { + "epoch": 0.23747600638027522, + "grad_norm": 17.004043579101562, + "learning_rate": 1.5415573053368332e-05, + "loss": 1.6065, + "step": 549 + }, + { + "epoch": 0.23790856741193328, + "grad_norm": 16.69712257385254, + "learning_rate": 1.5406824146981628e-05, + "loss": 1.6858, + "step": 550 + }, + { + "epoch": 0.23834112844359134, + "grad_norm": 17.42692756652832, + "learning_rate": 1.5398075240594927e-05, + "loss": 1.7016, + "step": 551 + }, + { + "epoch": 0.2387736894752494, + "grad_norm": 15.988450050354004, + "learning_rate": 1.5389326334208226e-05, + "loss": 1.592, + "step": 552 + }, + { + "epoch": 0.23920625050690747, + "grad_norm": 16.765094757080078, + "learning_rate": 1.5380577427821522e-05, + "loss": 1.5969, + "step": 553 + }, + { + "epoch": 0.23963881153856553, + "grad_norm": 16.65338134765625, + "learning_rate": 1.537182852143482e-05, + "loss": 1.5488, + "step": 554 + }, + { + "epoch": 0.24007137257022357, + "grad_norm": 20.152639389038086, + "learning_rate": 1.536307961504812e-05, + "loss": 1.5912, + "step": 555 + }, + { + "epoch": 0.24050393360188163, + "grad_norm": 17.90691566467285, + "learning_rate": 1.5354330708661416e-05, + "loss": 1.7089, + "step": 556 + }, + { + "epoch": 0.2409364946335397, + "grad_norm": 16.137821197509766, + "learning_rate": 1.5345581802274716e-05, + "loss": 1.6125, + "step": 557 + }, + { + "epoch": 0.24136905566519776, + "grad_norm": 17.443849563598633, + "learning_rate": 1.5336832895888015e-05, + "loss": 1.7119, + "step": 558 + }, + { + "epoch": 0.24180161669685582, + "grad_norm": 19.99116325378418, + "learning_rate": 1.5328083989501314e-05, + "loss": 1.6325, + "step": 559 + }, + { + "epoch": 0.24223417772851388, + "grad_norm": 17.063501358032227, + "learning_rate": 1.5319335083114613e-05, + "loss": 1.6513, + "step": 560 + }, + { + "epoch": 0.24266673876017195, + "grad_norm": 16.421655654907227, + "learning_rate": 1.531058617672791e-05, + "loss": 1.683, + "step": 561 + }, + { + "epoch": 0.24309929979183, + "grad_norm": 20.072221755981445, + "learning_rate": 1.5301837270341208e-05, + "loss": 1.5857, + "step": 562 + }, + { + "epoch": 0.24353186082348807, + "grad_norm": 17.94641876220703, + "learning_rate": 1.5293088363954507e-05, + "loss": 1.7222, + "step": 563 + }, + { + "epoch": 0.24396442185514614, + "grad_norm": 18.8425235748291, + "learning_rate": 1.5284339457567807e-05, + "loss": 1.6236, + "step": 564 + }, + { + "epoch": 0.2443969828868042, + "grad_norm": 16.484027862548828, + "learning_rate": 1.5275590551181102e-05, + "loss": 1.6454, + "step": 565 + }, + { + "epoch": 0.24482954391846223, + "grad_norm": 16.227195739746094, + "learning_rate": 1.52668416447944e-05, + "loss": 1.5695, + "step": 566 + }, + { + "epoch": 0.2452621049501203, + "grad_norm": 17.701627731323242, + "learning_rate": 1.52580927384077e-05, + "loss": 1.723, + "step": 567 + }, + { + "epoch": 0.24569466598177836, + "grad_norm": 19.588029861450195, + "learning_rate": 1.5249343832021e-05, + "loss": 1.6525, + "step": 568 + }, + { + "epoch": 0.24612722701343642, + "grad_norm": 16.687175750732422, + "learning_rate": 1.5240594925634298e-05, + "loss": 1.6708, + "step": 569 + }, + { + "epoch": 0.24655978804509449, + "grad_norm": 16.77758026123047, + "learning_rate": 1.5231846019247595e-05, + "loss": 1.6512, + "step": 570 + }, + { + "epoch": 0.24699234907675255, + "grad_norm": 16.585819244384766, + "learning_rate": 1.5223097112860894e-05, + "loss": 1.5813, + "step": 571 + }, + { + "epoch": 0.2474249101084106, + "grad_norm": 18.844066619873047, + "learning_rate": 1.5214348206474192e-05, + "loss": 1.6259, + "step": 572 + }, + { + "epoch": 0.24785747114006867, + "grad_norm": 18.0266170501709, + "learning_rate": 1.520559930008749e-05, + "loss": 1.5875, + "step": 573 + }, + { + "epoch": 0.24829003217172674, + "grad_norm": 18.513761520385742, + "learning_rate": 1.5196850393700789e-05, + "loss": 1.6878, + "step": 574 + }, + { + "epoch": 0.2487225932033848, + "grad_norm": 16.963220596313477, + "learning_rate": 1.5188101487314086e-05, + "loss": 1.7002, + "step": 575 + }, + { + "epoch": 0.24915515423504286, + "grad_norm": 16.365562438964844, + "learning_rate": 1.5179352580927385e-05, + "loss": 1.6331, + "step": 576 + }, + { + "epoch": 0.2495877152667009, + "grad_norm": 18.376493453979492, + "learning_rate": 1.5170603674540683e-05, + "loss": 1.5584, + "step": 577 + }, + { + "epoch": 0.250020276298359, + "grad_norm": 15.05932903289795, + "learning_rate": 1.5161854768153984e-05, + "loss": 1.6453, + "step": 578 + }, + { + "epoch": 0.25045283733001705, + "grad_norm": 16.900829315185547, + "learning_rate": 1.5153105861767281e-05, + "loss": 1.6164, + "step": 579 + }, + { + "epoch": 0.2508853983616751, + "grad_norm": 18.5715389251709, + "learning_rate": 1.5144356955380579e-05, + "loss": 1.7586, + "step": 580 + }, + { + "epoch": 0.2513179593933332, + "grad_norm": 16.25688934326172, + "learning_rate": 1.5135608048993878e-05, + "loss": 1.5743, + "step": 581 + }, + { + "epoch": 0.2517505204249912, + "grad_norm": 16.32343292236328, + "learning_rate": 1.5126859142607175e-05, + "loss": 1.6409, + "step": 582 + }, + { + "epoch": 0.25218308145664925, + "grad_norm": 18.865724563598633, + "learning_rate": 1.5118110236220473e-05, + "loss": 1.5924, + "step": 583 + }, + { + "epoch": 0.2526156424883073, + "grad_norm": 16.306575775146484, + "learning_rate": 1.5109361329833772e-05, + "loss": 1.6264, + "step": 584 + }, + { + "epoch": 0.2530482035199654, + "grad_norm": 19.097639083862305, + "learning_rate": 1.510061242344707e-05, + "loss": 1.6059, + "step": 585 + }, + { + "epoch": 0.25348076455162344, + "grad_norm": 16.674352645874023, + "learning_rate": 1.5091863517060367e-05, + "loss": 1.7933, + "step": 586 + }, + { + "epoch": 0.2539133255832815, + "grad_norm": 19.233652114868164, + "learning_rate": 1.5083114610673668e-05, + "loss": 1.5857, + "step": 587 + }, + { + "epoch": 0.25434588661493956, + "grad_norm": 18.98441505432129, + "learning_rate": 1.5074365704286966e-05, + "loss": 1.709, + "step": 588 + }, + { + "epoch": 0.25477844764659763, + "grad_norm": 18.186471939086914, + "learning_rate": 1.5065616797900265e-05, + "loss": 1.6863, + "step": 589 + }, + { + "epoch": 0.2552110086782557, + "grad_norm": 18.54801368713379, + "learning_rate": 1.5056867891513562e-05, + "loss": 1.6825, + "step": 590 + }, + { + "epoch": 0.25564356970991375, + "grad_norm": 16.038410186767578, + "learning_rate": 1.504811898512686e-05, + "loss": 1.6853, + "step": 591 + }, + { + "epoch": 0.2560761307415718, + "grad_norm": 17.528871536254883, + "learning_rate": 1.5039370078740159e-05, + "loss": 1.7077, + "step": 592 + }, + { + "epoch": 0.2565086917732299, + "grad_norm": 15.960848808288574, + "learning_rate": 1.5030621172353457e-05, + "loss": 1.5823, + "step": 593 + }, + { + "epoch": 0.25694125280488794, + "grad_norm": 18.292795181274414, + "learning_rate": 1.5021872265966754e-05, + "loss": 1.5904, + "step": 594 + }, + { + "epoch": 0.257373813836546, + "grad_norm": 18.43050193786621, + "learning_rate": 1.5013123359580053e-05, + "loss": 1.6279, + "step": 595 + }, + { + "epoch": 0.25780637486820407, + "grad_norm": 18.099613189697266, + "learning_rate": 1.500437445319335e-05, + "loss": 1.6725, + "step": 596 + }, + { + "epoch": 0.25823893589986213, + "grad_norm": 17.15315055847168, + "learning_rate": 1.4995625546806652e-05, + "loss": 1.6643, + "step": 597 + }, + { + "epoch": 0.2586714969315202, + "grad_norm": 15.87186050415039, + "learning_rate": 1.498687664041995e-05, + "loss": 1.6733, + "step": 598 + }, + { + "epoch": 0.25910405796317826, + "grad_norm": 15.417596817016602, + "learning_rate": 1.4978127734033248e-05, + "loss": 1.743, + "step": 599 + }, + { + "epoch": 0.2595366189948363, + "grad_norm": 16.56854248046875, + "learning_rate": 1.4969378827646546e-05, + "loss": 1.7008, + "step": 600 + }, + { + "epoch": 0.2599691800264944, + "grad_norm": 16.416122436523438, + "learning_rate": 1.4960629921259843e-05, + "loss": 1.7122, + "step": 601 + }, + { + "epoch": 0.26040174105815245, + "grad_norm": 16.724794387817383, + "learning_rate": 1.4951881014873143e-05, + "loss": 1.7211, + "step": 602 + }, + { + "epoch": 0.2608343020898105, + "grad_norm": 16.47406768798828, + "learning_rate": 1.494313210848644e-05, + "loss": 1.6575, + "step": 603 + }, + { + "epoch": 0.2612668631214685, + "grad_norm": 18.724267959594727, + "learning_rate": 1.4934383202099738e-05, + "loss": 1.6323, + "step": 604 + }, + { + "epoch": 0.2616994241531266, + "grad_norm": 17.24701499938965, + "learning_rate": 1.4925634295713037e-05, + "loss": 1.7832, + "step": 605 + }, + { + "epoch": 0.26213198518478464, + "grad_norm": 18.684057235717773, + "learning_rate": 1.4916885389326334e-05, + "loss": 1.5967, + "step": 606 + }, + { + "epoch": 0.2625645462164427, + "grad_norm": 16.600284576416016, + "learning_rate": 1.4908136482939635e-05, + "loss": 1.5627, + "step": 607 + }, + { + "epoch": 0.26299710724810077, + "grad_norm": 17.85321044921875, + "learning_rate": 1.4899387576552933e-05, + "loss": 1.7005, + "step": 608 + }, + { + "epoch": 0.26342966827975883, + "grad_norm": 16.918190002441406, + "learning_rate": 1.489063867016623e-05, + "loss": 1.6876, + "step": 609 + }, + { + "epoch": 0.2638622293114169, + "grad_norm": 16.909236907958984, + "learning_rate": 1.488188976377953e-05, + "loss": 1.5946, + "step": 610 + }, + { + "epoch": 0.26429479034307496, + "grad_norm": 17.064176559448242, + "learning_rate": 1.4873140857392827e-05, + "loss": 1.7075, + "step": 611 + }, + { + "epoch": 0.264727351374733, + "grad_norm": 19.011877059936523, + "learning_rate": 1.4864391951006125e-05, + "loss": 1.765, + "step": 612 + }, + { + "epoch": 0.2651599124063911, + "grad_norm": 17.99385643005371, + "learning_rate": 1.4855643044619424e-05, + "loss": 1.6925, + "step": 613 + }, + { + "epoch": 0.26559247343804915, + "grad_norm": 17.17367172241211, + "learning_rate": 1.4846894138232721e-05, + "loss": 1.5354, + "step": 614 + }, + { + "epoch": 0.2660250344697072, + "grad_norm": 19.10651206970215, + "learning_rate": 1.4838145231846019e-05, + "loss": 1.5447, + "step": 615 + }, + { + "epoch": 0.2664575955013653, + "grad_norm": 18.000473022460938, + "learning_rate": 1.482939632545932e-05, + "loss": 1.5983, + "step": 616 + }, + { + "epoch": 0.26689015653302334, + "grad_norm": 17.769773483276367, + "learning_rate": 1.4820647419072617e-05, + "loss": 1.6972, + "step": 617 + }, + { + "epoch": 0.2673227175646814, + "grad_norm": 17.360748291015625, + "learning_rate": 1.4811898512685916e-05, + "loss": 1.6898, + "step": 618 + }, + { + "epoch": 0.26775527859633946, + "grad_norm": 18.90128517150879, + "learning_rate": 1.4803149606299214e-05, + "loss": 1.5293, + "step": 619 + }, + { + "epoch": 0.2681878396279975, + "grad_norm": 18.458005905151367, + "learning_rate": 1.4794400699912513e-05, + "loss": 1.6708, + "step": 620 + }, + { + "epoch": 0.2686204006596556, + "grad_norm": 16.414493560791016, + "learning_rate": 1.478565179352581e-05, + "loss": 1.5625, + "step": 621 + }, + { + "epoch": 0.26905296169131365, + "grad_norm": 17.863723754882812, + "learning_rate": 1.4776902887139108e-05, + "loss": 1.6055, + "step": 622 + }, + { + "epoch": 0.2694855227229717, + "grad_norm": 19.638256072998047, + "learning_rate": 1.4768153980752407e-05, + "loss": 1.6344, + "step": 623 + }, + { + "epoch": 0.2699180837546298, + "grad_norm": 19.117496490478516, + "learning_rate": 1.4759405074365705e-05, + "loss": 1.6323, + "step": 624 + }, + { + "epoch": 0.27035064478628784, + "grad_norm": 17.23756217956543, + "learning_rate": 1.4750656167979002e-05, + "loss": 1.6586, + "step": 625 + }, + { + "epoch": 0.27078320581794585, + "grad_norm": 17.25014877319336, + "learning_rate": 1.4741907261592303e-05, + "loss": 1.6476, + "step": 626 + }, + { + "epoch": 0.2712157668496039, + "grad_norm": 22.015825271606445, + "learning_rate": 1.47331583552056e-05, + "loss": 1.5858, + "step": 627 + }, + { + "epoch": 0.271648327881262, + "grad_norm": 19.36150360107422, + "learning_rate": 1.47244094488189e-05, + "loss": 1.6196, + "step": 628 + }, + { + "epoch": 0.27208088891292004, + "grad_norm": 17.298595428466797, + "learning_rate": 1.4715660542432198e-05, + "loss": 1.6995, + "step": 629 + }, + { + "epoch": 0.2725134499445781, + "grad_norm": 17.826095581054688, + "learning_rate": 1.4706911636045495e-05, + "loss": 1.7035, + "step": 630 + }, + { + "epoch": 0.27294601097623616, + "grad_norm": 17.059179306030273, + "learning_rate": 1.4698162729658794e-05, + "loss": 1.603, + "step": 631 + }, + { + "epoch": 0.2733785720078942, + "grad_norm": 18.210107803344727, + "learning_rate": 1.4689413823272092e-05, + "loss": 1.7385, + "step": 632 + }, + { + "epoch": 0.2738111330395523, + "grad_norm": 18.413015365600586, + "learning_rate": 1.468066491688539e-05, + "loss": 1.588, + "step": 633 + }, + { + "epoch": 0.27424369407121035, + "grad_norm": 17.428800582885742, + "learning_rate": 1.4671916010498688e-05, + "loss": 1.673, + "step": 634 + }, + { + "epoch": 0.2746762551028684, + "grad_norm": 17.01789665222168, + "learning_rate": 1.4663167104111988e-05, + "loss": 1.6085, + "step": 635 + }, + { + "epoch": 0.2751088161345265, + "grad_norm": 16.585988998413086, + "learning_rate": 1.4654418197725287e-05, + "loss": 1.7314, + "step": 636 + }, + { + "epoch": 0.27554137716618454, + "grad_norm": 18.863544464111328, + "learning_rate": 1.4645669291338584e-05, + "loss": 1.685, + "step": 637 + }, + { + "epoch": 0.2759739381978426, + "grad_norm": 18.15181541442871, + "learning_rate": 1.4636920384951882e-05, + "loss": 1.7304, + "step": 638 + }, + { + "epoch": 0.27640649922950067, + "grad_norm": 18.61130142211914, + "learning_rate": 1.4628171478565181e-05, + "loss": 1.5327, + "step": 639 + }, + { + "epoch": 0.27683906026115873, + "grad_norm": 17.54844856262207, + "learning_rate": 1.4619422572178479e-05, + "loss": 1.6265, + "step": 640 + }, + { + "epoch": 0.2772716212928168, + "grad_norm": 17.166112899780273, + "learning_rate": 1.4610673665791776e-05, + "loss": 1.6124, + "step": 641 + }, + { + "epoch": 0.27770418232447486, + "grad_norm": 19.315031051635742, + "learning_rate": 1.4601924759405075e-05, + "loss": 1.5918, + "step": 642 + }, + { + "epoch": 0.2781367433561329, + "grad_norm": 18.814680099487305, + "learning_rate": 1.4593175853018373e-05, + "loss": 1.5706, + "step": 643 + }, + { + "epoch": 0.278569304387791, + "grad_norm": 17.746023178100586, + "learning_rate": 1.4584426946631672e-05, + "loss": 1.7087, + "step": 644 + }, + { + "epoch": 0.27900186541944905, + "grad_norm": 21.044361114501953, + "learning_rate": 1.4575678040244971e-05, + "loss": 1.6326, + "step": 645 + }, + { + "epoch": 0.2794344264511071, + "grad_norm": 20.996479034423828, + "learning_rate": 1.456692913385827e-05, + "loss": 1.5941, + "step": 646 + }, + { + "epoch": 0.2798669874827652, + "grad_norm": 18.970081329345703, + "learning_rate": 1.4558180227471568e-05, + "loss": 1.6649, + "step": 647 + }, + { + "epoch": 0.2802995485144232, + "grad_norm": 18.267818450927734, + "learning_rate": 1.4549431321084865e-05, + "loss": 1.6545, + "step": 648 + }, + { + "epoch": 0.28073210954608124, + "grad_norm": 16.575632095336914, + "learning_rate": 1.4540682414698165e-05, + "loss": 1.6283, + "step": 649 + }, + { + "epoch": 0.2811646705777393, + "grad_norm": 17.574951171875, + "learning_rate": 1.4531933508311462e-05, + "loss": 1.664, + "step": 650 + }, + { + "epoch": 0.28159723160939737, + "grad_norm": 18.63271713256836, + "learning_rate": 1.452318460192476e-05, + "loss": 1.6736, + "step": 651 + }, + { + "epoch": 0.28202979264105543, + "grad_norm": 18.147533416748047, + "learning_rate": 1.4514435695538059e-05, + "loss": 1.6374, + "step": 652 + }, + { + "epoch": 0.2824623536727135, + "grad_norm": 17.428810119628906, + "learning_rate": 1.4505686789151356e-05, + "loss": 1.7221, + "step": 653 + }, + { + "epoch": 0.28289491470437156, + "grad_norm": 17.664213180541992, + "learning_rate": 1.4496937882764654e-05, + "loss": 1.6235, + "step": 654 + }, + { + "epoch": 0.2833274757360296, + "grad_norm": 18.84585952758789, + "learning_rate": 1.4488188976377955e-05, + "loss": 1.7157, + "step": 655 + }, + { + "epoch": 0.2837600367676877, + "grad_norm": 19.42424774169922, + "learning_rate": 1.4479440069991252e-05, + "loss": 1.6526, + "step": 656 + }, + { + "epoch": 0.28419259779934575, + "grad_norm": 18.132667541503906, + "learning_rate": 1.4470691163604552e-05, + "loss": 1.7109, + "step": 657 + }, + { + "epoch": 0.2846251588310038, + "grad_norm": 18.288928985595703, + "learning_rate": 1.4461942257217849e-05, + "loss": 1.6393, + "step": 658 + }, + { + "epoch": 0.2850577198626619, + "grad_norm": 15.85251522064209, + "learning_rate": 1.4453193350831147e-05, + "loss": 1.6375, + "step": 659 + }, + { + "epoch": 0.28549028089431994, + "grad_norm": 17.576475143432617, + "learning_rate": 1.4444444444444446e-05, + "loss": 1.6288, + "step": 660 + }, + { + "epoch": 0.285922841925978, + "grad_norm": 16.68917465209961, + "learning_rate": 1.4435695538057743e-05, + "loss": 1.5772, + "step": 661 + }, + { + "epoch": 0.28635540295763606, + "grad_norm": 19.292522430419922, + "learning_rate": 1.442694663167104e-05, + "loss": 1.6764, + "step": 662 + }, + { + "epoch": 0.2867879639892941, + "grad_norm": 19.660804748535156, + "learning_rate": 1.441819772528434e-05, + "loss": 1.7028, + "step": 663 + }, + { + "epoch": 0.2872205250209522, + "grad_norm": 17.166290283203125, + "learning_rate": 1.440944881889764e-05, + "loss": 1.5549, + "step": 664 + }, + { + "epoch": 0.28765308605261025, + "grad_norm": 15.878508567810059, + "learning_rate": 1.4400699912510938e-05, + "loss": 1.6366, + "step": 665 + }, + { + "epoch": 0.2880856470842683, + "grad_norm": 16.062185287475586, + "learning_rate": 1.4391951006124236e-05, + "loss": 1.6623, + "step": 666 + }, + { + "epoch": 0.2885182081159264, + "grad_norm": 18.430627822875977, + "learning_rate": 1.4383202099737535e-05, + "loss": 1.7102, + "step": 667 + }, + { + "epoch": 0.28895076914758444, + "grad_norm": 18.56020736694336, + "learning_rate": 1.4374453193350833e-05, + "loss": 1.6928, + "step": 668 + }, + { + "epoch": 0.2893833301792425, + "grad_norm": 77.00767517089844, + "learning_rate": 1.436570428696413e-05, + "loss": 1.6911, + "step": 669 + }, + { + "epoch": 0.2898158912109005, + "grad_norm": 18.834728240966797, + "learning_rate": 1.435695538057743e-05, + "loss": 1.6888, + "step": 670 + }, + { + "epoch": 0.2902484522425586, + "grad_norm": 17.199195861816406, + "learning_rate": 1.4348206474190727e-05, + "loss": 1.691, + "step": 671 + }, + { + "epoch": 0.29068101327421664, + "grad_norm": 18.331899642944336, + "learning_rate": 1.4339457567804024e-05, + "loss": 1.6973, + "step": 672 + }, + { + "epoch": 0.2911135743058747, + "grad_norm": 16.407562255859375, + "learning_rate": 1.4330708661417324e-05, + "loss": 1.6564, + "step": 673 + }, + { + "epoch": 0.29154613533753276, + "grad_norm": 18.343046188354492, + "learning_rate": 1.4321959755030623e-05, + "loss": 1.717, + "step": 674 + }, + { + "epoch": 0.2919786963691908, + "grad_norm": 19.638093948364258, + "learning_rate": 1.4313210848643922e-05, + "loss": 1.7019, + "step": 675 + }, + { + "epoch": 0.2924112574008489, + "grad_norm": 31.77922821044922, + "learning_rate": 1.430446194225722e-05, + "loss": 1.6391, + "step": 676 + }, + { + "epoch": 0.29284381843250695, + "grad_norm": 18.263328552246094, + "learning_rate": 1.4295713035870517e-05, + "loss": 1.5636, + "step": 677 + }, + { + "epoch": 0.293276379464165, + "grad_norm": 17.560848236083984, + "learning_rate": 1.4286964129483816e-05, + "loss": 1.6535, + "step": 678 + }, + { + "epoch": 0.2937089404958231, + "grad_norm": 18.677249908447266, + "learning_rate": 1.4278215223097114e-05, + "loss": 1.6181, + "step": 679 + }, + { + "epoch": 0.29414150152748114, + "grad_norm": 18.061887741088867, + "learning_rate": 1.4269466316710411e-05, + "loss": 1.5453, + "step": 680 + }, + { + "epoch": 0.2945740625591392, + "grad_norm": 17.87262535095215, + "learning_rate": 1.426071741032371e-05, + "loss": 1.6855, + "step": 681 + }, + { + "epoch": 0.29500662359079727, + "grad_norm": 18.203630447387695, + "learning_rate": 1.4251968503937008e-05, + "loss": 1.5759, + "step": 682 + }, + { + "epoch": 0.29543918462245533, + "grad_norm": 17.520793914794922, + "learning_rate": 1.4243219597550309e-05, + "loss": 1.5989, + "step": 683 + }, + { + "epoch": 0.2958717456541134, + "grad_norm": 19.39581298828125, + "learning_rate": 1.4234470691163606e-05, + "loss": 1.674, + "step": 684 + }, + { + "epoch": 0.29630430668577146, + "grad_norm": 18.8718318939209, + "learning_rate": 1.4225721784776904e-05, + "loss": 1.5593, + "step": 685 + }, + { + "epoch": 0.2967368677174295, + "grad_norm": 20.18855094909668, + "learning_rate": 1.4216972878390203e-05, + "loss": 1.5447, + "step": 686 + }, + { + "epoch": 0.2971694287490876, + "grad_norm": 18.366270065307617, + "learning_rate": 1.42082239720035e-05, + "loss": 1.5408, + "step": 687 + }, + { + "epoch": 0.29760198978074565, + "grad_norm": 20.94564437866211, + "learning_rate": 1.4199475065616798e-05, + "loss": 1.6166, + "step": 688 + }, + { + "epoch": 0.2980345508124037, + "grad_norm": 19.897859573364258, + "learning_rate": 1.4190726159230097e-05, + "loss": 1.5997, + "step": 689 + }, + { + "epoch": 0.29846711184406177, + "grad_norm": 18.334325790405273, + "learning_rate": 1.4181977252843395e-05, + "loss": 1.6338, + "step": 690 + }, + { + "epoch": 0.29889967287571984, + "grad_norm": 19.647480010986328, + "learning_rate": 1.4173228346456694e-05, + "loss": 1.6537, + "step": 691 + }, + { + "epoch": 0.29933223390737784, + "grad_norm": 16.615379333496094, + "learning_rate": 1.4164479440069992e-05, + "loss": 1.6717, + "step": 692 + }, + { + "epoch": 0.2997647949390359, + "grad_norm": 20.802907943725586, + "learning_rate": 1.4155730533683293e-05, + "loss": 1.6626, + "step": 693 + }, + { + "epoch": 0.30019735597069397, + "grad_norm": 17.72242546081543, + "learning_rate": 1.414698162729659e-05, + "loss": 1.6199, + "step": 694 + }, + { + "epoch": 0.30062991700235203, + "grad_norm": 17.221155166625977, + "learning_rate": 1.4138232720909888e-05, + "loss": 1.7148, + "step": 695 + }, + { + "epoch": 0.3010624780340101, + "grad_norm": 18.239665985107422, + "learning_rate": 1.4129483814523187e-05, + "loss": 1.654, + "step": 696 + }, + { + "epoch": 0.30149503906566816, + "grad_norm": 20.456558227539062, + "learning_rate": 1.4120734908136484e-05, + "loss": 1.6063, + "step": 697 + }, + { + "epoch": 0.3019276000973262, + "grad_norm": 18.604167938232422, + "learning_rate": 1.4111986001749782e-05, + "loss": 1.6348, + "step": 698 + }, + { + "epoch": 0.3023601611289843, + "grad_norm": 17.619157791137695, + "learning_rate": 1.4103237095363081e-05, + "loss": 1.5949, + "step": 699 + }, + { + "epoch": 0.30279272216064235, + "grad_norm": 17.160158157348633, + "learning_rate": 1.4094488188976379e-05, + "loss": 1.608, + "step": 700 + }, + { + "epoch": 0.3032252831923004, + "grad_norm": 17.992366790771484, + "learning_rate": 1.4085739282589676e-05, + "loss": 1.6816, + "step": 701 + }, + { + "epoch": 0.3036578442239585, + "grad_norm": 18.279861450195312, + "learning_rate": 1.4076990376202975e-05, + "loss": 1.5903, + "step": 702 + }, + { + "epoch": 0.30409040525561654, + "grad_norm": 17.513811111450195, + "learning_rate": 1.4068241469816274e-05, + "loss": 1.6141, + "step": 703 + }, + { + "epoch": 0.3045229662872746, + "grad_norm": 19.576257705688477, + "learning_rate": 1.4059492563429574e-05, + "loss": 1.5614, + "step": 704 + }, + { + "epoch": 0.30495552731893266, + "grad_norm": 17.985549926757812, + "learning_rate": 1.4050743657042871e-05, + "loss": 1.6339, + "step": 705 + }, + { + "epoch": 0.3053880883505907, + "grad_norm": 20.642486572265625, + "learning_rate": 1.4041994750656169e-05, + "loss": 1.5741, + "step": 706 + }, + { + "epoch": 0.3058206493822488, + "grad_norm": 18.464508056640625, + "learning_rate": 1.4033245844269468e-05, + "loss": 1.6104, + "step": 707 + }, + { + "epoch": 0.30625321041390685, + "grad_norm": 17.332229614257812, + "learning_rate": 1.4024496937882765e-05, + "loss": 1.6038, + "step": 708 + }, + { + "epoch": 0.3066857714455649, + "grad_norm": 21.05571937561035, + "learning_rate": 1.4015748031496063e-05, + "loss": 1.7033, + "step": 709 + }, + { + "epoch": 0.307118332477223, + "grad_norm": 18.811159133911133, + "learning_rate": 1.4006999125109362e-05, + "loss": 1.6677, + "step": 710 + }, + { + "epoch": 0.30755089350888104, + "grad_norm": 17.237346649169922, + "learning_rate": 1.399825021872266e-05, + "loss": 1.6081, + "step": 711 + }, + { + "epoch": 0.3079834545405391, + "grad_norm": 17.864810943603516, + "learning_rate": 1.398950131233596e-05, + "loss": 1.5892, + "step": 712 + }, + { + "epoch": 0.30841601557219717, + "grad_norm": 17.532041549682617, + "learning_rate": 1.3980752405949258e-05, + "loss": 1.5575, + "step": 713 + }, + { + "epoch": 0.3088485766038552, + "grad_norm": 18.569576263427734, + "learning_rate": 1.3972003499562557e-05, + "loss": 1.6348, + "step": 714 + }, + { + "epoch": 0.30928113763551324, + "grad_norm": 18.27239227294922, + "learning_rate": 1.3963254593175855e-05, + "loss": 1.6954, + "step": 715 + }, + { + "epoch": 0.3097136986671713, + "grad_norm": 20.6396484375, + "learning_rate": 1.3954505686789152e-05, + "loss": 1.6909, + "step": 716 + }, + { + "epoch": 0.31014625969882936, + "grad_norm": 19.61980628967285, + "learning_rate": 1.3945756780402451e-05, + "loss": 1.6499, + "step": 717 + }, + { + "epoch": 0.3105788207304874, + "grad_norm": 19.68534278869629, + "learning_rate": 1.3937007874015749e-05, + "loss": 1.5033, + "step": 718 + }, + { + "epoch": 0.3110113817621455, + "grad_norm": 22.719221115112305, + "learning_rate": 1.3928258967629047e-05, + "loss": 1.6344, + "step": 719 + }, + { + "epoch": 0.31144394279380355, + "grad_norm": 18.73038673400879, + "learning_rate": 1.3919510061242346e-05, + "loss": 1.6237, + "step": 720 + }, + { + "epoch": 0.3118765038254616, + "grad_norm": 17.38966941833496, + "learning_rate": 1.3910761154855643e-05, + "loss": 1.5169, + "step": 721 + }, + { + "epoch": 0.3123090648571197, + "grad_norm": 23.203266143798828, + "learning_rate": 1.3902012248468944e-05, + "loss": 1.6486, + "step": 722 + }, + { + "epoch": 0.31274162588877774, + "grad_norm": 20.895124435424805, + "learning_rate": 1.3893263342082242e-05, + "loss": 1.5868, + "step": 723 + }, + { + "epoch": 0.3131741869204358, + "grad_norm": 18.580778121948242, + "learning_rate": 1.388451443569554e-05, + "loss": 1.5986, + "step": 724 + }, + { + "epoch": 0.31360674795209387, + "grad_norm": 18.76534652709961, + "learning_rate": 1.3875765529308838e-05, + "loss": 1.5955, + "step": 725 + }, + { + "epoch": 0.31403930898375193, + "grad_norm": 19.820236206054688, + "learning_rate": 1.3867016622922136e-05, + "loss": 1.6397, + "step": 726 + }, + { + "epoch": 0.31447187001541, + "grad_norm": 18.899864196777344, + "learning_rate": 1.3858267716535433e-05, + "loss": 1.633, + "step": 727 + }, + { + "epoch": 0.31490443104706806, + "grad_norm": 19.738550186157227, + "learning_rate": 1.3849518810148733e-05, + "loss": 1.6484, + "step": 728 + }, + { + "epoch": 0.3153369920787261, + "grad_norm": 20.931293487548828, + "learning_rate": 1.384076990376203e-05, + "loss": 1.6189, + "step": 729 + }, + { + "epoch": 0.3157695531103842, + "grad_norm": 18.17987060546875, + "learning_rate": 1.3832020997375328e-05, + "loss": 1.6628, + "step": 730 + }, + { + "epoch": 0.31620211414204225, + "grad_norm": 19.147859573364258, + "learning_rate": 1.3823272090988629e-05, + "loss": 1.5953, + "step": 731 + }, + { + "epoch": 0.3166346751737003, + "grad_norm": 18.06837272644043, + "learning_rate": 1.3814523184601926e-05, + "loss": 1.5991, + "step": 732 + }, + { + "epoch": 0.31706723620535837, + "grad_norm": 17.540843963623047, + "learning_rate": 1.3805774278215225e-05, + "loss": 1.6227, + "step": 733 + }, + { + "epoch": 0.31749979723701643, + "grad_norm": 20.706098556518555, + "learning_rate": 1.3797025371828523e-05, + "loss": 1.5718, + "step": 734 + }, + { + "epoch": 0.3179323582686745, + "grad_norm": 17.51397132873535, + "learning_rate": 1.378827646544182e-05, + "loss": 1.6043, + "step": 735 + }, + { + "epoch": 0.3183649193003325, + "grad_norm": 20.64704132080078, + "learning_rate": 1.377952755905512e-05, + "loss": 1.5771, + "step": 736 + }, + { + "epoch": 0.31879748033199057, + "grad_norm": 20.126808166503906, + "learning_rate": 1.3770778652668417e-05, + "loss": 1.7199, + "step": 737 + }, + { + "epoch": 0.31923004136364863, + "grad_norm": 17.626785278320312, + "learning_rate": 1.3762029746281716e-05, + "loss": 1.6152, + "step": 738 + }, + { + "epoch": 0.3196626023953067, + "grad_norm": 19.35613250732422, + "learning_rate": 1.3753280839895014e-05, + "loss": 1.5903, + "step": 739 + }, + { + "epoch": 0.32009516342696476, + "grad_norm": 19.347549438476562, + "learning_rate": 1.3744531933508311e-05, + "loss": 1.5552, + "step": 740 + }, + { + "epoch": 0.3205277244586228, + "grad_norm": 22.455907821655273, + "learning_rate": 1.3735783027121612e-05, + "loss": 1.6141, + "step": 741 + }, + { + "epoch": 0.3209602854902809, + "grad_norm": 18.277362823486328, + "learning_rate": 1.372703412073491e-05, + "loss": 1.6331, + "step": 742 + }, + { + "epoch": 0.32139284652193895, + "grad_norm": 18.168556213378906, + "learning_rate": 1.3718285214348209e-05, + "loss": 1.7211, + "step": 743 + }, + { + "epoch": 0.321825407553597, + "grad_norm": 18.769269943237305, + "learning_rate": 1.3709536307961506e-05, + "loss": 1.5252, + "step": 744 + }, + { + "epoch": 0.3222579685852551, + "grad_norm": 22.99976348876953, + "learning_rate": 1.3700787401574804e-05, + "loss": 1.6573, + "step": 745 + }, + { + "epoch": 0.32269052961691314, + "grad_norm": 20.37944793701172, + "learning_rate": 1.3692038495188103e-05, + "loss": 1.6173, + "step": 746 + }, + { + "epoch": 0.3231230906485712, + "grad_norm": 22.109603881835938, + "learning_rate": 1.36832895888014e-05, + "loss": 1.6456, + "step": 747 + }, + { + "epoch": 0.32355565168022926, + "grad_norm": 17.85908317565918, + "learning_rate": 1.3674540682414698e-05, + "loss": 1.5592, + "step": 748 + }, + { + "epoch": 0.3239882127118873, + "grad_norm": 17.462928771972656, + "learning_rate": 1.3665791776027997e-05, + "loss": 1.5589, + "step": 749 + }, + { + "epoch": 0.3244207737435454, + "grad_norm": 17.60029411315918, + "learning_rate": 1.3657042869641295e-05, + "loss": 1.6156, + "step": 750 + }, + { + "epoch": 0.32485333477520345, + "grad_norm": 18.957937240600586, + "learning_rate": 1.3648293963254596e-05, + "loss": 1.7443, + "step": 751 + }, + { + "epoch": 0.3252858958068615, + "grad_norm": 22.84882926940918, + "learning_rate": 1.3639545056867893e-05, + "loss": 1.55, + "step": 752 + }, + { + "epoch": 0.3257184568385196, + "grad_norm": 20.238740921020508, + "learning_rate": 1.363079615048119e-05, + "loss": 1.5943, + "step": 753 + }, + { + "epoch": 0.32615101787017764, + "grad_norm": 17.576885223388672, + "learning_rate": 1.362204724409449e-05, + "loss": 1.6363, + "step": 754 + }, + { + "epoch": 0.3265835789018357, + "grad_norm": 20.89413070678711, + "learning_rate": 1.3613298337707787e-05, + "loss": 1.6913, + "step": 755 + }, + { + "epoch": 0.32701613993349377, + "grad_norm": 18.79582977294922, + "learning_rate": 1.3604549431321085e-05, + "loss": 1.6068, + "step": 756 + }, + { + "epoch": 0.32744870096515183, + "grad_norm": 16.650836944580078, + "learning_rate": 1.3595800524934384e-05, + "loss": 1.634, + "step": 757 + }, + { + "epoch": 0.32788126199680984, + "grad_norm": 17.64925765991211, + "learning_rate": 1.3587051618547682e-05, + "loss": 1.6472, + "step": 758 + }, + { + "epoch": 0.3283138230284679, + "grad_norm": 20.90283966064453, + "learning_rate": 1.357830271216098e-05, + "loss": 1.5816, + "step": 759 + }, + { + "epoch": 0.32874638406012596, + "grad_norm": 22.616561889648438, + "learning_rate": 1.356955380577428e-05, + "loss": 1.5406, + "step": 760 + }, + { + "epoch": 0.329178945091784, + "grad_norm": 18.393531799316406, + "learning_rate": 1.356080489938758e-05, + "loss": 1.5632, + "step": 761 + }, + { + "epoch": 0.3296115061234421, + "grad_norm": 17.92359733581543, + "learning_rate": 1.3552055993000877e-05, + "loss": 1.6438, + "step": 762 + }, + { + "epoch": 0.33004406715510015, + "grad_norm": 18.8277530670166, + "learning_rate": 1.3543307086614174e-05, + "loss": 1.607, + "step": 763 + }, + { + "epoch": 0.3304766281867582, + "grad_norm": 19.19496726989746, + "learning_rate": 1.3534558180227474e-05, + "loss": 1.6057, + "step": 764 + }, + { + "epoch": 0.3309091892184163, + "grad_norm": 19.613494873046875, + "learning_rate": 1.3525809273840771e-05, + "loss": 1.6079, + "step": 765 + }, + { + "epoch": 0.33134175025007434, + "grad_norm": 19.068227767944336, + "learning_rate": 1.3517060367454069e-05, + "loss": 1.5423, + "step": 766 + }, + { + "epoch": 0.3317743112817324, + "grad_norm": 20.14394760131836, + "learning_rate": 1.3508311461067368e-05, + "loss": 1.643, + "step": 767 + }, + { + "epoch": 0.33220687231339047, + "grad_norm": 20.151247024536133, + "learning_rate": 1.3499562554680665e-05, + "loss": 1.6013, + "step": 768 + }, + { + "epoch": 0.33263943334504853, + "grad_norm": 19.042261123657227, + "learning_rate": 1.3490813648293963e-05, + "loss": 1.5702, + "step": 769 + }, + { + "epoch": 0.3330719943767066, + "grad_norm": 18.928590774536133, + "learning_rate": 1.3482064741907264e-05, + "loss": 1.5902, + "step": 770 + }, + { + "epoch": 0.33350455540836466, + "grad_norm": 20.279457092285156, + "learning_rate": 1.3473315835520561e-05, + "loss": 1.6624, + "step": 771 + }, + { + "epoch": 0.3339371164400227, + "grad_norm": 19.55834197998047, + "learning_rate": 1.346456692913386e-05, + "loss": 1.5899, + "step": 772 + }, + { + "epoch": 0.3343696774716808, + "grad_norm": 19.354820251464844, + "learning_rate": 1.3455818022747158e-05, + "loss": 1.586, + "step": 773 + }, + { + "epoch": 0.33480223850333884, + "grad_norm": 21.938444137573242, + "learning_rate": 1.3447069116360455e-05, + "loss": 1.6159, + "step": 774 + }, + { + "epoch": 0.3352347995349969, + "grad_norm": 17.96108055114746, + "learning_rate": 1.3438320209973755e-05, + "loss": 1.6662, + "step": 775 + }, + { + "epoch": 0.33566736056665497, + "grad_norm": 18.945526123046875, + "learning_rate": 1.3429571303587052e-05, + "loss": 1.5927, + "step": 776 + }, + { + "epoch": 0.33609992159831303, + "grad_norm": 19.803407669067383, + "learning_rate": 1.342082239720035e-05, + "loss": 1.6594, + "step": 777 + }, + { + "epoch": 0.3365324826299711, + "grad_norm": 18.89844512939453, + "learning_rate": 1.3412073490813649e-05, + "loss": 1.6048, + "step": 778 + }, + { + "epoch": 0.33696504366162916, + "grad_norm": 20.112564086914062, + "learning_rate": 1.3403324584426948e-05, + "loss": 1.6187, + "step": 779 + }, + { + "epoch": 0.33739760469328717, + "grad_norm": 17.415624618530273, + "learning_rate": 1.3394575678040247e-05, + "loss": 1.5409, + "step": 780 + }, + { + "epoch": 0.33783016572494523, + "grad_norm": 16.357385635375977, + "learning_rate": 1.3385826771653545e-05, + "loss": 1.6128, + "step": 781 + }, + { + "epoch": 0.3382627267566033, + "grad_norm": 18.058176040649414, + "learning_rate": 1.3377077865266842e-05, + "loss": 1.5935, + "step": 782 + }, + { + "epoch": 0.33869528778826136, + "grad_norm": 19.19011688232422, + "learning_rate": 1.3368328958880142e-05, + "loss": 1.5617, + "step": 783 + }, + { + "epoch": 0.3391278488199194, + "grad_norm": 19.17144203186035, + "learning_rate": 1.3359580052493439e-05, + "loss": 1.6607, + "step": 784 + }, + { + "epoch": 0.3395604098515775, + "grad_norm": 18.61704444885254, + "learning_rate": 1.3350831146106738e-05, + "loss": 1.612, + "step": 785 + }, + { + "epoch": 0.33999297088323555, + "grad_norm": 20.21958351135254, + "learning_rate": 1.3342082239720036e-05, + "loss": 1.5809, + "step": 786 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 19.189098358154297, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.5807, + "step": 787 + }, + { + "epoch": 0.34085809294655167, + "grad_norm": 22.04606819152832, + "learning_rate": 1.3324584426946633e-05, + "loss": 1.6273, + "step": 788 + }, + { + "epoch": 0.34129065397820973, + "grad_norm": 19.73443031311035, + "learning_rate": 1.3315835520559932e-05, + "loss": 1.6006, + "step": 789 + }, + { + "epoch": 0.3417232150098678, + "grad_norm": 20.105873107910156, + "learning_rate": 1.3307086614173231e-05, + "loss": 1.573, + "step": 790 + }, + { + "epoch": 0.34215577604152586, + "grad_norm": 20.053709030151367, + "learning_rate": 1.3298337707786528e-05, + "loss": 1.5847, + "step": 791 + }, + { + "epoch": 0.3425883370731839, + "grad_norm": 17.113309860229492, + "learning_rate": 1.3289588801399826e-05, + "loss": 1.5184, + "step": 792 + }, + { + "epoch": 0.343020898104842, + "grad_norm": 17.02407455444336, + "learning_rate": 1.3280839895013125e-05, + "loss": 1.5491, + "step": 793 + }, + { + "epoch": 0.34345345913650005, + "grad_norm": 19.07799530029297, + "learning_rate": 1.3272090988626423e-05, + "loss": 1.6541, + "step": 794 + }, + { + "epoch": 0.3438860201681581, + "grad_norm": 19.018844604492188, + "learning_rate": 1.326334208223972e-05, + "loss": 1.5478, + "step": 795 + }, + { + "epoch": 0.3443185811998162, + "grad_norm": 18.546709060668945, + "learning_rate": 1.325459317585302e-05, + "loss": 1.6061, + "step": 796 + }, + { + "epoch": 0.34475114223147424, + "grad_norm": 20.176528930664062, + "learning_rate": 1.3245844269466317e-05, + "loss": 1.7127, + "step": 797 + }, + { + "epoch": 0.3451837032631323, + "grad_norm": 20.838176727294922, + "learning_rate": 1.3237095363079614e-05, + "loss": 1.6132, + "step": 798 + }, + { + "epoch": 0.34561626429479037, + "grad_norm": 17.911251068115234, + "learning_rate": 1.3228346456692915e-05, + "loss": 1.659, + "step": 799 + }, + { + "epoch": 0.34604882532644843, + "grad_norm": 20.308305740356445, + "learning_rate": 1.3219597550306213e-05, + "loss": 1.5575, + "step": 800 + }, + { + "epoch": 0.3464813863581065, + "grad_norm": 18.12563133239746, + "learning_rate": 1.3210848643919512e-05, + "loss": 1.7155, + "step": 801 + }, + { + "epoch": 0.3469139473897645, + "grad_norm": 18.247697830200195, + "learning_rate": 1.320209973753281e-05, + "loss": 1.6309, + "step": 802 + }, + { + "epoch": 0.34734650842142256, + "grad_norm": 20.070714950561523, + "learning_rate": 1.3193350831146107e-05, + "loss": 1.5833, + "step": 803 + }, + { + "epoch": 0.3477790694530806, + "grad_norm": 22.178564071655273, + "learning_rate": 1.3184601924759406e-05, + "loss": 1.6604, + "step": 804 + }, + { + "epoch": 0.3482116304847387, + "grad_norm": 19.786285400390625, + "learning_rate": 1.3175853018372704e-05, + "loss": 1.5634, + "step": 805 + }, + { + "epoch": 0.34864419151639675, + "grad_norm": 18.314327239990234, + "learning_rate": 1.3167104111986003e-05, + "loss": 1.657, + "step": 806 + }, + { + "epoch": 0.3490767525480548, + "grad_norm": 20.814380645751953, + "learning_rate": 1.31583552055993e-05, + "loss": 1.6143, + "step": 807 + }, + { + "epoch": 0.3495093135797129, + "grad_norm": 19.84009552001953, + "learning_rate": 1.3149606299212601e-05, + "loss": 1.6296, + "step": 808 + }, + { + "epoch": 0.34994187461137094, + "grad_norm": 18.69206428527832, + "learning_rate": 1.3140857392825899e-05, + "loss": 1.6647, + "step": 809 + }, + { + "epoch": 0.350374435643029, + "grad_norm": 19.76461410522461, + "learning_rate": 1.3132108486439196e-05, + "loss": 1.5763, + "step": 810 + }, + { + "epoch": 0.35080699667468707, + "grad_norm": 19.293476104736328, + "learning_rate": 1.3123359580052496e-05, + "loss": 1.5856, + "step": 811 + }, + { + "epoch": 0.35123955770634513, + "grad_norm": 20.5203914642334, + "learning_rate": 1.3114610673665793e-05, + "loss": 1.5266, + "step": 812 + }, + { + "epoch": 0.3516721187380032, + "grad_norm": 20.248397827148438, + "learning_rate": 1.310586176727909e-05, + "loss": 1.6702, + "step": 813 + }, + { + "epoch": 0.35210467976966126, + "grad_norm": 16.614885330200195, + "learning_rate": 1.309711286089239e-05, + "loss": 1.6621, + "step": 814 + }, + { + "epoch": 0.3525372408013193, + "grad_norm": 16.071029663085938, + "learning_rate": 1.3088363954505687e-05, + "loss": 1.6654, + "step": 815 + }, + { + "epoch": 0.3529698018329774, + "grad_norm": 18.72723960876465, + "learning_rate": 1.3079615048118985e-05, + "loss": 1.6519, + "step": 816 + }, + { + "epoch": 0.35340236286463544, + "grad_norm": 18.771820068359375, + "learning_rate": 1.3070866141732284e-05, + "loss": 1.688, + "step": 817 + }, + { + "epoch": 0.3538349238962935, + "grad_norm": 17.976329803466797, + "learning_rate": 1.3062117235345583e-05, + "loss": 1.6781, + "step": 818 + }, + { + "epoch": 0.35426748492795157, + "grad_norm": 19.907297134399414, + "learning_rate": 1.3053368328958883e-05, + "loss": 1.623, + "step": 819 + }, + { + "epoch": 0.35470004595960963, + "grad_norm": 19.37649154663086, + "learning_rate": 1.304461942257218e-05, + "loss": 1.6589, + "step": 820 + }, + { + "epoch": 0.3551326069912677, + "grad_norm": 21.129186630249023, + "learning_rate": 1.3035870516185478e-05, + "loss": 1.5147, + "step": 821 + }, + { + "epoch": 0.35556516802292576, + "grad_norm": 20.957626342773438, + "learning_rate": 1.3027121609798777e-05, + "loss": 1.6448, + "step": 822 + }, + { + "epoch": 0.3559977290545838, + "grad_norm": 21.617694854736328, + "learning_rate": 1.3018372703412074e-05, + "loss": 1.5604, + "step": 823 + }, + { + "epoch": 0.35643029008624183, + "grad_norm": 21.156492233276367, + "learning_rate": 1.3009623797025372e-05, + "loss": 1.6346, + "step": 824 + }, + { + "epoch": 0.3568628511178999, + "grad_norm": 20.160831451416016, + "learning_rate": 1.3000874890638671e-05, + "loss": 1.5343, + "step": 825 + }, + { + "epoch": 0.35729541214955796, + "grad_norm": 21.976581573486328, + "learning_rate": 1.2992125984251968e-05, + "loss": 1.615, + "step": 826 + }, + { + "epoch": 0.357727973181216, + "grad_norm": 21.678958892822266, + "learning_rate": 1.2983377077865266e-05, + "loss": 1.5747, + "step": 827 + }, + { + "epoch": 0.3581605342128741, + "grad_norm": 22.03537940979004, + "learning_rate": 1.2974628171478567e-05, + "loss": 1.5686, + "step": 828 + }, + { + "epoch": 0.35859309524453215, + "grad_norm": 18.182485580444336, + "learning_rate": 1.2965879265091864e-05, + "loss": 1.6549, + "step": 829 + }, + { + "epoch": 0.3590256562761902, + "grad_norm": 18.341358184814453, + "learning_rate": 1.2957130358705164e-05, + "loss": 1.7054, + "step": 830 + }, + { + "epoch": 0.35945821730784827, + "grad_norm": 20.54411506652832, + "learning_rate": 1.2948381452318461e-05, + "loss": 1.6214, + "step": 831 + }, + { + "epoch": 0.35989077833950633, + "grad_norm": 17.28512954711914, + "learning_rate": 1.293963254593176e-05, + "loss": 1.6161, + "step": 832 + }, + { + "epoch": 0.3603233393711644, + "grad_norm": 20.21026039123535, + "learning_rate": 1.2930883639545058e-05, + "loss": 1.5949, + "step": 833 + }, + { + "epoch": 0.36075590040282246, + "grad_norm": 20.83098030090332, + "learning_rate": 1.2922134733158355e-05, + "loss": 1.556, + "step": 834 + }, + { + "epoch": 0.3611884614344805, + "grad_norm": 19.548084259033203, + "learning_rate": 1.2913385826771655e-05, + "loss": 1.6087, + "step": 835 + }, + { + "epoch": 0.3616210224661386, + "grad_norm": 19.833934783935547, + "learning_rate": 1.2904636920384952e-05, + "loss": 1.6511, + "step": 836 + }, + { + "epoch": 0.36205358349779665, + "grad_norm": 24.85418701171875, + "learning_rate": 1.2895888013998253e-05, + "loss": 1.6686, + "step": 837 + }, + { + "epoch": 0.3624861445294547, + "grad_norm": 20.29999542236328, + "learning_rate": 1.288713910761155e-05, + "loss": 1.6503, + "step": 838 + }, + { + "epoch": 0.3629187055611128, + "grad_norm": 18.538761138916016, + "learning_rate": 1.2878390201224848e-05, + "loss": 1.5934, + "step": 839 + }, + { + "epoch": 0.36335126659277084, + "grad_norm": 20.594356536865234, + "learning_rate": 1.2869641294838147e-05, + "loss": 1.6694, + "step": 840 + }, + { + "epoch": 0.3637838276244289, + "grad_norm": 18.274131774902344, + "learning_rate": 1.2860892388451445e-05, + "loss": 1.6111, + "step": 841 + }, + { + "epoch": 0.36421638865608696, + "grad_norm": 19.532379150390625, + "learning_rate": 1.2852143482064742e-05, + "loss": 1.5865, + "step": 842 + }, + { + "epoch": 0.364648949687745, + "grad_norm": 19.116188049316406, + "learning_rate": 1.2843394575678041e-05, + "loss": 1.5406, + "step": 843 + }, + { + "epoch": 0.3650815107194031, + "grad_norm": 21.525840759277344, + "learning_rate": 1.2834645669291339e-05, + "loss": 1.5579, + "step": 844 + }, + { + "epoch": 0.36551407175106115, + "grad_norm": 19.764007568359375, + "learning_rate": 1.2825896762904636e-05, + "loss": 1.5497, + "step": 845 + }, + { + "epoch": 0.36594663278271916, + "grad_norm": 20.76372528076172, + "learning_rate": 1.2817147856517936e-05, + "loss": 1.5076, + "step": 846 + }, + { + "epoch": 0.3663791938143772, + "grad_norm": 26.738079071044922, + "learning_rate": 1.2808398950131235e-05, + "loss": 1.5783, + "step": 847 + }, + { + "epoch": 0.3668117548460353, + "grad_norm": 21.190170288085938, + "learning_rate": 1.2799650043744534e-05, + "loss": 1.4697, + "step": 848 + }, + { + "epoch": 0.36724431587769335, + "grad_norm": 21.188650131225586, + "learning_rate": 1.2790901137357832e-05, + "loss": 1.6, + "step": 849 + }, + { + "epoch": 0.3676768769093514, + "grad_norm": 20.70604705810547, + "learning_rate": 1.2782152230971129e-05, + "loss": 1.6808, + "step": 850 + }, + { + "epoch": 0.3681094379410095, + "grad_norm": 20.689743041992188, + "learning_rate": 1.2773403324584428e-05, + "loss": 1.5621, + "step": 851 + }, + { + "epoch": 0.36854199897266754, + "grad_norm": 20.194637298583984, + "learning_rate": 1.2764654418197726e-05, + "loss": 1.6808, + "step": 852 + }, + { + "epoch": 0.3689745600043256, + "grad_norm": 19.224369049072266, + "learning_rate": 1.2755905511811025e-05, + "loss": 1.6184, + "step": 853 + }, + { + "epoch": 0.36940712103598367, + "grad_norm": 19.203989028930664, + "learning_rate": 1.2747156605424323e-05, + "loss": 1.5306, + "step": 854 + }, + { + "epoch": 0.36983968206764173, + "grad_norm": 20.381193161010742, + "learning_rate": 1.273840769903762e-05, + "loss": 1.6061, + "step": 855 + }, + { + "epoch": 0.3702722430992998, + "grad_norm": 19.450544357299805, + "learning_rate": 1.2729658792650921e-05, + "loss": 1.6247, + "step": 856 + }, + { + "epoch": 0.37070480413095785, + "grad_norm": 21.83024787902832, + "learning_rate": 1.2720909886264218e-05, + "loss": 1.5597, + "step": 857 + }, + { + "epoch": 0.3711373651626159, + "grad_norm": 21.240888595581055, + "learning_rate": 1.2712160979877518e-05, + "loss": 1.604, + "step": 858 + }, + { + "epoch": 0.371569926194274, + "grad_norm": 21.858312606811523, + "learning_rate": 1.2703412073490815e-05, + "loss": 1.5974, + "step": 859 + }, + { + "epoch": 0.37200248722593204, + "grad_norm": 19.26487159729004, + "learning_rate": 1.2694663167104113e-05, + "loss": 1.7254, + "step": 860 + }, + { + "epoch": 0.3724350482575901, + "grad_norm": 22.703330993652344, + "learning_rate": 1.2685914260717412e-05, + "loss": 1.6511, + "step": 861 + }, + { + "epoch": 0.37286760928924817, + "grad_norm": 20.9957218170166, + "learning_rate": 1.267716535433071e-05, + "loss": 1.5367, + "step": 862 + }, + { + "epoch": 0.37330017032090623, + "grad_norm": 19.078935623168945, + "learning_rate": 1.2668416447944007e-05, + "loss": 1.5111, + "step": 863 + }, + { + "epoch": 0.3737327313525643, + "grad_norm": 20.05775260925293, + "learning_rate": 1.2659667541557306e-05, + "loss": 1.5996, + "step": 864 + }, + { + "epoch": 0.37416529238422236, + "grad_norm": 18.394546508789062, + "learning_rate": 1.2650918635170604e-05, + "loss": 1.6372, + "step": 865 + }, + { + "epoch": 0.3745978534158804, + "grad_norm": 19.035602569580078, + "learning_rate": 1.2642169728783905e-05, + "loss": 1.6267, + "step": 866 + }, + { + "epoch": 0.3750304144475385, + "grad_norm": 19.66758918762207, + "learning_rate": 1.2633420822397202e-05, + "loss": 1.5181, + "step": 867 + }, + { + "epoch": 0.3754629754791965, + "grad_norm": 19.294878005981445, + "learning_rate": 1.26246719160105e-05, + "loss": 1.6038, + "step": 868 + }, + { + "epoch": 0.37589553651085456, + "grad_norm": 21.874311447143555, + "learning_rate": 1.2615923009623799e-05, + "loss": 1.5302, + "step": 869 + }, + { + "epoch": 0.3763280975425126, + "grad_norm": 20.003787994384766, + "learning_rate": 1.2607174103237096e-05, + "loss": 1.5961, + "step": 870 + }, + { + "epoch": 0.3767606585741707, + "grad_norm": 19.584362030029297, + "learning_rate": 1.2598425196850394e-05, + "loss": 1.6231, + "step": 871 + }, + { + "epoch": 0.37719321960582874, + "grad_norm": 18.144039154052734, + "learning_rate": 1.2589676290463693e-05, + "loss": 1.4876, + "step": 872 + }, + { + "epoch": 0.3776257806374868, + "grad_norm": 20.27103042602539, + "learning_rate": 1.258092738407699e-05, + "loss": 1.5783, + "step": 873 + }, + { + "epoch": 0.37805834166914487, + "grad_norm": 24.131301879882812, + "learning_rate": 1.2572178477690288e-05, + "loss": 1.6346, + "step": 874 + }, + { + "epoch": 0.37849090270080293, + "grad_norm": 25.75758934020996, + "learning_rate": 1.2563429571303587e-05, + "loss": 1.7327, + "step": 875 + }, + { + "epoch": 0.378923463732461, + "grad_norm": 22.106565475463867, + "learning_rate": 1.2554680664916886e-05, + "loss": 1.618, + "step": 876 + }, + { + "epoch": 0.37935602476411906, + "grad_norm": 19.74208641052246, + "learning_rate": 1.2545931758530186e-05, + "loss": 1.5391, + "step": 877 + }, + { + "epoch": 0.3797885857957771, + "grad_norm": 22.241756439208984, + "learning_rate": 1.2537182852143483e-05, + "loss": 1.597, + "step": 878 + }, + { + "epoch": 0.3802211468274352, + "grad_norm": 18.778261184692383, + "learning_rate": 1.2528433945756782e-05, + "loss": 1.5784, + "step": 879 + }, + { + "epoch": 0.38065370785909325, + "grad_norm": 20.45719337463379, + "learning_rate": 1.251968503937008e-05, + "loss": 1.5788, + "step": 880 + }, + { + "epoch": 0.3810862688907513, + "grad_norm": 23.036163330078125, + "learning_rate": 1.2510936132983377e-05, + "loss": 1.7009, + "step": 881 + }, + { + "epoch": 0.3815188299224094, + "grad_norm": 20.001171112060547, + "learning_rate": 1.2502187226596677e-05, + "loss": 1.4824, + "step": 882 + }, + { + "epoch": 0.38195139095406744, + "grad_norm": 19.137182235717773, + "learning_rate": 1.2493438320209974e-05, + "loss": 1.5866, + "step": 883 + }, + { + "epoch": 0.3823839519857255, + "grad_norm": 21.097896575927734, + "learning_rate": 1.2484689413823272e-05, + "loss": 1.5977, + "step": 884 + }, + { + "epoch": 0.38281651301738356, + "grad_norm": 20.685922622680664, + "learning_rate": 1.2475940507436573e-05, + "loss": 1.6055, + "step": 885 + }, + { + "epoch": 0.3832490740490416, + "grad_norm": 19.775070190429688, + "learning_rate": 1.246719160104987e-05, + "loss": 1.6436, + "step": 886 + }, + { + "epoch": 0.3836816350806997, + "grad_norm": 20.196359634399414, + "learning_rate": 1.245844269466317e-05, + "loss": 1.5265, + "step": 887 + }, + { + "epoch": 0.38411419611235775, + "grad_norm": 19.682497024536133, + "learning_rate": 1.2449693788276467e-05, + "loss": 1.5624, + "step": 888 + }, + { + "epoch": 0.3845467571440158, + "grad_norm": 20.493722915649414, + "learning_rate": 1.2440944881889764e-05, + "loss": 1.6223, + "step": 889 + }, + { + "epoch": 0.3849793181756738, + "grad_norm": 19.67377281188965, + "learning_rate": 1.2432195975503064e-05, + "loss": 1.6164, + "step": 890 + }, + { + "epoch": 0.3854118792073319, + "grad_norm": 18.345895767211914, + "learning_rate": 1.2423447069116361e-05, + "loss": 1.6155, + "step": 891 + }, + { + "epoch": 0.38584444023898995, + "grad_norm": 18.30042266845703, + "learning_rate": 1.2414698162729659e-05, + "loss": 1.5257, + "step": 892 + }, + { + "epoch": 0.386277001270648, + "grad_norm": 19.78774070739746, + "learning_rate": 1.2405949256342958e-05, + "loss": 1.716, + "step": 893 + }, + { + "epoch": 0.3867095623023061, + "grad_norm": 21.33734130859375, + "learning_rate": 1.2397200349956255e-05, + "loss": 1.5729, + "step": 894 + }, + { + "epoch": 0.38714212333396414, + "grad_norm": 18.871843338012695, + "learning_rate": 1.2388451443569556e-05, + "loss": 1.6478, + "step": 895 + }, + { + "epoch": 0.3875746843656222, + "grad_norm": 20.553613662719727, + "learning_rate": 1.2379702537182854e-05, + "loss": 1.6663, + "step": 896 + }, + { + "epoch": 0.38800724539728026, + "grad_norm": 19.660924911499023, + "learning_rate": 1.2370953630796151e-05, + "loss": 1.6294, + "step": 897 + }, + { + "epoch": 0.38843980642893833, + "grad_norm": 17.97296714782715, + "learning_rate": 1.236220472440945e-05, + "loss": 1.5321, + "step": 898 + }, + { + "epoch": 0.3888723674605964, + "grad_norm": 20.643564224243164, + "learning_rate": 1.2353455818022748e-05, + "loss": 1.5242, + "step": 899 + }, + { + "epoch": 0.38930492849225445, + "grad_norm": 19.187049865722656, + "learning_rate": 1.2344706911636047e-05, + "loss": 1.5819, + "step": 900 + }, + { + "epoch": 0.3897374895239125, + "grad_norm": 18.788549423217773, + "learning_rate": 1.2335958005249345e-05, + "loss": 1.6182, + "step": 901 + }, + { + "epoch": 0.3901700505555706, + "grad_norm": 20.54323959350586, + "learning_rate": 1.2327209098862642e-05, + "loss": 1.6235, + "step": 902 + }, + { + "epoch": 0.39060261158722864, + "grad_norm": 20.15620231628418, + "learning_rate": 1.2318460192475941e-05, + "loss": 1.5536, + "step": 903 + }, + { + "epoch": 0.3910351726188867, + "grad_norm": 18.571413040161133, + "learning_rate": 1.230971128608924e-05, + "loss": 1.6502, + "step": 904 + }, + { + "epoch": 0.39146773365054477, + "grad_norm": 19.422834396362305, + "learning_rate": 1.230096237970254e-05, + "loss": 1.5417, + "step": 905 + }, + { + "epoch": 0.39190029468220283, + "grad_norm": 20.704635620117188, + "learning_rate": 1.2292213473315837e-05, + "loss": 1.6313, + "step": 906 + }, + { + "epoch": 0.3923328557138609, + "grad_norm": 17.548553466796875, + "learning_rate": 1.2283464566929135e-05, + "loss": 1.6678, + "step": 907 + }, + { + "epoch": 0.39276541674551896, + "grad_norm": 19.540618896484375, + "learning_rate": 1.2274715660542434e-05, + "loss": 1.5068, + "step": 908 + }, + { + "epoch": 0.393197977777177, + "grad_norm": 18.6761531829834, + "learning_rate": 1.2265966754155732e-05, + "loss": 1.5907, + "step": 909 + }, + { + "epoch": 0.3936305388088351, + "grad_norm": 19.963176727294922, + "learning_rate": 1.2257217847769029e-05, + "loss": 1.5861, + "step": 910 + }, + { + "epoch": 0.39406309984049315, + "grad_norm": 20.782676696777344, + "learning_rate": 1.2248468941382328e-05, + "loss": 1.5452, + "step": 911 + }, + { + "epoch": 0.39449566087215115, + "grad_norm": 20.450653076171875, + "learning_rate": 1.2239720034995626e-05, + "loss": 1.5404, + "step": 912 + }, + { + "epoch": 0.3949282219038092, + "grad_norm": 18.47747230529785, + "learning_rate": 1.2230971128608923e-05, + "loss": 1.6799, + "step": 913 + }, + { + "epoch": 0.3953607829354673, + "grad_norm": 21.109474182128906, + "learning_rate": 1.2222222222222224e-05, + "loss": 1.5815, + "step": 914 + }, + { + "epoch": 0.39579334396712534, + "grad_norm": 20.80393409729004, + "learning_rate": 1.2213473315835522e-05, + "loss": 1.6559, + "step": 915 + }, + { + "epoch": 0.3962259049987834, + "grad_norm": 21.14255714416504, + "learning_rate": 1.2204724409448821e-05, + "loss": 1.6067, + "step": 916 + }, + { + "epoch": 0.39665846603044147, + "grad_norm": 18.84864616394043, + "learning_rate": 1.2195975503062118e-05, + "loss": 1.5728, + "step": 917 + }, + { + "epoch": 0.39709102706209953, + "grad_norm": 21.547277450561523, + "learning_rate": 1.2187226596675416e-05, + "loss": 1.5919, + "step": 918 + }, + { + "epoch": 0.3975235880937576, + "grad_norm": 20.30232048034668, + "learning_rate": 1.2178477690288715e-05, + "loss": 1.5967, + "step": 919 + }, + { + "epoch": 0.39795614912541566, + "grad_norm": 19.81118392944336, + "learning_rate": 1.2169728783902013e-05, + "loss": 1.5498, + "step": 920 + }, + { + "epoch": 0.3983887101570737, + "grad_norm": 20.070619583129883, + "learning_rate": 1.216097987751531e-05, + "loss": 1.5291, + "step": 921 + }, + { + "epoch": 0.3988212711887318, + "grad_norm": 18.752845764160156, + "learning_rate": 1.215223097112861e-05, + "loss": 1.595, + "step": 922 + }, + { + "epoch": 0.39925383222038985, + "grad_norm": 20.85921287536621, + "learning_rate": 1.2143482064741907e-05, + "loss": 1.4615, + "step": 923 + }, + { + "epoch": 0.3996863932520479, + "grad_norm": 20.928272247314453, + "learning_rate": 1.2134733158355208e-05, + "loss": 1.5868, + "step": 924 + }, + { + "epoch": 0.400118954283706, + "grad_norm": 21.330936431884766, + "learning_rate": 1.2125984251968505e-05, + "loss": 1.5825, + "step": 925 + }, + { + "epoch": 0.40055151531536404, + "grad_norm": 19.58514404296875, + "learning_rate": 1.2117235345581804e-05, + "loss": 1.6101, + "step": 926 + }, + { + "epoch": 0.4009840763470221, + "grad_norm": 19.794700622558594, + "learning_rate": 1.2108486439195102e-05, + "loss": 1.5484, + "step": 927 + }, + { + "epoch": 0.40141663737868016, + "grad_norm": 20.019779205322266, + "learning_rate": 1.20997375328084e-05, + "loss": 1.6467, + "step": 928 + }, + { + "epoch": 0.4018491984103382, + "grad_norm": 18.108291625976562, + "learning_rate": 1.2090988626421699e-05, + "loss": 1.568, + "step": 929 + }, + { + "epoch": 0.4022817594419963, + "grad_norm": 20.311721801757812, + "learning_rate": 1.2082239720034996e-05, + "loss": 1.4256, + "step": 930 + }, + { + "epoch": 0.40271432047365435, + "grad_norm": 20.440967559814453, + "learning_rate": 1.2073490813648294e-05, + "loss": 1.5777, + "step": 931 + }, + { + "epoch": 0.4031468815053124, + "grad_norm": 20.755502700805664, + "learning_rate": 1.2064741907261593e-05, + "loss": 1.5757, + "step": 932 + }, + { + "epoch": 0.4035794425369704, + "grad_norm": 21.64055633544922, + "learning_rate": 1.2055993000874892e-05, + "loss": 1.6168, + "step": 933 + }, + { + "epoch": 0.4040120035686285, + "grad_norm": 19.491615295410156, + "learning_rate": 1.2047244094488191e-05, + "loss": 1.5655, + "step": 934 + }, + { + "epoch": 0.40444456460028655, + "grad_norm": 19.454198837280273, + "learning_rate": 1.2038495188101489e-05, + "loss": 1.5896, + "step": 935 + }, + { + "epoch": 0.4048771256319446, + "grad_norm": 20.64267349243164, + "learning_rate": 1.2029746281714786e-05, + "loss": 1.5161, + "step": 936 + }, + { + "epoch": 0.4053096866636027, + "grad_norm": 23.738571166992188, + "learning_rate": 1.2020997375328086e-05, + "loss": 1.7056, + "step": 937 + }, + { + "epoch": 0.40574224769526074, + "grad_norm": 22.766353607177734, + "learning_rate": 1.2012248468941383e-05, + "loss": 1.5408, + "step": 938 + }, + { + "epoch": 0.4061748087269188, + "grad_norm": 21.712682723999023, + "learning_rate": 1.200349956255468e-05, + "loss": 1.5679, + "step": 939 + }, + { + "epoch": 0.40660736975857686, + "grad_norm": 20.704023361206055, + "learning_rate": 1.199475065616798e-05, + "loss": 1.5291, + "step": 940 + }, + { + "epoch": 0.4070399307902349, + "grad_norm": 21.490806579589844, + "learning_rate": 1.1986001749781277e-05, + "loss": 1.5927, + "step": 941 + }, + { + "epoch": 0.407472491821893, + "grad_norm": 18.206605911254883, + "learning_rate": 1.1977252843394575e-05, + "loss": 1.5576, + "step": 942 + }, + { + "epoch": 0.40790505285355105, + "grad_norm": 19.448074340820312, + "learning_rate": 1.1968503937007876e-05, + "loss": 1.5126, + "step": 943 + }, + { + "epoch": 0.4083376138852091, + "grad_norm": 19.33980369567871, + "learning_rate": 1.1959755030621173e-05, + "loss": 1.5412, + "step": 944 + }, + { + "epoch": 0.4087701749168672, + "grad_norm": 20.316843032836914, + "learning_rate": 1.1951006124234472e-05, + "loss": 1.697, + "step": 945 + }, + { + "epoch": 0.40920273594852524, + "grad_norm": 19.3831787109375, + "learning_rate": 1.194225721784777e-05, + "loss": 1.5501, + "step": 946 + }, + { + "epoch": 0.4096352969801833, + "grad_norm": 23.11265754699707, + "learning_rate": 1.193350831146107e-05, + "loss": 1.6124, + "step": 947 + }, + { + "epoch": 0.41006785801184137, + "grad_norm": 23.18746566772461, + "learning_rate": 1.1924759405074367e-05, + "loss": 1.61, + "step": 948 + }, + { + "epoch": 0.41050041904349943, + "grad_norm": 20.464345932006836, + "learning_rate": 1.1916010498687664e-05, + "loss": 1.6349, + "step": 949 + }, + { + "epoch": 0.4109329800751575, + "grad_norm": 18.070871353149414, + "learning_rate": 1.1907261592300963e-05, + "loss": 1.6273, + "step": 950 + }, + { + "epoch": 0.41136554110681556, + "grad_norm": 19.049097061157227, + "learning_rate": 1.1898512685914261e-05, + "loss": 1.4902, + "step": 951 + }, + { + "epoch": 0.4117981021384736, + "grad_norm": 20.535675048828125, + "learning_rate": 1.1889763779527562e-05, + "loss": 1.5897, + "step": 952 + }, + { + "epoch": 0.4122306631701317, + "grad_norm": 19.585886001586914, + "learning_rate": 1.188101487314086e-05, + "loss": 1.6457, + "step": 953 + }, + { + "epoch": 0.41266322420178975, + "grad_norm": 19.01354217529297, + "learning_rate": 1.1872265966754157e-05, + "loss": 1.6315, + "step": 954 + }, + { + "epoch": 0.41309578523344775, + "grad_norm": 21.235271453857422, + "learning_rate": 1.1863517060367456e-05, + "loss": 1.4923, + "step": 955 + }, + { + "epoch": 0.4135283462651058, + "grad_norm": 19.011507034301758, + "learning_rate": 1.1854768153980754e-05, + "loss": 1.5508, + "step": 956 + }, + { + "epoch": 0.4139609072967639, + "grad_norm": 20.91914176940918, + "learning_rate": 1.1846019247594051e-05, + "loss": 1.5762, + "step": 957 + }, + { + "epoch": 0.41439346832842194, + "grad_norm": 23.34760284423828, + "learning_rate": 1.183727034120735e-05, + "loss": 1.5603, + "step": 958 + }, + { + "epoch": 0.41482602936008, + "grad_norm": 23.663442611694336, + "learning_rate": 1.1828521434820648e-05, + "loss": 1.607, + "step": 959 + }, + { + "epoch": 0.41525859039173807, + "grad_norm": 18.84950065612793, + "learning_rate": 1.1819772528433945e-05, + "loss": 1.5646, + "step": 960 + }, + { + "epoch": 0.41569115142339613, + "grad_norm": 20.305644989013672, + "learning_rate": 1.1811023622047245e-05, + "loss": 1.5428, + "step": 961 + }, + { + "epoch": 0.4161237124550542, + "grad_norm": 19.10517692565918, + "learning_rate": 1.1802274715660544e-05, + "loss": 1.5305, + "step": 962 + }, + { + "epoch": 0.41655627348671226, + "grad_norm": 22.100454330444336, + "learning_rate": 1.1793525809273843e-05, + "loss": 1.5978, + "step": 963 + }, + { + "epoch": 0.4169888345183703, + "grad_norm": 19.938411712646484, + "learning_rate": 1.178477690288714e-05, + "loss": 1.5133, + "step": 964 + }, + { + "epoch": 0.4174213955500284, + "grad_norm": 20.516897201538086, + "learning_rate": 1.1776027996500438e-05, + "loss": 1.6113, + "step": 965 + }, + { + "epoch": 0.41785395658168645, + "grad_norm": 19.226377487182617, + "learning_rate": 1.1767279090113737e-05, + "loss": 1.5843, + "step": 966 + }, + { + "epoch": 0.4182865176133445, + "grad_norm": 25.08070182800293, + "learning_rate": 1.1758530183727035e-05, + "loss": 1.6075, + "step": 967 + }, + { + "epoch": 0.4187190786450026, + "grad_norm": 20.30937385559082, + "learning_rate": 1.1749781277340332e-05, + "loss": 1.5732, + "step": 968 + }, + { + "epoch": 0.41915163967666064, + "grad_norm": 20.013835906982422, + "learning_rate": 1.1741032370953631e-05, + "loss": 1.6688, + "step": 969 + }, + { + "epoch": 0.4195842007083187, + "grad_norm": 21.13936424255371, + "learning_rate": 1.1732283464566929e-05, + "loss": 1.5732, + "step": 970 + }, + { + "epoch": 0.42001676173997676, + "grad_norm": 20.149782180786133, + "learning_rate": 1.1723534558180228e-05, + "loss": 1.5247, + "step": 971 + }, + { + "epoch": 0.4204493227716348, + "grad_norm": 22.26946449279785, + "learning_rate": 1.1714785651793527e-05, + "loss": 1.5106, + "step": 972 + }, + { + "epoch": 0.4208818838032929, + "grad_norm": 20.233417510986328, + "learning_rate": 1.1706036745406827e-05, + "loss": 1.5702, + "step": 973 + }, + { + "epoch": 0.42131444483495095, + "grad_norm": 20.490550994873047, + "learning_rate": 1.1697287839020124e-05, + "loss": 1.6082, + "step": 974 + }, + { + "epoch": 0.421747005866609, + "grad_norm": 19.094743728637695, + "learning_rate": 1.1688538932633422e-05, + "loss": 1.5653, + "step": 975 + }, + { + "epoch": 0.4221795668982671, + "grad_norm": 18.951610565185547, + "learning_rate": 1.167979002624672e-05, + "loss": 1.5294, + "step": 976 + }, + { + "epoch": 0.4226121279299251, + "grad_norm": 19.141788482666016, + "learning_rate": 1.1671041119860018e-05, + "loss": 1.5549, + "step": 977 + }, + { + "epoch": 0.42304468896158315, + "grad_norm": 22.37894630432129, + "learning_rate": 1.1662292213473316e-05, + "loss": 1.5739, + "step": 978 + }, + { + "epoch": 0.4234772499932412, + "grad_norm": 21.99265480041504, + "learning_rate": 1.1653543307086615e-05, + "loss": 1.5732, + "step": 979 + }, + { + "epoch": 0.4239098110248993, + "grad_norm": 21.509435653686523, + "learning_rate": 1.1644794400699913e-05, + "loss": 1.6226, + "step": 980 + }, + { + "epoch": 0.42434237205655734, + "grad_norm": 21.833322525024414, + "learning_rate": 1.1636045494313213e-05, + "loss": 1.561, + "step": 981 + }, + { + "epoch": 0.4247749330882154, + "grad_norm": 20.158653259277344, + "learning_rate": 1.1627296587926511e-05, + "loss": 1.5053, + "step": 982 + }, + { + "epoch": 0.42520749411987346, + "grad_norm": 23.664594650268555, + "learning_rate": 1.1618547681539808e-05, + "loss": 1.6409, + "step": 983 + }, + { + "epoch": 0.4256400551515315, + "grad_norm": 20.181825637817383, + "learning_rate": 1.1609798775153108e-05, + "loss": 1.615, + "step": 984 + }, + { + "epoch": 0.4260726161831896, + "grad_norm": 23.865310668945312, + "learning_rate": 1.1601049868766405e-05, + "loss": 1.6681, + "step": 985 + }, + { + "epoch": 0.42650517721484765, + "grad_norm": 22.003345489501953, + "learning_rate": 1.1592300962379703e-05, + "loss": 1.594, + "step": 986 + }, + { + "epoch": 0.4269377382465057, + "grad_norm": 21.85173797607422, + "learning_rate": 1.1583552055993002e-05, + "loss": 1.5251, + "step": 987 + }, + { + "epoch": 0.4273702992781638, + "grad_norm": 22.519914627075195, + "learning_rate": 1.15748031496063e-05, + "loss": 1.6031, + "step": 988 + }, + { + "epoch": 0.42780286030982184, + "grad_norm": 18.701091766357422, + "learning_rate": 1.1566054243219597e-05, + "loss": 1.5812, + "step": 989 + }, + { + "epoch": 0.4282354213414799, + "grad_norm": 18.49114418029785, + "learning_rate": 1.1557305336832896e-05, + "loss": 1.573, + "step": 990 + }, + { + "epoch": 0.42866798237313797, + "grad_norm": 21.027111053466797, + "learning_rate": 1.1548556430446195e-05, + "loss": 1.5618, + "step": 991 + }, + { + "epoch": 0.42910054340479603, + "grad_norm": 22.11174964904785, + "learning_rate": 1.1539807524059495e-05, + "loss": 1.5999, + "step": 992 + }, + { + "epoch": 0.4295331044364541, + "grad_norm": 23.42853355407715, + "learning_rate": 1.1531058617672792e-05, + "loss": 1.5065, + "step": 993 + }, + { + "epoch": 0.42996566546811216, + "grad_norm": 20.36092185974121, + "learning_rate": 1.1522309711286091e-05, + "loss": 1.5454, + "step": 994 + }, + { + "epoch": 0.4303982264997702, + "grad_norm": 20.617124557495117, + "learning_rate": 1.1513560804899389e-05, + "loss": 1.6106, + "step": 995 + }, + { + "epoch": 0.4308307875314283, + "grad_norm": 21.008230209350586, + "learning_rate": 1.1504811898512686e-05, + "loss": 1.6075, + "step": 996 + }, + { + "epoch": 0.43126334856308635, + "grad_norm": 23.843276977539062, + "learning_rate": 1.1496062992125985e-05, + "loss": 1.5715, + "step": 997 + }, + { + "epoch": 0.4316959095947444, + "grad_norm": 23.46642303466797, + "learning_rate": 1.1487314085739283e-05, + "loss": 1.5887, + "step": 998 + }, + { + "epoch": 0.4321284706264024, + "grad_norm": 32.012420654296875, + "learning_rate": 1.147856517935258e-05, + "loss": 1.5999, + "step": 999 + }, + { + "epoch": 0.4325610316580605, + "grad_norm": 22.296770095825195, + "learning_rate": 1.1469816272965881e-05, + "loss": 1.5889, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 2311, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2407280868764634e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}