diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,88396 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999603881956823, + "eval_steps": 500, + "global_step": 12622, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.922360863537334e-05, + "grad_norm": 42.614111854296226, + "learning_rate": 5.277044854881267e-08, + "loss": 2.2873, + "step": 1 + }, + { + "epoch": 0.00015844721727074668, + "grad_norm": 37.76236820883414, + "learning_rate": 1.0554089709762534e-07, + "loss": 2.1054, + "step": 2 + }, + { + "epoch": 0.00023767082590612002, + "grad_norm": 44.20324100032348, + "learning_rate": 1.5831134564643802e-07, + "loss": 2.3843, + "step": 3 + }, + { + "epoch": 0.00031689443454149336, + "grad_norm": 45.99189863287436, + "learning_rate": 2.1108179419525068e-07, + "loss": 2.3161, + "step": 4 + }, + { + "epoch": 0.0003961180431768667, + "grad_norm": 61.44135033953449, + "learning_rate": 2.6385224274406334e-07, + "loss": 2.5541, + "step": 5 + }, + { + "epoch": 0.00047534165181224003, + "grad_norm": 51.340025526685665, + "learning_rate": 3.1662269129287605e-07, + "loss": 2.4175, + "step": 6 + }, + { + "epoch": 0.0005545652604476134, + "grad_norm": 40.24597610798052, + "learning_rate": 3.693931398416887e-07, + "loss": 2.2304, + "step": 7 + }, + { + "epoch": 0.0006337888690829867, + "grad_norm": 35.26815455408599, + "learning_rate": 4.2216358839050136e-07, + "loss": 2.154, + "step": 8 + }, + { + "epoch": 0.00071301247771836, + "grad_norm": 37.95145832540095, + "learning_rate": 4.7493403693931397e-07, + "loss": 2.3349, + "step": 9 + }, + { + "epoch": 0.0007922360863537334, + "grad_norm": 32.52077001059761, + "learning_rate": 5.277044854881267e-07, + "loss": 2.1674, + "step": 10 + }, + { + "epoch": 0.0008714596949891067, + "grad_norm": 32.695184250972915, + "learning_rate": 5.804749340369393e-07, + "loss": 2.2218, + "step": 11 + }, + { + "epoch": 0.0009506833036244801, + "grad_norm": 31.569994275350567, + "learning_rate": 6.332453825857521e-07, + "loss": 2.1072, + "step": 12 + }, + { + "epoch": 0.0010299069122598535, + "grad_norm": 36.87101598190986, + "learning_rate": 6.860158311345646e-07, + "loss": 2.1892, + "step": 13 + }, + { + "epoch": 0.0011091305208952267, + "grad_norm": 31.42772724566191, + "learning_rate": 7.387862796833774e-07, + "loss": 1.9935, + "step": 14 + }, + { + "epoch": 0.0011883541295306002, + "grad_norm": 33.810169392899475, + "learning_rate": 7.915567282321901e-07, + "loss": 1.9951, + "step": 15 + }, + { + "epoch": 0.0012675777381659734, + "grad_norm": 34.022371788566474, + "learning_rate": 8.443271767810027e-07, + "loss": 1.8929, + "step": 16 + }, + { + "epoch": 0.0013468013468013469, + "grad_norm": 27.522247882860785, + "learning_rate": 8.970976253298154e-07, + "loss": 1.743, + "step": 17 + }, + { + "epoch": 0.00142602495543672, + "grad_norm": 26.687057865206754, + "learning_rate": 9.498680738786279e-07, + "loss": 1.7579, + "step": 18 + }, + { + "epoch": 0.0015052485640720936, + "grad_norm": 25.8674338473116, + "learning_rate": 1.0026385224274407e-06, + "loss": 1.6363, + "step": 19 + }, + { + "epoch": 0.0015844721727074668, + "grad_norm": 24.962962468925166, + "learning_rate": 1.0554089709762534e-06, + "loss": 1.5538, + "step": 20 + }, + { + "epoch": 0.0016636957813428402, + "grad_norm": 19.201733561685682, + "learning_rate": 1.108179419525066e-06, + "loss": 1.4519, + "step": 21 + }, + { + "epoch": 0.0017429193899782135, + "grad_norm": 17.008045145586003, + "learning_rate": 1.1609498680738787e-06, + "loss": 1.5598, + "step": 22 + }, + { + "epoch": 0.001822142998613587, + "grad_norm": 22.925097803179106, + "learning_rate": 1.2137203166226915e-06, + "loss": 1.3887, + "step": 23 + }, + { + "epoch": 0.0019013666072489601, + "grad_norm": 14.467758174340604, + "learning_rate": 1.2664907651715042e-06, + "loss": 1.4529, + "step": 24 + }, + { + "epoch": 0.0019805902158843334, + "grad_norm": 18.069021547652184, + "learning_rate": 1.3192612137203166e-06, + "loss": 1.5061, + "step": 25 + }, + { + "epoch": 0.002059813824519707, + "grad_norm": 13.54544464402972, + "learning_rate": 1.3720316622691293e-06, + "loss": 1.3593, + "step": 26 + }, + { + "epoch": 0.0021390374331550803, + "grad_norm": 13.221854786587498, + "learning_rate": 1.4248021108179422e-06, + "loss": 1.3513, + "step": 27 + }, + { + "epoch": 0.0022182610417904535, + "grad_norm": 15.201327664134368, + "learning_rate": 1.4775725593667548e-06, + "loss": 1.4593, + "step": 28 + }, + { + "epoch": 0.0022974846504258267, + "grad_norm": 11.505659412870976, + "learning_rate": 1.5303430079155673e-06, + "loss": 1.3352, + "step": 29 + }, + { + "epoch": 0.0023767082590612004, + "grad_norm": 11.135420445794026, + "learning_rate": 1.5831134564643801e-06, + "loss": 1.1387, + "step": 30 + }, + { + "epoch": 0.0024559318676965736, + "grad_norm": 12.751311370173159, + "learning_rate": 1.6358839050131928e-06, + "loss": 1.2372, + "step": 31 + }, + { + "epoch": 0.002535155476331947, + "grad_norm": 10.191505969079218, + "learning_rate": 1.6886543535620054e-06, + "loss": 1.2993, + "step": 32 + }, + { + "epoch": 0.00261437908496732, + "grad_norm": 10.36869003526171, + "learning_rate": 1.7414248021108183e-06, + "loss": 1.1897, + "step": 33 + }, + { + "epoch": 0.0026936026936026937, + "grad_norm": 11.106688395157834, + "learning_rate": 1.7941952506596308e-06, + "loss": 1.2186, + "step": 34 + }, + { + "epoch": 0.002772826302238067, + "grad_norm": 9.811641497260164, + "learning_rate": 1.8469656992084434e-06, + "loss": 1.173, + "step": 35 + }, + { + "epoch": 0.00285204991087344, + "grad_norm": 9.748182125181, + "learning_rate": 1.8997361477572559e-06, + "loss": 1.1051, + "step": 36 + }, + { + "epoch": 0.0029312735195088134, + "grad_norm": 11.359302006106475, + "learning_rate": 1.9525065963060687e-06, + "loss": 1.1292, + "step": 37 + }, + { + "epoch": 0.003010497128144187, + "grad_norm": 9.351484551662441, + "learning_rate": 2.0052770448548814e-06, + "loss": 1.1873, + "step": 38 + }, + { + "epoch": 0.0030897207367795603, + "grad_norm": 9.400069186684982, + "learning_rate": 2.058047493403694e-06, + "loss": 1.2061, + "step": 39 + }, + { + "epoch": 0.0031689443454149336, + "grad_norm": 10.5292163306882, + "learning_rate": 2.1108179419525067e-06, + "loss": 1.1456, + "step": 40 + }, + { + "epoch": 0.003248167954050307, + "grad_norm": 8.75803640559494, + "learning_rate": 2.1635883905013194e-06, + "loss": 1.122, + "step": 41 + }, + { + "epoch": 0.0033273915626856805, + "grad_norm": 9.958373416709335, + "learning_rate": 2.216358839050132e-06, + "loss": 1.0987, + "step": 42 + }, + { + "epoch": 0.0034066151713210537, + "grad_norm": 10.295384189695442, + "learning_rate": 2.2691292875989447e-06, + "loss": 1.0441, + "step": 43 + }, + { + "epoch": 0.003485838779956427, + "grad_norm": 9.364845821887274, + "learning_rate": 2.3218997361477573e-06, + "loss": 1.1967, + "step": 44 + }, + { + "epoch": 0.0035650623885918, + "grad_norm": 10.836964308793071, + "learning_rate": 2.37467018469657e-06, + "loss": 1.1089, + "step": 45 + }, + { + "epoch": 0.003644285997227174, + "grad_norm": 10.301230304850362, + "learning_rate": 2.427440633245383e-06, + "loss": 1.2325, + "step": 46 + }, + { + "epoch": 0.003723509605862547, + "grad_norm": 11.32728636855559, + "learning_rate": 2.4802110817941953e-06, + "loss": 1.06, + "step": 47 + }, + { + "epoch": 0.0038027332144979203, + "grad_norm": 8.608360270376286, + "learning_rate": 2.5329815303430084e-06, + "loss": 1.0562, + "step": 48 + }, + { + "epoch": 0.0038819568231332935, + "grad_norm": 9.748035469890205, + "learning_rate": 2.5857519788918206e-06, + "loss": 1.0684, + "step": 49 + }, + { + "epoch": 0.003961180431768667, + "grad_norm": 9.799999174312287, + "learning_rate": 2.6385224274406333e-06, + "loss": 0.9504, + "step": 50 + }, + { + "epoch": 0.00404040404040404, + "grad_norm": 8.982776127017706, + "learning_rate": 2.6912928759894464e-06, + "loss": 1.0965, + "step": 51 + }, + { + "epoch": 0.004119627649039414, + "grad_norm": 8.334820113759973, + "learning_rate": 2.7440633245382586e-06, + "loss": 1.0768, + "step": 52 + }, + { + "epoch": 0.004198851257674787, + "grad_norm": 9.00233498749542, + "learning_rate": 2.7968337730870717e-06, + "loss": 1.0667, + "step": 53 + }, + { + "epoch": 0.0042780748663101605, + "grad_norm": 8.639113973155974, + "learning_rate": 2.8496042216358843e-06, + "loss": 1.0581, + "step": 54 + }, + { + "epoch": 0.004357298474945534, + "grad_norm": 8.878926254612908, + "learning_rate": 2.9023746701846966e-06, + "loss": 0.9961, + "step": 55 + }, + { + "epoch": 0.004436522083580907, + "grad_norm": 8.162243884388658, + "learning_rate": 2.9551451187335096e-06, + "loss": 1.0752, + "step": 56 + }, + { + "epoch": 0.004515745692216281, + "grad_norm": 8.330023679308326, + "learning_rate": 3.0079155672823223e-06, + "loss": 0.9054, + "step": 57 + }, + { + "epoch": 0.0045949693008516534, + "grad_norm": 7.53471868214602, + "learning_rate": 3.0606860158311345e-06, + "loss": 1.0052, + "step": 58 + }, + { + "epoch": 0.004674192909487027, + "grad_norm": 7.112656124939108, + "learning_rate": 3.1134564643799476e-06, + "loss": 0.9873, + "step": 59 + }, + { + "epoch": 0.004753416518122401, + "grad_norm": 9.490342308709941, + "learning_rate": 3.1662269129287603e-06, + "loss": 0.9418, + "step": 60 + }, + { + "epoch": 0.004832640126757774, + "grad_norm": 9.12196590712411, + "learning_rate": 3.2189973614775725e-06, + "loss": 1.0247, + "step": 61 + }, + { + "epoch": 0.004911863735393147, + "grad_norm": 7.820198356920157, + "learning_rate": 3.2717678100263856e-06, + "loss": 0.9676, + "step": 62 + }, + { + "epoch": 0.004991087344028521, + "grad_norm": 7.730956247882421, + "learning_rate": 3.3245382585751982e-06, + "loss": 0.915, + "step": 63 + }, + { + "epoch": 0.005070310952663894, + "grad_norm": 8.7988146994701, + "learning_rate": 3.377308707124011e-06, + "loss": 0.9994, + "step": 64 + }, + { + "epoch": 0.005149534561299267, + "grad_norm": 7.964992208166385, + "learning_rate": 3.4300791556728235e-06, + "loss": 0.9097, + "step": 65 + }, + { + "epoch": 0.00522875816993464, + "grad_norm": 9.772442960771029, + "learning_rate": 3.4828496042216366e-06, + "loss": 0.9568, + "step": 66 + }, + { + "epoch": 0.005307981778570014, + "grad_norm": 8.135360901673979, + "learning_rate": 3.535620052770449e-06, + "loss": 0.9513, + "step": 67 + }, + { + "epoch": 0.0053872053872053875, + "grad_norm": 7.6709860091833715, + "learning_rate": 3.5883905013192615e-06, + "loss": 0.8889, + "step": 68 + }, + { + "epoch": 0.00546642899584076, + "grad_norm": 7.443248507707116, + "learning_rate": 3.6411609498680746e-06, + "loss": 0.8934, + "step": 69 + }, + { + "epoch": 0.005545652604476134, + "grad_norm": 8.896062469657917, + "learning_rate": 3.693931398416887e-06, + "loss": 0.8696, + "step": 70 + }, + { + "epoch": 0.005624876213111508, + "grad_norm": 6.7033732684985425, + "learning_rate": 3.7467018469656995e-06, + "loss": 0.9287, + "step": 71 + }, + { + "epoch": 0.00570409982174688, + "grad_norm": 8.28266478949728, + "learning_rate": 3.7994722955145117e-06, + "loss": 1.0932, + "step": 72 + }, + { + "epoch": 0.005783323430382254, + "grad_norm": 7.480457460036021, + "learning_rate": 3.852242744063324e-06, + "loss": 0.9265, + "step": 73 + }, + { + "epoch": 0.005862547039017627, + "grad_norm": 8.46194228991338, + "learning_rate": 3.9050131926121375e-06, + "loss": 0.8829, + "step": 74 + }, + { + "epoch": 0.0059417706476530005, + "grad_norm": 7.108096457031844, + "learning_rate": 3.95778364116095e-06, + "loss": 0.8676, + "step": 75 + }, + { + "epoch": 0.006020994256288374, + "grad_norm": 8.26289581235156, + "learning_rate": 4.010554089709763e-06, + "loss": 0.9526, + "step": 76 + }, + { + "epoch": 0.006100217864923747, + "grad_norm": 7.426924959601872, + "learning_rate": 4.063324538258576e-06, + "loss": 0.8984, + "step": 77 + }, + { + "epoch": 0.006179441473559121, + "grad_norm": 7.314131265022161, + "learning_rate": 4.116094986807388e-06, + "loss": 0.938, + "step": 78 + }, + { + "epoch": 0.006258665082194494, + "grad_norm": 7.2349724550929215, + "learning_rate": 4.168865435356201e-06, + "loss": 0.9012, + "step": 79 + }, + { + "epoch": 0.006337888690829867, + "grad_norm": 6.303291307253892, + "learning_rate": 4.221635883905013e-06, + "loss": 0.9582, + "step": 80 + }, + { + "epoch": 0.006417112299465241, + "grad_norm": 8.165404763258302, + "learning_rate": 4.274406332453826e-06, + "loss": 0.8817, + "step": 81 + }, + { + "epoch": 0.006496335908100614, + "grad_norm": 6.589957848159812, + "learning_rate": 4.327176781002639e-06, + "loss": 0.8781, + "step": 82 + }, + { + "epoch": 0.006575559516735987, + "grad_norm": 7.044234476574764, + "learning_rate": 4.379947229551452e-06, + "loss": 0.8443, + "step": 83 + }, + { + "epoch": 0.006654783125371361, + "grad_norm": 6.679333954068805, + "learning_rate": 4.432717678100264e-06, + "loss": 0.8553, + "step": 84 + }, + { + "epoch": 0.006734006734006734, + "grad_norm": 7.208240387990633, + "learning_rate": 4.485488126649077e-06, + "loss": 0.887, + "step": 85 + }, + { + "epoch": 0.006813230342642107, + "grad_norm": 6.654126823841316, + "learning_rate": 4.538258575197889e-06, + "loss": 0.8218, + "step": 86 + }, + { + "epoch": 0.006892453951277481, + "grad_norm": 6.6573158796176966, + "learning_rate": 4.5910290237467024e-06, + "loss": 0.8883, + "step": 87 + }, + { + "epoch": 0.006971677559912854, + "grad_norm": 7.475449982222273, + "learning_rate": 4.643799472295515e-06, + "loss": 0.9484, + "step": 88 + }, + { + "epoch": 0.0070509011685482275, + "grad_norm": 7.226263349561209, + "learning_rate": 4.696569920844328e-06, + "loss": 0.9058, + "step": 89 + }, + { + "epoch": 0.0071301247771836, + "grad_norm": 6.714612107470067, + "learning_rate": 4.74934036939314e-06, + "loss": 0.945, + "step": 90 + }, + { + "epoch": 0.007209348385818974, + "grad_norm": 7.2642247561182955, + "learning_rate": 4.802110817941953e-06, + "loss": 0.8411, + "step": 91 + }, + { + "epoch": 0.007288571994454348, + "grad_norm": 6.0516847173704695, + "learning_rate": 4.854881266490766e-06, + "loss": 0.9651, + "step": 92 + }, + { + "epoch": 0.00736779560308972, + "grad_norm": 7.480869160126672, + "learning_rate": 4.907651715039578e-06, + "loss": 0.7841, + "step": 93 + }, + { + "epoch": 0.007447019211725094, + "grad_norm": 8.225645152206116, + "learning_rate": 4.960422163588391e-06, + "loss": 0.8955, + "step": 94 + }, + { + "epoch": 0.007526242820360468, + "grad_norm": 7.216394822081365, + "learning_rate": 5.013192612137203e-06, + "loss": 0.8966, + "step": 95 + }, + { + "epoch": 0.0076054664289958405, + "grad_norm": 7.369662653573449, + "learning_rate": 5.065963060686017e-06, + "loss": 0.884, + "step": 96 + }, + { + "epoch": 0.007684690037631214, + "grad_norm": 7.725513207950532, + "learning_rate": 5.118733509234829e-06, + "loss": 0.9601, + "step": 97 + }, + { + "epoch": 0.007763913646266587, + "grad_norm": 7.6744067203539545, + "learning_rate": 5.171503957783641e-06, + "loss": 0.9021, + "step": 98 + }, + { + "epoch": 0.00784313725490196, + "grad_norm": 6.291842395956368, + "learning_rate": 5.224274406332454e-06, + "loss": 0.8333, + "step": 99 + }, + { + "epoch": 0.007922360863537333, + "grad_norm": 6.437414406114134, + "learning_rate": 5.2770448548812665e-06, + "loss": 0.8087, + "step": 100 + }, + { + "epoch": 0.008001584472172708, + "grad_norm": 6.567236790812515, + "learning_rate": 5.32981530343008e-06, + "loss": 0.8172, + "step": 101 + }, + { + "epoch": 0.00808080808080808, + "grad_norm": 6.062145791177876, + "learning_rate": 5.382585751978893e-06, + "loss": 0.8378, + "step": 102 + }, + { + "epoch": 0.008160031689443454, + "grad_norm": 7.775648390585933, + "learning_rate": 5.435356200527705e-06, + "loss": 0.9035, + "step": 103 + }, + { + "epoch": 0.008239255298078828, + "grad_norm": 6.451599166161916, + "learning_rate": 5.488126649076517e-06, + "loss": 0.948, + "step": 104 + }, + { + "epoch": 0.008318478906714201, + "grad_norm": 7.411667507978405, + "learning_rate": 5.540897097625331e-06, + "loss": 0.8766, + "step": 105 + }, + { + "epoch": 0.008397702515349574, + "grad_norm": 7.033119727954545, + "learning_rate": 5.593667546174143e-06, + "loss": 0.7836, + "step": 106 + }, + { + "epoch": 0.008476926123984948, + "grad_norm": 6.369139602547609, + "learning_rate": 5.6464379947229556e-06, + "loss": 0.8344, + "step": 107 + }, + { + "epoch": 0.008556149732620321, + "grad_norm": 8.42831073461348, + "learning_rate": 5.699208443271769e-06, + "loss": 0.8721, + "step": 108 + }, + { + "epoch": 0.008635373341255694, + "grad_norm": 6.945334822024758, + "learning_rate": 5.751978891820581e-06, + "loss": 0.8424, + "step": 109 + }, + { + "epoch": 0.008714596949891068, + "grad_norm": 7.3402728309411716, + "learning_rate": 5.804749340369393e-06, + "loss": 0.7852, + "step": 110 + }, + { + "epoch": 0.008793820558526441, + "grad_norm": 7.621443431050536, + "learning_rate": 5.857519788918207e-06, + "loss": 0.8047, + "step": 111 + }, + { + "epoch": 0.008873044167161814, + "grad_norm": 6.042150172429204, + "learning_rate": 5.910290237467019e-06, + "loss": 0.6894, + "step": 112 + }, + { + "epoch": 0.008952267775797187, + "grad_norm": 8.284828432110238, + "learning_rate": 5.9630606860158315e-06, + "loss": 0.9772, + "step": 113 + }, + { + "epoch": 0.009031491384432561, + "grad_norm": 6.82287369170654, + "learning_rate": 6.015831134564645e-06, + "loss": 0.7183, + "step": 114 + }, + { + "epoch": 0.009110714993067934, + "grad_norm": 7.709064088472621, + "learning_rate": 6.068601583113457e-06, + "loss": 0.874, + "step": 115 + }, + { + "epoch": 0.009189938601703307, + "grad_norm": 7.818614870566409, + "learning_rate": 6.121372031662269e-06, + "loss": 0.8342, + "step": 116 + }, + { + "epoch": 0.009269162210338681, + "grad_norm": 6.510556273745667, + "learning_rate": 6.174142480211083e-06, + "loss": 0.9821, + "step": 117 + }, + { + "epoch": 0.009348385818974054, + "grad_norm": 6.688508998061781, + "learning_rate": 6.226912928759895e-06, + "loss": 0.8985, + "step": 118 + }, + { + "epoch": 0.009427609427609427, + "grad_norm": 6.16060199216206, + "learning_rate": 6.2796833773087074e-06, + "loss": 0.7694, + "step": 119 + }, + { + "epoch": 0.009506833036244802, + "grad_norm": 6.637708679329666, + "learning_rate": 6.3324538258575205e-06, + "loss": 0.9272, + "step": 120 + }, + { + "epoch": 0.009586056644880174, + "grad_norm": 7.117160429896391, + "learning_rate": 6.385224274406333e-06, + "loss": 0.8983, + "step": 121 + }, + { + "epoch": 0.009665280253515547, + "grad_norm": 6.381316300883817, + "learning_rate": 6.437994722955145e-06, + "loss": 0.8254, + "step": 122 + }, + { + "epoch": 0.009744503862150922, + "grad_norm": 7.535814059523654, + "learning_rate": 6.490765171503959e-06, + "loss": 0.8102, + "step": 123 + }, + { + "epoch": 0.009823727470786294, + "grad_norm": 8.245421659934367, + "learning_rate": 6.543535620052771e-06, + "loss": 0.8976, + "step": 124 + }, + { + "epoch": 0.009902951079421667, + "grad_norm": 8.035345095892026, + "learning_rate": 6.596306068601583e-06, + "loss": 0.872, + "step": 125 + }, + { + "epoch": 0.009982174688057042, + "grad_norm": 7.768805602766118, + "learning_rate": 6.6490765171503965e-06, + "loss": 0.8721, + "step": 126 + }, + { + "epoch": 0.010061398296692415, + "grad_norm": 6.8564534315544865, + "learning_rate": 6.701846965699209e-06, + "loss": 0.7111, + "step": 127 + }, + { + "epoch": 0.010140621905327787, + "grad_norm": 6.035762422487148, + "learning_rate": 6.754617414248022e-06, + "loss": 0.7817, + "step": 128 + }, + { + "epoch": 0.01021984551396316, + "grad_norm": 6.544879876408064, + "learning_rate": 6.807387862796835e-06, + "loss": 0.7646, + "step": 129 + }, + { + "epoch": 0.010299069122598535, + "grad_norm": 6.374898342587209, + "learning_rate": 6.860158311345647e-06, + "loss": 0.8517, + "step": 130 + }, + { + "epoch": 0.010378292731233908, + "grad_norm": 6.742524152542619, + "learning_rate": 6.912928759894459e-06, + "loss": 0.783, + "step": 131 + }, + { + "epoch": 0.01045751633986928, + "grad_norm": 6.463683281336434, + "learning_rate": 6.965699208443273e-06, + "loss": 0.8553, + "step": 132 + }, + { + "epoch": 0.010536739948504655, + "grad_norm": 6.006316276828489, + "learning_rate": 7.0184696569920855e-06, + "loss": 0.8629, + "step": 133 + }, + { + "epoch": 0.010615963557140028, + "grad_norm": 6.9008903602171605, + "learning_rate": 7.071240105540898e-06, + "loss": 0.8117, + "step": 134 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 6.200731497677948, + "learning_rate": 7.124010554089711e-06, + "loss": 0.9114, + "step": 135 + }, + { + "epoch": 0.010774410774410775, + "grad_norm": 6.807630684088574, + "learning_rate": 7.176781002638523e-06, + "loss": 0.8792, + "step": 136 + }, + { + "epoch": 0.010853634383046148, + "grad_norm": 5.3701401090661, + "learning_rate": 7.229551451187335e-06, + "loss": 0.8673, + "step": 137 + }, + { + "epoch": 0.01093285799168152, + "grad_norm": 6.757375028297844, + "learning_rate": 7.282321899736149e-06, + "loss": 0.8845, + "step": 138 + }, + { + "epoch": 0.011012081600316895, + "grad_norm": 7.1004230146825895, + "learning_rate": 7.3350923482849614e-06, + "loss": 0.8714, + "step": 139 + }, + { + "epoch": 0.011091305208952268, + "grad_norm": 5.336040919868236, + "learning_rate": 7.387862796833774e-06, + "loss": 0.7554, + "step": 140 + }, + { + "epoch": 0.01117052881758764, + "grad_norm": 6.988830275708981, + "learning_rate": 7.440633245382587e-06, + "loss": 0.8094, + "step": 141 + }, + { + "epoch": 0.011249752426223015, + "grad_norm": 6.582171590449282, + "learning_rate": 7.493403693931399e-06, + "loss": 0.7634, + "step": 142 + }, + { + "epoch": 0.011328976034858388, + "grad_norm": 5.987515827179504, + "learning_rate": 7.546174142480211e-06, + "loss": 0.7018, + "step": 143 + }, + { + "epoch": 0.01140819964349376, + "grad_norm": 6.700327692091717, + "learning_rate": 7.5989445910290234e-06, + "loss": 0.8476, + "step": 144 + }, + { + "epoch": 0.011487423252129134, + "grad_norm": 6.378921328384878, + "learning_rate": 7.651715039577837e-06, + "loss": 0.7756, + "step": 145 + }, + { + "epoch": 0.011566646860764508, + "grad_norm": 6.2661110671266425, + "learning_rate": 7.704485488126649e-06, + "loss": 0.7612, + "step": 146 + }, + { + "epoch": 0.011645870469399881, + "grad_norm": 6.020189423063324, + "learning_rate": 7.757255936675462e-06, + "loss": 0.8413, + "step": 147 + }, + { + "epoch": 0.011725094078035254, + "grad_norm": 6.020660293827696, + "learning_rate": 7.810026385224275e-06, + "loss": 0.7821, + "step": 148 + }, + { + "epoch": 0.011804317686670628, + "grad_norm": 6.255480130642274, + "learning_rate": 7.862796833773088e-06, + "loss": 0.8484, + "step": 149 + }, + { + "epoch": 0.011883541295306001, + "grad_norm": 5.665212849234186, + "learning_rate": 7.9155672823219e-06, + "loss": 0.7354, + "step": 150 + }, + { + "epoch": 0.011962764903941374, + "grad_norm": 6.09860841147306, + "learning_rate": 7.968337730870712e-06, + "loss": 0.6789, + "step": 151 + }, + { + "epoch": 0.012041988512576748, + "grad_norm": 5.807336004435362, + "learning_rate": 8.021108179419526e-06, + "loss": 0.7493, + "step": 152 + }, + { + "epoch": 0.012121212121212121, + "grad_norm": 8.018066053700938, + "learning_rate": 8.073878627968339e-06, + "loss": 0.8071, + "step": 153 + }, + { + "epoch": 0.012200435729847494, + "grad_norm": 6.683925727218106, + "learning_rate": 8.126649076517152e-06, + "loss": 0.7496, + "step": 154 + }, + { + "epoch": 0.012279659338482869, + "grad_norm": 5.214523793254702, + "learning_rate": 8.179419525065963e-06, + "loss": 0.6752, + "step": 155 + }, + { + "epoch": 0.012358882947118241, + "grad_norm": 6.384856496160088, + "learning_rate": 8.232189973614776e-06, + "loss": 0.7791, + "step": 156 + }, + { + "epoch": 0.012438106555753614, + "grad_norm": 5.7506919577732365, + "learning_rate": 8.28496042216359e-06, + "loss": 0.6719, + "step": 157 + }, + { + "epoch": 0.012517330164388989, + "grad_norm": 6.451943905941833, + "learning_rate": 8.337730870712402e-06, + "loss": 0.8368, + "step": 158 + }, + { + "epoch": 0.012596553773024361, + "grad_norm": 6.353285853795154, + "learning_rate": 8.390501319261214e-06, + "loss": 0.8077, + "step": 159 + }, + { + "epoch": 0.012675777381659734, + "grad_norm": 6.3946120664157045, + "learning_rate": 8.443271767810027e-06, + "loss": 0.916, + "step": 160 + }, + { + "epoch": 0.012755000990295109, + "grad_norm": 5.531993314272616, + "learning_rate": 8.49604221635884e-06, + "loss": 0.8075, + "step": 161 + }, + { + "epoch": 0.012834224598930482, + "grad_norm": 5.750261961203504, + "learning_rate": 8.548812664907651e-06, + "loss": 0.786, + "step": 162 + }, + { + "epoch": 0.012913448207565854, + "grad_norm": 5.983737631006511, + "learning_rate": 8.601583113456466e-06, + "loss": 0.6516, + "step": 163 + }, + { + "epoch": 0.012992671816201227, + "grad_norm": 7.504346432602982, + "learning_rate": 8.654353562005277e-06, + "loss": 0.8069, + "step": 164 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 6.0464892561402594, + "learning_rate": 8.70712401055409e-06, + "loss": 0.8819, + "step": 165 + }, + { + "epoch": 0.013151119033471975, + "grad_norm": 5.83366448717492, + "learning_rate": 8.759894459102904e-06, + "loss": 0.6973, + "step": 166 + }, + { + "epoch": 0.013230342642107347, + "grad_norm": 6.4472556260765925, + "learning_rate": 8.812664907651715e-06, + "loss": 0.8768, + "step": 167 + }, + { + "epoch": 0.013309566250742722, + "grad_norm": 6.036974197208871, + "learning_rate": 8.865435356200528e-06, + "loss": 0.7416, + "step": 168 + }, + { + "epoch": 0.013388789859378095, + "grad_norm": 5.852387747977291, + "learning_rate": 8.918205804749341e-06, + "loss": 0.6665, + "step": 169 + }, + { + "epoch": 0.013468013468013467, + "grad_norm": 5.47985240394418, + "learning_rate": 8.970976253298154e-06, + "loss": 0.9516, + "step": 170 + }, + { + "epoch": 0.013547237076648842, + "grad_norm": 6.241247727690077, + "learning_rate": 9.023746701846966e-06, + "loss": 0.8888, + "step": 171 + }, + { + "epoch": 0.013626460685284215, + "grad_norm": 6.3090614251586885, + "learning_rate": 9.076517150395779e-06, + "loss": 0.8151, + "step": 172 + }, + { + "epoch": 0.013705684293919588, + "grad_norm": 6.8806061487648025, + "learning_rate": 9.129287598944592e-06, + "loss": 0.822, + "step": 173 + }, + { + "epoch": 0.013784907902554962, + "grad_norm": 5.836880430806096, + "learning_rate": 9.182058047493405e-06, + "loss": 0.7376, + "step": 174 + }, + { + "epoch": 0.013864131511190335, + "grad_norm": 5.7922355557454575, + "learning_rate": 9.234828496042218e-06, + "loss": 0.7222, + "step": 175 + }, + { + "epoch": 0.013943355119825708, + "grad_norm": 6.01423356467863, + "learning_rate": 9.28759894459103e-06, + "loss": 0.7947, + "step": 176 + }, + { + "epoch": 0.014022578728461082, + "grad_norm": 5.3625604233349184, + "learning_rate": 9.340369393139842e-06, + "loss": 0.7288, + "step": 177 + }, + { + "epoch": 0.014101802337096455, + "grad_norm": 5.8912869137431505, + "learning_rate": 9.393139841688655e-06, + "loss": 0.7991, + "step": 178 + }, + { + "epoch": 0.014181025945731828, + "grad_norm": 6.226436224894201, + "learning_rate": 9.445910290237469e-06, + "loss": 0.778, + "step": 179 + }, + { + "epoch": 0.0142602495543672, + "grad_norm": 6.126272177451517, + "learning_rate": 9.49868073878628e-06, + "loss": 0.6597, + "step": 180 + }, + { + "epoch": 0.014339473163002575, + "grad_norm": 6.214182615091143, + "learning_rate": 9.551451187335093e-06, + "loss": 0.8443, + "step": 181 + }, + { + "epoch": 0.014418696771637948, + "grad_norm": 5.978541650955973, + "learning_rate": 9.604221635883906e-06, + "loss": 0.7414, + "step": 182 + }, + { + "epoch": 0.01449792038027332, + "grad_norm": 5.6190877624160755, + "learning_rate": 9.656992084432717e-06, + "loss": 0.6482, + "step": 183 + }, + { + "epoch": 0.014577143988908695, + "grad_norm": 5.837583999021052, + "learning_rate": 9.709762532981532e-06, + "loss": 0.8733, + "step": 184 + }, + { + "epoch": 0.014656367597544068, + "grad_norm": 5.650537160006914, + "learning_rate": 9.762532981530344e-06, + "loss": 0.7878, + "step": 185 + }, + { + "epoch": 0.01473559120617944, + "grad_norm": 4.895363025247281, + "learning_rate": 9.815303430079157e-06, + "loss": 0.7125, + "step": 186 + }, + { + "epoch": 0.014814814814814815, + "grad_norm": 6.1635072979440295, + "learning_rate": 9.86807387862797e-06, + "loss": 0.804, + "step": 187 + }, + { + "epoch": 0.014894038423450188, + "grad_norm": 5.204900213880355, + "learning_rate": 9.920844327176781e-06, + "loss": 0.705, + "step": 188 + }, + { + "epoch": 0.014973262032085561, + "grad_norm": 5.7582758917110555, + "learning_rate": 9.973614775725594e-06, + "loss": 0.7431, + "step": 189 + }, + { + "epoch": 0.015052485640720936, + "grad_norm": 5.606321164149868, + "learning_rate": 1.0026385224274406e-05, + "loss": 0.8404, + "step": 190 + }, + { + "epoch": 0.015131709249356308, + "grad_norm": 5.408619447702593, + "learning_rate": 1.007915567282322e-05, + "loss": 0.7144, + "step": 191 + }, + { + "epoch": 0.015210932857991681, + "grad_norm": 5.8683474427102045, + "learning_rate": 1.0131926121372034e-05, + "loss": 0.6259, + "step": 192 + }, + { + "epoch": 0.015290156466627056, + "grad_norm": 6.423863810489736, + "learning_rate": 1.0184696569920845e-05, + "loss": 0.7656, + "step": 193 + }, + { + "epoch": 0.015369380075262428, + "grad_norm": 4.828903750610458, + "learning_rate": 1.0237467018469658e-05, + "loss": 0.6393, + "step": 194 + }, + { + "epoch": 0.015448603683897801, + "grad_norm": 4.680933132914672, + "learning_rate": 1.0290237467018471e-05, + "loss": 0.6767, + "step": 195 + }, + { + "epoch": 0.015527827292533174, + "grad_norm": 6.145591101536939, + "learning_rate": 1.0343007915567282e-05, + "loss": 0.7693, + "step": 196 + }, + { + "epoch": 0.015607050901168549, + "grad_norm": 5.635191054312746, + "learning_rate": 1.0395778364116096e-05, + "loss": 0.7552, + "step": 197 + }, + { + "epoch": 0.01568627450980392, + "grad_norm": 5.303888806565517, + "learning_rate": 1.0448548812664909e-05, + "loss": 0.7241, + "step": 198 + }, + { + "epoch": 0.015765498118439296, + "grad_norm": 5.6327296377707485, + "learning_rate": 1.050131926121372e-05, + "loss": 0.7321, + "step": 199 + }, + { + "epoch": 0.015844721727074667, + "grad_norm": 5.7751096911113535, + "learning_rate": 1.0554089709762533e-05, + "loss": 0.6401, + "step": 200 + }, + { + "epoch": 0.01592394533571004, + "grad_norm": 5.080070634374656, + "learning_rate": 1.0606860158311348e-05, + "loss": 0.6558, + "step": 201 + }, + { + "epoch": 0.016003168944345416, + "grad_norm": 5.354288260848981, + "learning_rate": 1.065963060686016e-05, + "loss": 0.7592, + "step": 202 + }, + { + "epoch": 0.016082392552980787, + "grad_norm": 5.9386907334086665, + "learning_rate": 1.0712401055408972e-05, + "loss": 0.8064, + "step": 203 + }, + { + "epoch": 0.01616161616161616, + "grad_norm": 5.292484411072734, + "learning_rate": 1.0765171503957785e-05, + "loss": 0.6699, + "step": 204 + }, + { + "epoch": 0.016240839770251536, + "grad_norm": 5.665278837914741, + "learning_rate": 1.0817941952506597e-05, + "loss": 0.7521, + "step": 205 + }, + { + "epoch": 0.016320063378886907, + "grad_norm": 5.740857851043832, + "learning_rate": 1.087071240105541e-05, + "loss": 0.7137, + "step": 206 + }, + { + "epoch": 0.01639928698752228, + "grad_norm": 6.0454658670924335, + "learning_rate": 1.0923482849604223e-05, + "loss": 0.7445, + "step": 207 + }, + { + "epoch": 0.016478510596157656, + "grad_norm": 5.23838587009887, + "learning_rate": 1.0976253298153034e-05, + "loss": 0.7024, + "step": 208 + }, + { + "epoch": 0.016557734204793027, + "grad_norm": 6.205514996867051, + "learning_rate": 1.1029023746701847e-05, + "loss": 0.8582, + "step": 209 + }, + { + "epoch": 0.016636957813428402, + "grad_norm": 5.309346411741108, + "learning_rate": 1.1081794195250662e-05, + "loss": 0.7522, + "step": 210 + }, + { + "epoch": 0.016716181422063776, + "grad_norm": 5.5375116733389325, + "learning_rate": 1.1134564643799472e-05, + "loss": 0.6855, + "step": 211 + }, + { + "epoch": 0.016795405030699147, + "grad_norm": 5.578528155883679, + "learning_rate": 1.1187335092348287e-05, + "loss": 0.6859, + "step": 212 + }, + { + "epoch": 0.016874628639334522, + "grad_norm": 5.382253655791046, + "learning_rate": 1.12401055408971e-05, + "loss": 0.7339, + "step": 213 + }, + { + "epoch": 0.016953852247969897, + "grad_norm": 5.528062539724129, + "learning_rate": 1.1292875989445911e-05, + "loss": 0.8214, + "step": 214 + }, + { + "epoch": 0.017033075856605268, + "grad_norm": 5.859717168728105, + "learning_rate": 1.1345646437994724e-05, + "loss": 0.7755, + "step": 215 + }, + { + "epoch": 0.017112299465240642, + "grad_norm": 5.1296682886393015, + "learning_rate": 1.1398416886543537e-05, + "loss": 0.8086, + "step": 216 + }, + { + "epoch": 0.017191523073876017, + "grad_norm": 5.792792407400176, + "learning_rate": 1.1451187335092349e-05, + "loss": 0.6665, + "step": 217 + }, + { + "epoch": 0.017270746682511388, + "grad_norm": 5.426129151374314, + "learning_rate": 1.1503957783641162e-05, + "loss": 0.6077, + "step": 218 + }, + { + "epoch": 0.017349970291146762, + "grad_norm": 6.446499513139741, + "learning_rate": 1.1556728232189975e-05, + "loss": 0.6683, + "step": 219 + }, + { + "epoch": 0.017429193899782137, + "grad_norm": 5.257821679228257, + "learning_rate": 1.1609498680738786e-05, + "loss": 0.7431, + "step": 220 + }, + { + "epoch": 0.017508417508417508, + "grad_norm": 6.0486630424704, + "learning_rate": 1.16622691292876e-05, + "loss": 0.8161, + "step": 221 + }, + { + "epoch": 0.017587641117052882, + "grad_norm": 5.391818266826954, + "learning_rate": 1.1715039577836414e-05, + "loss": 0.8968, + "step": 222 + }, + { + "epoch": 0.017666864725688253, + "grad_norm": 5.214277082229823, + "learning_rate": 1.1767810026385225e-05, + "loss": 0.8127, + "step": 223 + }, + { + "epoch": 0.017746088334323628, + "grad_norm": 5.947812857415303, + "learning_rate": 1.1820580474934039e-05, + "loss": 0.7495, + "step": 224 + }, + { + "epoch": 0.017825311942959002, + "grad_norm": 6.530761964307894, + "learning_rate": 1.1873350923482852e-05, + "loss": 0.6612, + "step": 225 + }, + { + "epoch": 0.017904535551594374, + "grad_norm": 6.345427295525233, + "learning_rate": 1.1926121372031663e-05, + "loss": 0.6558, + "step": 226 + }, + { + "epoch": 0.017983759160229748, + "grad_norm": 5.698031713831435, + "learning_rate": 1.1978891820580476e-05, + "loss": 0.7203, + "step": 227 + }, + { + "epoch": 0.018062982768865123, + "grad_norm": 5.12522619447126, + "learning_rate": 1.203166226912929e-05, + "loss": 0.7576, + "step": 228 + }, + { + "epoch": 0.018142206377500494, + "grad_norm": 5.808288250755425, + "learning_rate": 1.20844327176781e-05, + "loss": 0.8105, + "step": 229 + }, + { + "epoch": 0.018221429986135868, + "grad_norm": 4.9300203075498805, + "learning_rate": 1.2137203166226914e-05, + "loss": 0.7983, + "step": 230 + }, + { + "epoch": 0.018300653594771243, + "grad_norm": 4.697187469691393, + "learning_rate": 1.2189973614775727e-05, + "loss": 0.7084, + "step": 231 + }, + { + "epoch": 0.018379877203406614, + "grad_norm": 5.001773945258765, + "learning_rate": 1.2242744063324538e-05, + "loss": 0.7223, + "step": 232 + }, + { + "epoch": 0.01845910081204199, + "grad_norm": 7.1355893597252855, + "learning_rate": 1.2295514511873353e-05, + "loss": 0.8413, + "step": 233 + }, + { + "epoch": 0.018538324420677363, + "grad_norm": 6.348833763192268, + "learning_rate": 1.2348284960422166e-05, + "loss": 0.6731, + "step": 234 + }, + { + "epoch": 0.018617548029312734, + "grad_norm": 4.946204827143474, + "learning_rate": 1.2401055408970977e-05, + "loss": 0.6908, + "step": 235 + }, + { + "epoch": 0.01869677163794811, + "grad_norm": 5.493686807606341, + "learning_rate": 1.245382585751979e-05, + "loss": 0.7671, + "step": 236 + }, + { + "epoch": 0.018775995246583483, + "grad_norm": 5.673914737037365, + "learning_rate": 1.2506596306068604e-05, + "loss": 0.7087, + "step": 237 + }, + { + "epoch": 0.018855218855218854, + "grad_norm": 4.83293534885964, + "learning_rate": 1.2559366754617415e-05, + "loss": 0.7008, + "step": 238 + }, + { + "epoch": 0.01893444246385423, + "grad_norm": 4.668620821999954, + "learning_rate": 1.2612137203166228e-05, + "loss": 0.7444, + "step": 239 + }, + { + "epoch": 0.019013666072489603, + "grad_norm": 5.320602701600323, + "learning_rate": 1.2664907651715041e-05, + "loss": 0.7076, + "step": 240 + }, + { + "epoch": 0.019092889681124974, + "grad_norm": 5.216235256752975, + "learning_rate": 1.2717678100263852e-05, + "loss": 0.669, + "step": 241 + }, + { + "epoch": 0.01917211328976035, + "grad_norm": 8.852938876339596, + "learning_rate": 1.2770448548812666e-05, + "loss": 0.7982, + "step": 242 + }, + { + "epoch": 0.019251336898395723, + "grad_norm": 7.639310926487093, + "learning_rate": 1.282321899736148e-05, + "loss": 0.645, + "step": 243 + }, + { + "epoch": 0.019330560507031094, + "grad_norm": 5.191896526211173, + "learning_rate": 1.287598944591029e-05, + "loss": 0.6148, + "step": 244 + }, + { + "epoch": 0.01940978411566647, + "grad_norm": 6.010869015407416, + "learning_rate": 1.2928759894459105e-05, + "loss": 0.8221, + "step": 245 + }, + { + "epoch": 0.019489007724301843, + "grad_norm": 6.462023902750167, + "learning_rate": 1.2981530343007918e-05, + "loss": 0.8216, + "step": 246 + }, + { + "epoch": 0.019568231332937214, + "grad_norm": 6.6283590992549595, + "learning_rate": 1.303430079155673e-05, + "loss": 0.8258, + "step": 247 + }, + { + "epoch": 0.01964745494157259, + "grad_norm": 5.2723616814518275, + "learning_rate": 1.3087071240105542e-05, + "loss": 0.8107, + "step": 248 + }, + { + "epoch": 0.019726678550207963, + "grad_norm": 5.489791824763702, + "learning_rate": 1.3139841688654355e-05, + "loss": 0.7367, + "step": 249 + }, + { + "epoch": 0.019805902158843335, + "grad_norm": 5.400167981733446, + "learning_rate": 1.3192612137203167e-05, + "loss": 0.7817, + "step": 250 + }, + { + "epoch": 0.01988512576747871, + "grad_norm": 4.944767634852149, + "learning_rate": 1.324538258575198e-05, + "loss": 0.6969, + "step": 251 + }, + { + "epoch": 0.019964349376114084, + "grad_norm": 6.164358046513915, + "learning_rate": 1.3298153034300793e-05, + "loss": 0.7408, + "step": 252 + }, + { + "epoch": 0.020043572984749455, + "grad_norm": 4.713538395438616, + "learning_rate": 1.3350923482849604e-05, + "loss": 0.7404, + "step": 253 + }, + { + "epoch": 0.02012279659338483, + "grad_norm": 5.476222960909804, + "learning_rate": 1.3403693931398417e-05, + "loss": 0.7923, + "step": 254 + }, + { + "epoch": 0.020202020202020204, + "grad_norm": 6.439441203627157, + "learning_rate": 1.3456464379947232e-05, + "loss": 0.6825, + "step": 255 + }, + { + "epoch": 0.020281243810655575, + "grad_norm": 5.172281142180801, + "learning_rate": 1.3509234828496044e-05, + "loss": 0.8169, + "step": 256 + }, + { + "epoch": 0.02036046741929095, + "grad_norm": 4.533213017046608, + "learning_rate": 1.3562005277044857e-05, + "loss": 0.694, + "step": 257 + }, + { + "epoch": 0.02043969102792632, + "grad_norm": 6.029156818830929, + "learning_rate": 1.361477572559367e-05, + "loss": 0.8392, + "step": 258 + }, + { + "epoch": 0.020518914636561695, + "grad_norm": 5.227043666231582, + "learning_rate": 1.3667546174142481e-05, + "loss": 0.7026, + "step": 259 + }, + { + "epoch": 0.02059813824519707, + "grad_norm": 5.037218463230358, + "learning_rate": 1.3720316622691294e-05, + "loss": 0.8244, + "step": 260 + }, + { + "epoch": 0.02067736185383244, + "grad_norm": 5.175478194531059, + "learning_rate": 1.3773087071240107e-05, + "loss": 0.8124, + "step": 261 + }, + { + "epoch": 0.020756585462467815, + "grad_norm": 5.21225774868909, + "learning_rate": 1.3825857519788919e-05, + "loss": 0.6844, + "step": 262 + }, + { + "epoch": 0.02083580907110319, + "grad_norm": 4.344408122167347, + "learning_rate": 1.3878627968337732e-05, + "loss": 0.7178, + "step": 263 + }, + { + "epoch": 0.02091503267973856, + "grad_norm": 5.772588454479037, + "learning_rate": 1.3931398416886547e-05, + "loss": 0.9369, + "step": 264 + }, + { + "epoch": 0.020994256288373935, + "grad_norm": 4.860320800887344, + "learning_rate": 1.3984168865435356e-05, + "loss": 0.7445, + "step": 265 + }, + { + "epoch": 0.02107347989700931, + "grad_norm": 4.926487338802397, + "learning_rate": 1.4036939313984171e-05, + "loss": 0.6876, + "step": 266 + }, + { + "epoch": 0.02115270350564468, + "grad_norm": 4.685217534157765, + "learning_rate": 1.4089709762532984e-05, + "loss": 0.7607, + "step": 267 + }, + { + "epoch": 0.021231927114280055, + "grad_norm": 5.7889691146255995, + "learning_rate": 1.4142480211081795e-05, + "loss": 0.8241, + "step": 268 + }, + { + "epoch": 0.02131115072291543, + "grad_norm": 5.173504532588536, + "learning_rate": 1.4195250659630609e-05, + "loss": 0.7805, + "step": 269 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 5.062020683191384, + "learning_rate": 1.4248021108179422e-05, + "loss": 0.7023, + "step": 270 + }, + { + "epoch": 0.021469597940186175, + "grad_norm": 5.125795245863302, + "learning_rate": 1.4300791556728233e-05, + "loss": 0.7066, + "step": 271 + }, + { + "epoch": 0.02154882154882155, + "grad_norm": 5.221424203324531, + "learning_rate": 1.4353562005277046e-05, + "loss": 0.6957, + "step": 272 + }, + { + "epoch": 0.02162804515745692, + "grad_norm": 4.751600484591223, + "learning_rate": 1.440633245382586e-05, + "loss": 0.725, + "step": 273 + }, + { + "epoch": 0.021707268766092296, + "grad_norm": 5.704248185447199, + "learning_rate": 1.445910290237467e-05, + "loss": 0.7975, + "step": 274 + }, + { + "epoch": 0.02178649237472767, + "grad_norm": 5.283878229259629, + "learning_rate": 1.4511873350923484e-05, + "loss": 0.7827, + "step": 275 + }, + { + "epoch": 0.02186571598336304, + "grad_norm": 5.561430305745563, + "learning_rate": 1.4564643799472298e-05, + "loss": 0.8687, + "step": 276 + }, + { + "epoch": 0.021944939591998416, + "grad_norm": 4.953562752325849, + "learning_rate": 1.461741424802111e-05, + "loss": 0.6568, + "step": 277 + }, + { + "epoch": 0.02202416320063379, + "grad_norm": 4.989031450636796, + "learning_rate": 1.4670184696569923e-05, + "loss": 0.6796, + "step": 278 + }, + { + "epoch": 0.02210338680926916, + "grad_norm": 5.0814915981541215, + "learning_rate": 1.4722955145118736e-05, + "loss": 0.6442, + "step": 279 + }, + { + "epoch": 0.022182610417904536, + "grad_norm": 5.440612218933665, + "learning_rate": 1.4775725593667547e-05, + "loss": 0.8033, + "step": 280 + }, + { + "epoch": 0.02226183402653991, + "grad_norm": 4.7036030181612265, + "learning_rate": 1.482849604221636e-05, + "loss": 0.7499, + "step": 281 + }, + { + "epoch": 0.02234105763517528, + "grad_norm": 5.563795040266641, + "learning_rate": 1.4881266490765173e-05, + "loss": 0.7586, + "step": 282 + }, + { + "epoch": 0.022420281243810656, + "grad_norm": 5.571378672857436, + "learning_rate": 1.4934036939313985e-05, + "loss": 0.751, + "step": 283 + }, + { + "epoch": 0.02249950485244603, + "grad_norm": 4.5765927786250264, + "learning_rate": 1.4986807387862798e-05, + "loss": 0.6686, + "step": 284 + }, + { + "epoch": 0.0225787284610814, + "grad_norm": 4.520184821154518, + "learning_rate": 1.503957783641161e-05, + "loss": 0.6698, + "step": 285 + }, + { + "epoch": 0.022657952069716776, + "grad_norm": 4.719191921605493, + "learning_rate": 1.5092348284960422e-05, + "loss": 0.7684, + "step": 286 + }, + { + "epoch": 0.02273717567835215, + "grad_norm": 5.282904894309323, + "learning_rate": 1.5145118733509237e-05, + "loss": 0.7937, + "step": 287 + }, + { + "epoch": 0.02281639928698752, + "grad_norm": 4.85916128437978, + "learning_rate": 1.5197889182058047e-05, + "loss": 0.7639, + "step": 288 + }, + { + "epoch": 0.022895622895622896, + "grad_norm": 5.087385286910722, + "learning_rate": 1.5250659630606862e-05, + "loss": 0.7772, + "step": 289 + }, + { + "epoch": 0.022974846504258267, + "grad_norm": 5.385201890840065, + "learning_rate": 1.5303430079155675e-05, + "loss": 0.7612, + "step": 290 + }, + { + "epoch": 0.023054070112893642, + "grad_norm": 4.7618693908420875, + "learning_rate": 1.5356200527704484e-05, + "loss": 0.7228, + "step": 291 + }, + { + "epoch": 0.023133293721529016, + "grad_norm": 4.656091577931212, + "learning_rate": 1.5408970976253298e-05, + "loss": 0.7909, + "step": 292 + }, + { + "epoch": 0.023212517330164387, + "grad_norm": 5.0321567242044365, + "learning_rate": 1.5461741424802114e-05, + "loss": 0.7067, + "step": 293 + }, + { + "epoch": 0.023291740938799762, + "grad_norm": 4.77287153420482, + "learning_rate": 1.5514511873350924e-05, + "loss": 0.6899, + "step": 294 + }, + { + "epoch": 0.023370964547435136, + "grad_norm": 4.558788078989551, + "learning_rate": 1.5567282321899737e-05, + "loss": 0.7837, + "step": 295 + }, + { + "epoch": 0.023450188156070507, + "grad_norm": 5.216472651111241, + "learning_rate": 1.562005277044855e-05, + "loss": 0.6787, + "step": 296 + }, + { + "epoch": 0.023529411764705882, + "grad_norm": 5.210615417423931, + "learning_rate": 1.5672823218997363e-05, + "loss": 0.7199, + "step": 297 + }, + { + "epoch": 0.023608635373341257, + "grad_norm": 5.6984947886017965, + "learning_rate": 1.5725593667546176e-05, + "loss": 0.7916, + "step": 298 + }, + { + "epoch": 0.023687858981976628, + "grad_norm": 5.532960252900681, + "learning_rate": 1.577836411609499e-05, + "loss": 0.8564, + "step": 299 + }, + { + "epoch": 0.023767082590612002, + "grad_norm": 6.697260878045465, + "learning_rate": 1.58311345646438e-05, + "loss": 0.7996, + "step": 300 + }, + { + "epoch": 0.023846306199247377, + "grad_norm": 4.432578004969917, + "learning_rate": 1.5883905013192612e-05, + "loss": 0.6621, + "step": 301 + }, + { + "epoch": 0.023925529807882748, + "grad_norm": 5.53714351619017, + "learning_rate": 1.5936675461741425e-05, + "loss": 0.6781, + "step": 302 + }, + { + "epoch": 0.024004753416518122, + "grad_norm": 4.485178636736496, + "learning_rate": 1.5989445910290238e-05, + "loss": 0.6729, + "step": 303 + }, + { + "epoch": 0.024083977025153497, + "grad_norm": 4.8765460394107185, + "learning_rate": 1.604221635883905e-05, + "loss": 0.6071, + "step": 304 + }, + { + "epoch": 0.024163200633788868, + "grad_norm": 5.1872808489810005, + "learning_rate": 1.6094986807387864e-05, + "loss": 0.7562, + "step": 305 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 4.750901671483955, + "learning_rate": 1.6147757255936677e-05, + "loss": 0.6922, + "step": 306 + }, + { + "epoch": 0.024321647851059617, + "grad_norm": 4.58393405278496, + "learning_rate": 1.620052770448549e-05, + "loss": 0.7805, + "step": 307 + }, + { + "epoch": 0.024400871459694988, + "grad_norm": 4.631535325277142, + "learning_rate": 1.6253298153034303e-05, + "loss": 0.7336, + "step": 308 + }, + { + "epoch": 0.024480095068330363, + "grad_norm": 5.669850117188683, + "learning_rate": 1.6306068601583113e-05, + "loss": 0.5998, + "step": 309 + }, + { + "epoch": 0.024559318676965737, + "grad_norm": 4.712742532199791, + "learning_rate": 1.6358839050131926e-05, + "loss": 0.7182, + "step": 310 + }, + { + "epoch": 0.024638542285601108, + "grad_norm": 4.878669935714314, + "learning_rate": 1.641160949868074e-05, + "loss": 0.7017, + "step": 311 + }, + { + "epoch": 0.024717765894236483, + "grad_norm": 5.469294720818251, + "learning_rate": 1.6464379947229552e-05, + "loss": 0.686, + "step": 312 + }, + { + "epoch": 0.024796989502871857, + "grad_norm": 5.505596073484324, + "learning_rate": 1.6517150395778365e-05, + "loss": 0.9775, + "step": 313 + }, + { + "epoch": 0.024876213111507228, + "grad_norm": 5.606387481204577, + "learning_rate": 1.656992084432718e-05, + "loss": 0.7261, + "step": 314 + }, + { + "epoch": 0.024955436720142603, + "grad_norm": 4.464959691277312, + "learning_rate": 1.6622691292875988e-05, + "loss": 0.6096, + "step": 315 + }, + { + "epoch": 0.025034660328777977, + "grad_norm": 4.898467999914199, + "learning_rate": 1.6675461741424805e-05, + "loss": 0.8412, + "step": 316 + }, + { + "epoch": 0.02511388393741335, + "grad_norm": 4.872228087709652, + "learning_rate": 1.6728232189973618e-05, + "loss": 0.6006, + "step": 317 + }, + { + "epoch": 0.025193107546048723, + "grad_norm": 6.173465907759124, + "learning_rate": 1.6781002638522427e-05, + "loss": 0.6591, + "step": 318 + }, + { + "epoch": 0.025272331154684097, + "grad_norm": 4.904345804183026, + "learning_rate": 1.683377308707124e-05, + "loss": 0.646, + "step": 319 + }, + { + "epoch": 0.02535155476331947, + "grad_norm": 5.586871756586915, + "learning_rate": 1.6886543535620054e-05, + "loss": 0.5606, + "step": 320 + }, + { + "epoch": 0.025430778371954843, + "grad_norm": 4.6174459074797, + "learning_rate": 1.6939313984168867e-05, + "loss": 0.7099, + "step": 321 + }, + { + "epoch": 0.025510001980590218, + "grad_norm": 5.235131703024437, + "learning_rate": 1.699208443271768e-05, + "loss": 0.7065, + "step": 322 + }, + { + "epoch": 0.02558922558922559, + "grad_norm": 4.826210405176726, + "learning_rate": 1.7044854881266493e-05, + "loss": 0.8279, + "step": 323 + }, + { + "epoch": 0.025668449197860963, + "grad_norm": 4.54598953496401, + "learning_rate": 1.7097625329815303e-05, + "loss": 0.6764, + "step": 324 + }, + { + "epoch": 0.025747672806496334, + "grad_norm": 4.424977462814265, + "learning_rate": 1.7150395778364116e-05, + "loss": 0.604, + "step": 325 + }, + { + "epoch": 0.02582689641513171, + "grad_norm": 4.454436082874082, + "learning_rate": 1.7203166226912932e-05, + "loss": 0.7477, + "step": 326 + }, + { + "epoch": 0.025906120023767083, + "grad_norm": 4.933371834743646, + "learning_rate": 1.7255936675461742e-05, + "loss": 0.8529, + "step": 327 + }, + { + "epoch": 0.025985343632402454, + "grad_norm": 5.015914473675524, + "learning_rate": 1.7308707124010555e-05, + "loss": 0.7221, + "step": 328 + }, + { + "epoch": 0.02606456724103783, + "grad_norm": 4.883274382223023, + "learning_rate": 1.7361477572559368e-05, + "loss": 0.7482, + "step": 329 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 6.222847344775064, + "learning_rate": 1.741424802110818e-05, + "loss": 0.847, + "step": 330 + }, + { + "epoch": 0.026223014458308574, + "grad_norm": 5.122229201964031, + "learning_rate": 1.7467018469656994e-05, + "loss": 0.7503, + "step": 331 + }, + { + "epoch": 0.02630223806694395, + "grad_norm": 4.884576991437588, + "learning_rate": 1.7519788918205807e-05, + "loss": 0.698, + "step": 332 + }, + { + "epoch": 0.026381461675579324, + "grad_norm": 4.360395504080726, + "learning_rate": 1.7572559366754617e-05, + "loss": 0.7492, + "step": 333 + }, + { + "epoch": 0.026460685284214695, + "grad_norm": 4.20751253678441, + "learning_rate": 1.762532981530343e-05, + "loss": 0.6072, + "step": 334 + }, + { + "epoch": 0.02653990889285007, + "grad_norm": 4.0010841153087044, + "learning_rate": 1.7678100263852246e-05, + "loss": 0.6294, + "step": 335 + }, + { + "epoch": 0.026619132501485444, + "grad_norm": 5.0942419602954025, + "learning_rate": 1.7730870712401056e-05, + "loss": 0.7124, + "step": 336 + }, + { + "epoch": 0.026698356110120815, + "grad_norm": 4.9025049718820926, + "learning_rate": 1.778364116094987e-05, + "loss": 0.7486, + "step": 337 + }, + { + "epoch": 0.02677757971875619, + "grad_norm": 4.216820203802327, + "learning_rate": 1.7836411609498682e-05, + "loss": 0.6624, + "step": 338 + }, + { + "epoch": 0.026856803327391564, + "grad_norm": 4.873402113608072, + "learning_rate": 1.7889182058047495e-05, + "loss": 0.6233, + "step": 339 + }, + { + "epoch": 0.026936026936026935, + "grad_norm": 5.207898415655085, + "learning_rate": 1.794195250659631e-05, + "loss": 0.8605, + "step": 340 + }, + { + "epoch": 0.02701525054466231, + "grad_norm": 4.279273943695959, + "learning_rate": 1.799472295514512e-05, + "loss": 0.8051, + "step": 341 + }, + { + "epoch": 0.027094474153297684, + "grad_norm": 4.889154719893198, + "learning_rate": 1.804749340369393e-05, + "loss": 0.8153, + "step": 342 + }, + { + "epoch": 0.027173697761933055, + "grad_norm": 4.304712492086262, + "learning_rate": 1.8100263852242744e-05, + "loss": 0.5527, + "step": 343 + }, + { + "epoch": 0.02725292137056843, + "grad_norm": 4.757940775300428, + "learning_rate": 1.8153034300791557e-05, + "loss": 0.7414, + "step": 344 + }, + { + "epoch": 0.027332144979203804, + "grad_norm": 4.745641576583929, + "learning_rate": 1.820580474934037e-05, + "loss": 0.7018, + "step": 345 + }, + { + "epoch": 0.027411368587839175, + "grad_norm": 5.606855238901125, + "learning_rate": 1.8258575197889184e-05, + "loss": 0.711, + "step": 346 + }, + { + "epoch": 0.02749059219647455, + "grad_norm": 4.589455511019485, + "learning_rate": 1.8311345646437997e-05, + "loss": 0.6657, + "step": 347 + }, + { + "epoch": 0.027569815805109924, + "grad_norm": 4.380556690595863, + "learning_rate": 1.836411609498681e-05, + "loss": 0.6207, + "step": 348 + }, + { + "epoch": 0.027649039413745295, + "grad_norm": 5.464281507136505, + "learning_rate": 1.8416886543535623e-05, + "loss": 0.7978, + "step": 349 + }, + { + "epoch": 0.02772826302238067, + "grad_norm": 4.455830264641891, + "learning_rate": 1.8469656992084436e-05, + "loss": 0.647, + "step": 350 + }, + { + "epoch": 0.027807486631016044, + "grad_norm": 4.415427815113544, + "learning_rate": 1.8522427440633246e-05, + "loss": 0.7126, + "step": 351 + }, + { + "epoch": 0.027886710239651415, + "grad_norm": 4.423598563001037, + "learning_rate": 1.857519788918206e-05, + "loss": 0.7051, + "step": 352 + }, + { + "epoch": 0.02796593384828679, + "grad_norm": 3.8569744021636385, + "learning_rate": 1.8627968337730872e-05, + "loss": 0.6308, + "step": 353 + }, + { + "epoch": 0.028045157456922164, + "grad_norm": 4.415411292606307, + "learning_rate": 1.8680738786279685e-05, + "loss": 0.5519, + "step": 354 + }, + { + "epoch": 0.028124381065557535, + "grad_norm": 5.0422917734338135, + "learning_rate": 1.8733509234828498e-05, + "loss": 0.7172, + "step": 355 + }, + { + "epoch": 0.02820360467419291, + "grad_norm": 4.002764445493207, + "learning_rate": 1.878627968337731e-05, + "loss": 0.7165, + "step": 356 + }, + { + "epoch": 0.028282828282828285, + "grad_norm": 4.599337876833972, + "learning_rate": 1.883905013192612e-05, + "loss": 0.7968, + "step": 357 + }, + { + "epoch": 0.028362051891463656, + "grad_norm": 5.033776323410691, + "learning_rate": 1.8891820580474937e-05, + "loss": 0.7958, + "step": 358 + }, + { + "epoch": 0.02844127550009903, + "grad_norm": 4.738923984100409, + "learning_rate": 1.894459102902375e-05, + "loss": 0.7353, + "step": 359 + }, + { + "epoch": 0.0285204991087344, + "grad_norm": 4.246229627710574, + "learning_rate": 1.899736147757256e-05, + "loss": 0.7837, + "step": 360 + }, + { + "epoch": 0.028599722717369776, + "grad_norm": 5.233523278268373, + "learning_rate": 1.9050131926121373e-05, + "loss": 0.7994, + "step": 361 + }, + { + "epoch": 0.02867894632600515, + "grad_norm": 5.000247119050768, + "learning_rate": 1.9102902374670186e-05, + "loss": 0.8393, + "step": 362 + }, + { + "epoch": 0.02875816993464052, + "grad_norm": 4.990627757505972, + "learning_rate": 1.9155672823219e-05, + "loss": 0.683, + "step": 363 + }, + { + "epoch": 0.028837393543275896, + "grad_norm": 3.5378068556875655, + "learning_rate": 1.9208443271767812e-05, + "loss": 0.698, + "step": 364 + }, + { + "epoch": 0.02891661715191127, + "grad_norm": 4.707473097246092, + "learning_rate": 1.9261213720316625e-05, + "loss": 0.7491, + "step": 365 + }, + { + "epoch": 0.02899584076054664, + "grad_norm": 4.264970516367911, + "learning_rate": 1.9313984168865435e-05, + "loss": 0.6915, + "step": 366 + }, + { + "epoch": 0.029075064369182016, + "grad_norm": 4.745594773515386, + "learning_rate": 1.9366754617414248e-05, + "loss": 0.6245, + "step": 367 + }, + { + "epoch": 0.02915428797781739, + "grad_norm": 4.218060854608493, + "learning_rate": 1.9419525065963065e-05, + "loss": 0.7101, + "step": 368 + }, + { + "epoch": 0.02923351158645276, + "grad_norm": 4.422163537716093, + "learning_rate": 1.9472295514511874e-05, + "loss": 0.6612, + "step": 369 + }, + { + "epoch": 0.029312735195088136, + "grad_norm": 5.059970177902883, + "learning_rate": 1.9525065963060687e-05, + "loss": 0.6844, + "step": 370 + }, + { + "epoch": 0.02939195880372351, + "grad_norm": 5.425596923779999, + "learning_rate": 1.95778364116095e-05, + "loss": 0.7581, + "step": 371 + }, + { + "epoch": 0.02947118241235888, + "grad_norm": 4.448599851923129, + "learning_rate": 1.9630606860158313e-05, + "loss": 0.5906, + "step": 372 + }, + { + "epoch": 0.029550406020994256, + "grad_norm": 5.867014061178323, + "learning_rate": 1.9683377308707127e-05, + "loss": 0.777, + "step": 373 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 4.633622660915606, + "learning_rate": 1.973614775725594e-05, + "loss": 0.7659, + "step": 374 + }, + { + "epoch": 0.029708853238265002, + "grad_norm": 5.2984904666256005, + "learning_rate": 1.978891820580475e-05, + "loss": 0.7074, + "step": 375 + }, + { + "epoch": 0.029788076846900376, + "grad_norm": 4.861656259919689, + "learning_rate": 1.9841688654353562e-05, + "loss": 0.6616, + "step": 376 + }, + { + "epoch": 0.02986730045553575, + "grad_norm": 5.545022792268187, + "learning_rate": 1.9894459102902375e-05, + "loss": 0.6232, + "step": 377 + }, + { + "epoch": 0.029946524064171122, + "grad_norm": 4.911902408072508, + "learning_rate": 1.994722955145119e-05, + "loss": 0.8349, + "step": 378 + }, + { + "epoch": 0.030025747672806496, + "grad_norm": 4.712000002428415, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 379 + }, + { + "epoch": 0.03010497128144187, + "grad_norm": 5.0463324227267154, + "learning_rate": 1.999999967077406e-05, + "loss": 0.753, + "step": 380 + }, + { + "epoch": 0.030184194890077242, + "grad_norm": 4.467950933977687, + "learning_rate": 1.9999998683096255e-05, + "loss": 0.6661, + "step": 381 + }, + { + "epoch": 0.030263418498712617, + "grad_norm": 5.685728093385676, + "learning_rate": 1.999999703696666e-05, + "loss": 0.826, + "step": 382 + }, + { + "epoch": 0.03034264210734799, + "grad_norm": 4.0431594610378285, + "learning_rate": 1.999999473238537e-05, + "loss": 0.659, + "step": 383 + }, + { + "epoch": 0.030421865715983362, + "grad_norm": 4.2065187710344665, + "learning_rate": 1.9999991769352545e-05, + "loss": 0.716, + "step": 384 + }, + { + "epoch": 0.030501089324618737, + "grad_norm": 4.699634931223346, + "learning_rate": 1.9999988147868384e-05, + "loss": 0.7331, + "step": 385 + }, + { + "epoch": 0.03058031293325411, + "grad_norm": 3.830676038932044, + "learning_rate": 1.9999983867933114e-05, + "loss": 0.6457, + "step": 386 + }, + { + "epoch": 0.030659536541889482, + "grad_norm": 4.187993752123971, + "learning_rate": 1.999997892954703e-05, + "loss": 0.6993, + "step": 387 + }, + { + "epoch": 0.030738760150524857, + "grad_norm": 4.842800869373137, + "learning_rate": 1.9999973332710443e-05, + "loss": 0.7597, + "step": 388 + }, + { + "epoch": 0.03081798375916023, + "grad_norm": 4.729110283875682, + "learning_rate": 1.9999967077423732e-05, + "loss": 0.5969, + "step": 389 + }, + { + "epoch": 0.030897207367795602, + "grad_norm": 5.142680374559456, + "learning_rate": 1.9999960163687307e-05, + "loss": 0.6302, + "step": 390 + }, + { + "epoch": 0.030976430976430977, + "grad_norm": 4.864329373176803, + "learning_rate": 1.999995259150162e-05, + "loss": 0.6236, + "step": 391 + }, + { + "epoch": 0.031055654585066348, + "grad_norm": 4.1234032843182655, + "learning_rate": 1.999994436086717e-05, + "loss": 0.6444, + "step": 392 + }, + { + "epoch": 0.031134878193701723, + "grad_norm": 4.281847243114499, + "learning_rate": 1.9999935471784508e-05, + "loss": 0.7837, + "step": 393 + }, + { + "epoch": 0.031214101802337097, + "grad_norm": 4.213055829765473, + "learning_rate": 1.9999925924254203e-05, + "loss": 0.5674, + "step": 394 + }, + { + "epoch": 0.03129332541097247, + "grad_norm": 4.714674138266144, + "learning_rate": 1.9999915718276898e-05, + "loss": 0.7518, + "step": 395 + }, + { + "epoch": 0.03137254901960784, + "grad_norm": 3.7935243587876877, + "learning_rate": 1.9999904853853256e-05, + "loss": 0.649, + "step": 396 + }, + { + "epoch": 0.03145177262824322, + "grad_norm": 4.260232514025721, + "learning_rate": 1.9999893330983998e-05, + "loss": 0.6109, + "step": 397 + }, + { + "epoch": 0.03153099623687859, + "grad_norm": 4.537641317809541, + "learning_rate": 1.999988114966988e-05, + "loss": 0.7417, + "step": 398 + }, + { + "epoch": 0.031610219845513966, + "grad_norm": 4.701561160728191, + "learning_rate": 1.9999868309911704e-05, + "loss": 0.7249, + "step": 399 + }, + { + "epoch": 0.031689443454149334, + "grad_norm": 4.22285593155482, + "learning_rate": 1.9999854811710317e-05, + "loss": 0.7386, + "step": 400 + }, + { + "epoch": 0.03176866706278471, + "grad_norm": 4.276842833278445, + "learning_rate": 1.9999840655066608e-05, + "loss": 0.7625, + "step": 401 + }, + { + "epoch": 0.03184789067142008, + "grad_norm": 4.001443827946862, + "learning_rate": 1.9999825839981506e-05, + "loss": 0.7701, + "step": 402 + }, + { + "epoch": 0.03192711428005546, + "grad_norm": 4.18069583531695, + "learning_rate": 1.9999810366455986e-05, + "loss": 0.6873, + "step": 403 + }, + { + "epoch": 0.03200633788869083, + "grad_norm": 4.472852463492354, + "learning_rate": 1.9999794234491075e-05, + "loss": 0.6958, + "step": 404 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 5.545645546601008, + "learning_rate": 1.9999777444087826e-05, + "loss": 0.7381, + "step": 405 + }, + { + "epoch": 0.032164785105961574, + "grad_norm": 4.477348669868605, + "learning_rate": 1.999975999524735e-05, + "loss": 0.6911, + "step": 406 + }, + { + "epoch": 0.03224400871459695, + "grad_norm": 4.318832171882154, + "learning_rate": 1.9999741887970795e-05, + "loss": 0.6146, + "step": 407 + }, + { + "epoch": 0.03232323232323232, + "grad_norm": 3.4676427320653227, + "learning_rate": 1.999972312225935e-05, + "loss": 0.7041, + "step": 408 + }, + { + "epoch": 0.0324024559318677, + "grad_norm": 4.035350874161928, + "learning_rate": 1.999970369811425e-05, + "loss": 0.7943, + "step": 409 + }, + { + "epoch": 0.03248167954050307, + "grad_norm": 4.3804215167531755, + "learning_rate": 1.9999683615536784e-05, + "loss": 0.5909, + "step": 410 + }, + { + "epoch": 0.03256090314913844, + "grad_norm": 3.607200114286588, + "learning_rate": 1.9999662874528264e-05, + "loss": 0.624, + "step": 411 + }, + { + "epoch": 0.032640126757773814, + "grad_norm": 4.531218352807698, + "learning_rate": 1.999964147509006e-05, + "loss": 0.6523, + "step": 412 + }, + { + "epoch": 0.03271935036640919, + "grad_norm": 4.204126764299906, + "learning_rate": 1.999961941722358e-05, + "loss": 0.6777, + "step": 413 + }, + { + "epoch": 0.03279857397504456, + "grad_norm": 4.034517262908247, + "learning_rate": 1.9999596700930274e-05, + "loss": 0.5886, + "step": 414 + }, + { + "epoch": 0.03287779758367994, + "grad_norm": 4.840497432146151, + "learning_rate": 1.999957332621164e-05, + "loss": 0.7662, + "step": 415 + }, + { + "epoch": 0.03295702119231531, + "grad_norm": 4.423505255377689, + "learning_rate": 1.999954929306922e-05, + "loss": 0.6206, + "step": 416 + }, + { + "epoch": 0.03303624480095068, + "grad_norm": 4.715010799490287, + "learning_rate": 1.999952460150459e-05, + "loss": 0.824, + "step": 417 + }, + { + "epoch": 0.033115468409586055, + "grad_norm": 4.255583388027712, + "learning_rate": 1.9999499251519388e-05, + "loss": 0.7422, + "step": 418 + }, + { + "epoch": 0.03319469201822143, + "grad_norm": 5.197441633535912, + "learning_rate": 1.9999473243115268e-05, + "loss": 0.8425, + "step": 419 + }, + { + "epoch": 0.033273915626856804, + "grad_norm": 3.448163491789807, + "learning_rate": 1.999944657629395e-05, + "loss": 0.6634, + "step": 420 + }, + { + "epoch": 0.03335313923549218, + "grad_norm": 4.0169855304030015, + "learning_rate": 1.999941925105719e-05, + "loss": 0.6716, + "step": 421 + }, + { + "epoch": 0.03343236284412755, + "grad_norm": 3.740364010912455, + "learning_rate": 1.9999391267406786e-05, + "loss": 0.7694, + "step": 422 + }, + { + "epoch": 0.03351158645276292, + "grad_norm": 3.549221462165083, + "learning_rate": 1.9999362625344584e-05, + "loss": 0.6881, + "step": 423 + }, + { + "epoch": 0.033590810061398295, + "grad_norm": 4.830348989262113, + "learning_rate": 1.9999333324872464e-05, + "loss": 0.8412, + "step": 424 + }, + { + "epoch": 0.03367003367003367, + "grad_norm": 3.7762810766178956, + "learning_rate": 1.9999303365992357e-05, + "loss": 0.735, + "step": 425 + }, + { + "epoch": 0.033749257278669044, + "grad_norm": 3.9829759791080654, + "learning_rate": 1.999927274870624e-05, + "loss": 0.5417, + "step": 426 + }, + { + "epoch": 0.03382848088730442, + "grad_norm": 4.075751711811119, + "learning_rate": 1.9999241473016126e-05, + "loss": 0.7003, + "step": 427 + }, + { + "epoch": 0.03390770449593979, + "grad_norm": 4.0480284487019675, + "learning_rate": 1.999920953892407e-05, + "loss": 0.7324, + "step": 428 + }, + { + "epoch": 0.03398692810457516, + "grad_norm": 4.730819648450008, + "learning_rate": 1.9999176946432183e-05, + "loss": 0.6864, + "step": 429 + }, + { + "epoch": 0.034066151713210535, + "grad_norm": 4.297433006698665, + "learning_rate": 1.9999143695542606e-05, + "loss": 0.7106, + "step": 430 + }, + { + "epoch": 0.03414537532184591, + "grad_norm": 4.459895139884524, + "learning_rate": 1.9999109786257528e-05, + "loss": 0.798, + "step": 431 + }, + { + "epoch": 0.034224598930481284, + "grad_norm": 4.155135795233358, + "learning_rate": 1.9999075218579184e-05, + "loss": 0.6899, + "step": 432 + }, + { + "epoch": 0.03430382253911666, + "grad_norm": 3.550121768386916, + "learning_rate": 1.999903999250985e-05, + "loss": 0.5274, + "step": 433 + }, + { + "epoch": 0.03438304614775203, + "grad_norm": 4.178491310694602, + "learning_rate": 1.9999004108051846e-05, + "loss": 0.6853, + "step": 434 + }, + { + "epoch": 0.0344622697563874, + "grad_norm": 3.6249342104934437, + "learning_rate": 1.999896756520753e-05, + "loss": 0.5693, + "step": 435 + }, + { + "epoch": 0.034541493365022775, + "grad_norm": 3.710555922301624, + "learning_rate": 1.9998930363979315e-05, + "loss": 0.6222, + "step": 436 + }, + { + "epoch": 0.03462071697365815, + "grad_norm": 4.282896553340935, + "learning_rate": 1.999889250436965e-05, + "loss": 0.6877, + "step": 437 + }, + { + "epoch": 0.034699940582293524, + "grad_norm": 4.070929739318065, + "learning_rate": 1.9998853986381018e-05, + "loss": 0.6578, + "step": 438 + }, + { + "epoch": 0.0347791641909289, + "grad_norm": 3.5806771607523715, + "learning_rate": 1.9998814810015968e-05, + "loss": 0.6339, + "step": 439 + }, + { + "epoch": 0.034858387799564274, + "grad_norm": 4.275256492797494, + "learning_rate": 1.9998774975277074e-05, + "loss": 0.7398, + "step": 440 + }, + { + "epoch": 0.03493761140819964, + "grad_norm": 4.198337935981686, + "learning_rate": 1.9998734482166954e-05, + "loss": 0.5238, + "step": 441 + }, + { + "epoch": 0.035016835016835016, + "grad_norm": 4.222553552201761, + "learning_rate": 1.9998693330688283e-05, + "loss": 0.7514, + "step": 442 + }, + { + "epoch": 0.03509605862547039, + "grad_norm": 4.03782342518217, + "learning_rate": 1.9998651520843766e-05, + "loss": 0.7049, + "step": 443 + }, + { + "epoch": 0.035175282234105765, + "grad_norm": 3.4146854878819033, + "learning_rate": 1.999860905263616e-05, + "loss": 0.6022, + "step": 444 + }, + { + "epoch": 0.03525450584274114, + "grad_norm": 4.528935499317078, + "learning_rate": 1.9998565926068253e-05, + "loss": 0.7426, + "step": 445 + }, + { + "epoch": 0.03533372945137651, + "grad_norm": 3.7491079949721673, + "learning_rate": 1.999852214114289e-05, + "loss": 0.5667, + "step": 446 + }, + { + "epoch": 0.03541295306001188, + "grad_norm": 3.6964639629032487, + "learning_rate": 1.9998477697862956e-05, + "loss": 0.6659, + "step": 447 + }, + { + "epoch": 0.035492176668647256, + "grad_norm": 4.156133240212877, + "learning_rate": 1.9998432596231373e-05, + "loss": 0.6773, + "step": 448 + }, + { + "epoch": 0.03557140027728263, + "grad_norm": 4.293205440872425, + "learning_rate": 1.9998386836251116e-05, + "loss": 0.6266, + "step": 449 + }, + { + "epoch": 0.035650623885918005, + "grad_norm": 4.0772084133636985, + "learning_rate": 1.9998340417925193e-05, + "loss": 0.6588, + "step": 450 + }, + { + "epoch": 0.03572984749455338, + "grad_norm": 4.329855059640332, + "learning_rate": 1.9998293341256664e-05, + "loss": 0.8068, + "step": 451 + }, + { + "epoch": 0.03580907110318875, + "grad_norm": 4.053701004155578, + "learning_rate": 1.9998245606248627e-05, + "loss": 0.6841, + "step": 452 + }, + { + "epoch": 0.03588829471182412, + "grad_norm": 4.202351721500618, + "learning_rate": 1.999819721290422e-05, + "loss": 0.6414, + "step": 453 + }, + { + "epoch": 0.035967518320459496, + "grad_norm": 3.676834437809905, + "learning_rate": 1.9998148161226645e-05, + "loss": 0.5713, + "step": 454 + }, + { + "epoch": 0.03604674192909487, + "grad_norm": 3.694595043871052, + "learning_rate": 1.9998098451219115e-05, + "loss": 0.6078, + "step": 455 + }, + { + "epoch": 0.036125965537730245, + "grad_norm": 3.9757704962804667, + "learning_rate": 1.999804808288491e-05, + "loss": 0.7277, + "step": 456 + }, + { + "epoch": 0.03620518914636562, + "grad_norm": 4.633541706729874, + "learning_rate": 1.9997997056227347e-05, + "loss": 0.8114, + "step": 457 + }, + { + "epoch": 0.03628441275500099, + "grad_norm": 3.93767707947965, + "learning_rate": 1.9997945371249784e-05, + "loss": 0.6533, + "step": 458 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 3.724852327104761, + "learning_rate": 1.999789302795563e-05, + "loss": 0.6025, + "step": 459 + }, + { + "epoch": 0.036442859972271736, + "grad_norm": 4.195074369301747, + "learning_rate": 1.999784002634832e-05, + "loss": 0.7006, + "step": 460 + }, + { + "epoch": 0.03652208358090711, + "grad_norm": 3.978391724157097, + "learning_rate": 1.9997786366431354e-05, + "loss": 0.7393, + "step": 461 + }, + { + "epoch": 0.036601307189542485, + "grad_norm": 3.6045949315417776, + "learning_rate": 1.9997732048208264e-05, + "loss": 0.6342, + "step": 462 + }, + { + "epoch": 0.03668053079817786, + "grad_norm": 4.037839325371844, + "learning_rate": 1.9997677071682623e-05, + "loss": 0.6058, + "step": 463 + }, + { + "epoch": 0.03675975440681323, + "grad_norm": 4.278379001148626, + "learning_rate": 1.9997621436858053e-05, + "loss": 0.6623, + "step": 464 + }, + { + "epoch": 0.0368389780154486, + "grad_norm": 3.4950522159289217, + "learning_rate": 1.9997565143738216e-05, + "loss": 0.5441, + "step": 465 + }, + { + "epoch": 0.03691820162408398, + "grad_norm": 3.8381685733649866, + "learning_rate": 1.999750819232682e-05, + "loss": 0.6601, + "step": 466 + }, + { + "epoch": 0.03699742523271935, + "grad_norm": 3.620568988823789, + "learning_rate": 1.9997450582627614e-05, + "loss": 0.6263, + "step": 467 + }, + { + "epoch": 0.037076648841354726, + "grad_norm": 3.6614380431722573, + "learning_rate": 1.9997392314644392e-05, + "loss": 0.4856, + "step": 468 + }, + { + "epoch": 0.0371558724499901, + "grad_norm": 4.473212402515391, + "learning_rate": 1.999733338838099e-05, + "loss": 0.5263, + "step": 469 + }, + { + "epoch": 0.03723509605862547, + "grad_norm": 4.225398591977323, + "learning_rate": 1.999727380384129e-05, + "loss": 0.676, + "step": 470 + }, + { + "epoch": 0.03731431966726084, + "grad_norm": 3.8574841104712903, + "learning_rate": 1.999721356102921e-05, + "loss": 0.5476, + "step": 471 + }, + { + "epoch": 0.03739354327589622, + "grad_norm": 3.9291359486112425, + "learning_rate": 1.9997152659948727e-05, + "loss": 0.6404, + "step": 472 + }, + { + "epoch": 0.03747276688453159, + "grad_norm": 4.012932102434357, + "learning_rate": 1.9997091100603842e-05, + "loss": 0.6552, + "step": 473 + }, + { + "epoch": 0.037551990493166966, + "grad_norm": 3.9281726486511763, + "learning_rate": 1.999702888299861e-05, + "loss": 0.7265, + "step": 474 + }, + { + "epoch": 0.03763121410180234, + "grad_norm": 3.7828949019621088, + "learning_rate": 1.9996966007137125e-05, + "loss": 0.7433, + "step": 475 + }, + { + "epoch": 0.03771043771043771, + "grad_norm": 3.1150374498198112, + "learning_rate": 1.9996902473023537e-05, + "loss": 0.6051, + "step": 476 + }, + { + "epoch": 0.03778966131907308, + "grad_norm": 3.677285993882122, + "learning_rate": 1.999683828066202e-05, + "loss": 0.6053, + "step": 477 + }, + { + "epoch": 0.03786888492770846, + "grad_norm": 4.193161098209371, + "learning_rate": 1.9996773430056806e-05, + "loss": 0.7045, + "step": 478 + }, + { + "epoch": 0.03794810853634383, + "grad_norm": 3.559327316006344, + "learning_rate": 1.999670792121216e-05, + "loss": 0.5634, + "step": 479 + }, + { + "epoch": 0.038027332144979206, + "grad_norm": 3.4381744224022994, + "learning_rate": 1.99966417541324e-05, + "loss": 0.5958, + "step": 480 + }, + { + "epoch": 0.038106555753614574, + "grad_norm": 3.859694920178025, + "learning_rate": 1.9996574928821883e-05, + "loss": 0.5779, + "step": 481 + }, + { + "epoch": 0.03818577936224995, + "grad_norm": 3.7604500656206317, + "learning_rate": 1.9996507445285003e-05, + "loss": 0.6985, + "step": 482 + }, + { + "epoch": 0.03826500297088532, + "grad_norm": 3.663635354080971, + "learning_rate": 1.999643930352621e-05, + "loss": 0.5569, + "step": 483 + }, + { + "epoch": 0.0383442265795207, + "grad_norm": 3.7080033322823667, + "learning_rate": 1.999637050354999e-05, + "loss": 0.6284, + "step": 484 + }, + { + "epoch": 0.03842345018815607, + "grad_norm": 4.031681695337317, + "learning_rate": 1.9996301045360874e-05, + "loss": 0.5587, + "step": 485 + }, + { + "epoch": 0.038502673796791446, + "grad_norm": 3.5617223215792735, + "learning_rate": 1.999623092896343e-05, + "loss": 0.5596, + "step": 486 + }, + { + "epoch": 0.038581897405426814, + "grad_norm": 4.643660776617633, + "learning_rate": 1.9996160154362275e-05, + "loss": 0.7314, + "step": 487 + }, + { + "epoch": 0.03866112101406219, + "grad_norm": 3.6784759763962955, + "learning_rate": 1.9996088721562076e-05, + "loss": 0.6118, + "step": 488 + }, + { + "epoch": 0.03874034462269756, + "grad_norm": 3.9519303685126803, + "learning_rate": 1.9996016630567535e-05, + "loss": 0.5629, + "step": 489 + }, + { + "epoch": 0.03881956823133294, + "grad_norm": 3.517417534187261, + "learning_rate": 1.9995943881383393e-05, + "loss": 0.5969, + "step": 490 + }, + { + "epoch": 0.03889879183996831, + "grad_norm": 3.5399321332043723, + "learning_rate": 1.9995870474014444e-05, + "loss": 0.6577, + "step": 491 + }, + { + "epoch": 0.03897801544860369, + "grad_norm": 3.3975854379183206, + "learning_rate": 1.9995796408465523e-05, + "loss": 0.5913, + "step": 492 + }, + { + "epoch": 0.039057239057239054, + "grad_norm": 4.795982077421927, + "learning_rate": 1.9995721684741505e-05, + "loss": 0.8022, + "step": 493 + }, + { + "epoch": 0.03913646266587443, + "grad_norm": 3.756175720690799, + "learning_rate": 1.9995646302847307e-05, + "loss": 0.7024, + "step": 494 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 3.5933857493175796, + "learning_rate": 1.9995570262787903e-05, + "loss": 0.5881, + "step": 495 + }, + { + "epoch": 0.03929490988314518, + "grad_norm": 3.669507362553083, + "learning_rate": 1.9995493564568286e-05, + "loss": 0.6628, + "step": 496 + }, + { + "epoch": 0.03937413349178055, + "grad_norm": 4.407602280926834, + "learning_rate": 1.9995416208193518e-05, + "loss": 0.6663, + "step": 497 + }, + { + "epoch": 0.03945335710041593, + "grad_norm": 3.525706353779159, + "learning_rate": 1.999533819366868e-05, + "loss": 0.6049, + "step": 498 + }, + { + "epoch": 0.039532580709051295, + "grad_norm": 3.6774942001914295, + "learning_rate": 1.9995259520998927e-05, + "loss": 0.6637, + "step": 499 + }, + { + "epoch": 0.03961180431768667, + "grad_norm": 3.3815464260140033, + "learning_rate": 1.9995180190189424e-05, + "loss": 0.5997, + "step": 500 + }, + { + "epoch": 0.039691027926322044, + "grad_norm": 3.6349553469130966, + "learning_rate": 1.9995100201245397e-05, + "loss": 0.5628, + "step": 501 + }, + { + "epoch": 0.03977025153495742, + "grad_norm": 4.008880962208993, + "learning_rate": 1.999501955417212e-05, + "loss": 0.5247, + "step": 502 + }, + { + "epoch": 0.03984947514359279, + "grad_norm": 4.113040679313265, + "learning_rate": 1.999493824897489e-05, + "loss": 0.6521, + "step": 503 + }, + { + "epoch": 0.03992869875222817, + "grad_norm": 3.360324773968465, + "learning_rate": 1.9994856285659073e-05, + "loss": 0.6559, + "step": 504 + }, + { + "epoch": 0.040007922360863535, + "grad_norm": 3.897122630625106, + "learning_rate": 1.9994773664230064e-05, + "loss": 0.6236, + "step": 505 + }, + { + "epoch": 0.04008714596949891, + "grad_norm": 4.082730649124917, + "learning_rate": 1.99946903846933e-05, + "loss": 0.6917, + "step": 506 + }, + { + "epoch": 0.040166369578134284, + "grad_norm": 4.375710444717418, + "learning_rate": 1.9994606447054265e-05, + "loss": 0.6463, + "step": 507 + }, + { + "epoch": 0.04024559318676966, + "grad_norm": 3.9407990653496214, + "learning_rate": 1.999452185131849e-05, + "loss": 0.7824, + "step": 508 + }, + { + "epoch": 0.04032481679540503, + "grad_norm": 3.874965037146742, + "learning_rate": 1.9994436597491537e-05, + "loss": 0.7309, + "step": 509 + }, + { + "epoch": 0.04040404040404041, + "grad_norm": 4.22520270593105, + "learning_rate": 1.9994350685579024e-05, + "loss": 0.6914, + "step": 510 + }, + { + "epoch": 0.040483264012675775, + "grad_norm": 3.5866394165801587, + "learning_rate": 1.999426411558661e-05, + "loss": 0.6996, + "step": 511 + }, + { + "epoch": 0.04056248762131115, + "grad_norm": 3.8611810503765573, + "learning_rate": 1.9994176887519994e-05, + "loss": 0.6387, + "step": 512 + }, + { + "epoch": 0.040641711229946524, + "grad_norm": 3.5121356727512403, + "learning_rate": 1.9994089001384918e-05, + "loss": 0.6522, + "step": 513 + }, + { + "epoch": 0.0407209348385819, + "grad_norm": 3.4597192033887527, + "learning_rate": 1.9994000457187167e-05, + "loss": 0.593, + "step": 514 + }, + { + "epoch": 0.04080015844721727, + "grad_norm": 3.474541699099251, + "learning_rate": 1.999391125493258e-05, + "loss": 0.6385, + "step": 515 + }, + { + "epoch": 0.04087938205585264, + "grad_norm": 3.2548781364117816, + "learning_rate": 1.9993821394627018e-05, + "loss": 0.6725, + "step": 516 + }, + { + "epoch": 0.040958605664488015, + "grad_norm": 3.9666359076360136, + "learning_rate": 1.9993730876276407e-05, + "loss": 0.6341, + "step": 517 + }, + { + "epoch": 0.04103782927312339, + "grad_norm": 4.298069093873997, + "learning_rate": 1.9993639699886707e-05, + "loss": 0.8401, + "step": 518 + }, + { + "epoch": 0.041117052881758764, + "grad_norm": 3.549914286472616, + "learning_rate": 1.9993547865463916e-05, + "loss": 0.6659, + "step": 519 + }, + { + "epoch": 0.04119627649039414, + "grad_norm": 4.009238618845933, + "learning_rate": 1.9993455373014087e-05, + "loss": 0.6957, + "step": 520 + }, + { + "epoch": 0.04127550009902951, + "grad_norm": 3.2311627282206965, + "learning_rate": 1.99933622225433e-05, + "loss": 0.5976, + "step": 521 + }, + { + "epoch": 0.04135472370766488, + "grad_norm": 3.1454133738763277, + "learning_rate": 1.9993268414057704e-05, + "loss": 0.5969, + "step": 522 + }, + { + "epoch": 0.041433947316300256, + "grad_norm": 3.785407899378014, + "learning_rate": 1.9993173947563466e-05, + "loss": 0.6444, + "step": 523 + }, + { + "epoch": 0.04151317092493563, + "grad_norm": 4.960922683132975, + "learning_rate": 1.9993078823066804e-05, + "loss": 0.7575, + "step": 524 + }, + { + "epoch": 0.041592394533571005, + "grad_norm": 3.2686721469199655, + "learning_rate": 1.9992983040573986e-05, + "loss": 0.653, + "step": 525 + }, + { + "epoch": 0.04167161814220638, + "grad_norm": 3.6688999626644154, + "learning_rate": 1.9992886600091318e-05, + "loss": 0.619, + "step": 526 + }, + { + "epoch": 0.041750841750841754, + "grad_norm": 3.559109787862381, + "learning_rate": 1.9992789501625155e-05, + "loss": 0.6763, + "step": 527 + }, + { + "epoch": 0.04183006535947712, + "grad_norm": 3.986658913468185, + "learning_rate": 1.9992691745181882e-05, + "loss": 0.7319, + "step": 528 + }, + { + "epoch": 0.041909288968112496, + "grad_norm": 3.641431065115546, + "learning_rate": 1.9992593330767938e-05, + "loss": 0.6168, + "step": 529 + }, + { + "epoch": 0.04198851257674787, + "grad_norm": 3.4363018932511062, + "learning_rate": 1.9992494258389805e-05, + "loss": 0.583, + "step": 530 + }, + { + "epoch": 0.042067736185383245, + "grad_norm": 3.8248432426142203, + "learning_rate": 1.9992394528054006e-05, + "loss": 0.6341, + "step": 531 + }, + { + "epoch": 0.04214695979401862, + "grad_norm": 3.8247778936282466, + "learning_rate": 1.9992294139767106e-05, + "loss": 0.5237, + "step": 532 + }, + { + "epoch": 0.042226183402653994, + "grad_norm": 3.4828614340990223, + "learning_rate": 1.999219309353572e-05, + "loss": 0.5949, + "step": 533 + }, + { + "epoch": 0.04230540701128936, + "grad_norm": 3.733805629820094, + "learning_rate": 1.9992091389366497e-05, + "loss": 0.6166, + "step": 534 + }, + { + "epoch": 0.042384630619924736, + "grad_norm": 3.623047965210853, + "learning_rate": 1.9991989027266134e-05, + "loss": 0.5641, + "step": 535 + }, + { + "epoch": 0.04246385422856011, + "grad_norm": 3.7851844307402116, + "learning_rate": 1.9991886007241375e-05, + "loss": 0.6686, + "step": 536 + }, + { + "epoch": 0.042543077837195485, + "grad_norm": 3.5008686577460226, + "learning_rate": 1.9991782329298998e-05, + "loss": 0.6061, + "step": 537 + }, + { + "epoch": 0.04262230144583086, + "grad_norm": 3.697592505120355, + "learning_rate": 1.9991677993445832e-05, + "loss": 0.7002, + "step": 538 + }, + { + "epoch": 0.042701525054466234, + "grad_norm": 3.819382943544156, + "learning_rate": 1.9991572999688746e-05, + "loss": 0.6522, + "step": 539 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 3.3522736691510864, + "learning_rate": 1.9991467348034653e-05, + "loss": 0.6039, + "step": 540 + }, + { + "epoch": 0.042859972271736976, + "grad_norm": 3.618967536556049, + "learning_rate": 1.9991361038490515e-05, + "loss": 0.644, + "step": 541 + }, + { + "epoch": 0.04293919588037235, + "grad_norm": 3.37957259389003, + "learning_rate": 1.9991254071063327e-05, + "loss": 0.5794, + "step": 542 + }, + { + "epoch": 0.043018419489007725, + "grad_norm": 3.73008674961396, + "learning_rate": 1.9991146445760133e-05, + "loss": 0.678, + "step": 543 + }, + { + "epoch": 0.0430976430976431, + "grad_norm": 3.150033832160963, + "learning_rate": 1.9991038162588018e-05, + "loss": 0.6449, + "step": 544 + }, + { + "epoch": 0.043176866706278474, + "grad_norm": 3.93201705920017, + "learning_rate": 1.9990929221554117e-05, + "loss": 0.551, + "step": 545 + }, + { + "epoch": 0.04325609031491384, + "grad_norm": 3.1690786607972963, + "learning_rate": 1.99908196226656e-05, + "loss": 0.6372, + "step": 546 + }, + { + "epoch": 0.04333531392354922, + "grad_norm": 4.31888228933249, + "learning_rate": 1.9990709365929678e-05, + "loss": 0.5972, + "step": 547 + }, + { + "epoch": 0.04341453753218459, + "grad_norm": 3.323048595645411, + "learning_rate": 1.999059845135362e-05, + "loss": 0.6934, + "step": 548 + }, + { + "epoch": 0.043493761140819966, + "grad_norm": 3.5673902357162652, + "learning_rate": 1.9990486878944727e-05, + "loss": 0.7046, + "step": 549 + }, + { + "epoch": 0.04357298474945534, + "grad_norm": 3.6871286430597126, + "learning_rate": 1.9990374648710343e-05, + "loss": 0.6601, + "step": 550 + }, + { + "epoch": 0.04365220835809071, + "grad_norm": 2.853772466079235, + "learning_rate": 1.9990261760657858e-05, + "loss": 0.5641, + "step": 551 + }, + { + "epoch": 0.04373143196672608, + "grad_norm": 4.072027412899879, + "learning_rate": 1.9990148214794713e-05, + "loss": 0.5734, + "step": 552 + }, + { + "epoch": 0.04381065557536146, + "grad_norm": 3.2089212935944107, + "learning_rate": 1.999003401112837e-05, + "loss": 0.5388, + "step": 553 + }, + { + "epoch": 0.04388987918399683, + "grad_norm": 4.094850113621171, + "learning_rate": 1.9989919149666356e-05, + "loss": 0.769, + "step": 554 + }, + { + "epoch": 0.043969102792632206, + "grad_norm": 3.461478011178914, + "learning_rate": 1.998980363041624e-05, + "loss": 0.6186, + "step": 555 + }, + { + "epoch": 0.04404832640126758, + "grad_norm": 4.300151340714484, + "learning_rate": 1.9989687453385617e-05, + "loss": 0.6646, + "step": 556 + }, + { + "epoch": 0.04412755000990295, + "grad_norm": 3.4734137803963687, + "learning_rate": 1.9989570618582145e-05, + "loss": 0.5299, + "step": 557 + }, + { + "epoch": 0.04420677361853832, + "grad_norm": 4.066686199052264, + "learning_rate": 1.9989453126013515e-05, + "loss": 0.6054, + "step": 558 + }, + { + "epoch": 0.0442859972271737, + "grad_norm": 3.9748322815357597, + "learning_rate": 1.9989334975687462e-05, + "loss": 0.615, + "step": 559 + }, + { + "epoch": 0.04436522083580907, + "grad_norm": 4.005368167611016, + "learning_rate": 1.9989216167611766e-05, + "loss": 0.61, + "step": 560 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.881254275697099, + "learning_rate": 1.998909670179425e-05, + "loss": 0.7321, + "step": 561 + }, + { + "epoch": 0.04452366805307982, + "grad_norm": 3.2785677512469675, + "learning_rate": 1.9988976578242785e-05, + "loss": 0.5244, + "step": 562 + }, + { + "epoch": 0.04460289166171519, + "grad_norm": 3.422106865180319, + "learning_rate": 1.9988855796965275e-05, + "loss": 0.5331, + "step": 563 + }, + { + "epoch": 0.04468211527035056, + "grad_norm": 3.530076162896214, + "learning_rate": 1.998873435796967e-05, + "loss": 0.6064, + "step": 564 + }, + { + "epoch": 0.04476133887898594, + "grad_norm": 3.4016962967487503, + "learning_rate": 1.9988612261263972e-05, + "loss": 0.6349, + "step": 565 + }, + { + "epoch": 0.04484056248762131, + "grad_norm": 3.3263454727489443, + "learning_rate": 1.9988489506856218e-05, + "loss": 0.6439, + "step": 566 + }, + { + "epoch": 0.044919786096256686, + "grad_norm": 3.2261381597517627, + "learning_rate": 1.9988366094754493e-05, + "loss": 0.4953, + "step": 567 + }, + { + "epoch": 0.04499900970489206, + "grad_norm": 3.4719934634535368, + "learning_rate": 1.9988242024966924e-05, + "loss": 0.6628, + "step": 568 + }, + { + "epoch": 0.04507823331352743, + "grad_norm": 3.8283800160440635, + "learning_rate": 1.9988117297501674e-05, + "loss": 0.4674, + "step": 569 + }, + { + "epoch": 0.0451574569221628, + "grad_norm": 3.891449460148191, + "learning_rate": 1.998799191236696e-05, + "loss": 0.5612, + "step": 570 + }, + { + "epoch": 0.04523668053079818, + "grad_norm": 3.8415122292573387, + "learning_rate": 1.998786586957104e-05, + "loss": 0.5787, + "step": 571 + }, + { + "epoch": 0.04531590413943355, + "grad_norm": 3.279413849083167, + "learning_rate": 1.998773916912221e-05, + "loss": 0.4776, + "step": 572 + }, + { + "epoch": 0.04539512774806893, + "grad_norm": 3.807566952571174, + "learning_rate": 1.9987611811028814e-05, + "loss": 0.7145, + "step": 573 + }, + { + "epoch": 0.0454743513567043, + "grad_norm": 3.669444622209576, + "learning_rate": 1.9987483795299236e-05, + "loss": 0.6527, + "step": 574 + }, + { + "epoch": 0.04555357496533967, + "grad_norm": 3.459414739368479, + "learning_rate": 1.9987355121941907e-05, + "loss": 0.5198, + "step": 575 + }, + { + "epoch": 0.04563279857397504, + "grad_norm": 3.577146066788758, + "learning_rate": 1.99872257909653e-05, + "loss": 0.5355, + "step": 576 + }, + { + "epoch": 0.04571202218261042, + "grad_norm": 3.814856415338765, + "learning_rate": 1.9987095802377933e-05, + "loss": 0.6196, + "step": 577 + }, + { + "epoch": 0.04579124579124579, + "grad_norm": 4.620429956328929, + "learning_rate": 1.9986965156188357e-05, + "loss": 0.7046, + "step": 578 + }, + { + "epoch": 0.04587046939988117, + "grad_norm": 4.285619576434249, + "learning_rate": 1.9986833852405183e-05, + "loss": 0.5899, + "step": 579 + }, + { + "epoch": 0.045949693008516534, + "grad_norm": 3.595812363622929, + "learning_rate": 1.9986701891037053e-05, + "loss": 0.5648, + "step": 580 + }, + { + "epoch": 0.04602891661715191, + "grad_norm": 4.715717315399082, + "learning_rate": 1.9986569272092656e-05, + "loss": 0.7187, + "step": 581 + }, + { + "epoch": 0.046108140225787284, + "grad_norm": 3.9011693035201938, + "learning_rate": 1.9986435995580725e-05, + "loss": 0.7205, + "step": 582 + }, + { + "epoch": 0.04618736383442266, + "grad_norm": 3.598522283966038, + "learning_rate": 1.9986302061510036e-05, + "loss": 0.6788, + "step": 583 + }, + { + "epoch": 0.04626658744305803, + "grad_norm": 3.8073028481044813, + "learning_rate": 1.9986167469889405e-05, + "loss": 0.5908, + "step": 584 + }, + { + "epoch": 0.04634581105169341, + "grad_norm": 2.891955514307686, + "learning_rate": 1.9986032220727698e-05, + "loss": 0.5999, + "step": 585 + }, + { + "epoch": 0.046425034660328775, + "grad_norm": 3.876749628271864, + "learning_rate": 1.9985896314033816e-05, + "loss": 0.6567, + "step": 586 + }, + { + "epoch": 0.04650425826896415, + "grad_norm": 3.6940316887176894, + "learning_rate": 1.9985759749816715e-05, + "loss": 0.6342, + "step": 587 + }, + { + "epoch": 0.046583481877599524, + "grad_norm": 3.785463738453173, + "learning_rate": 1.9985622528085382e-05, + "loss": 0.7148, + "step": 588 + }, + { + "epoch": 0.0466627054862349, + "grad_norm": 3.21042838436158, + "learning_rate": 1.9985484648848854e-05, + "loss": 0.6131, + "step": 589 + }, + { + "epoch": 0.04674192909487027, + "grad_norm": 4.05802326939007, + "learning_rate": 1.9985346112116207e-05, + "loss": 0.5711, + "step": 590 + }, + { + "epoch": 0.04682115270350565, + "grad_norm": 3.523847614097594, + "learning_rate": 1.9985206917896563e-05, + "loss": 0.6842, + "step": 591 + }, + { + "epoch": 0.046900376312141015, + "grad_norm": 3.6119114711140603, + "learning_rate": 1.9985067066199093e-05, + "loss": 0.6983, + "step": 592 + }, + { + "epoch": 0.04697959992077639, + "grad_norm": 3.315918989225194, + "learning_rate": 1.9984926557033003e-05, + "loss": 0.5243, + "step": 593 + }, + { + "epoch": 0.047058823529411764, + "grad_norm": 3.470617520958609, + "learning_rate": 1.998478539040754e-05, + "loss": 0.57, + "step": 594 + }, + { + "epoch": 0.04713804713804714, + "grad_norm": 3.608138347682558, + "learning_rate": 1.9984643566332005e-05, + "loss": 0.6612, + "step": 595 + }, + { + "epoch": 0.04721727074668251, + "grad_norm": 3.390556418320335, + "learning_rate": 1.9984501084815734e-05, + "loss": 0.5658, + "step": 596 + }, + { + "epoch": 0.04729649435531789, + "grad_norm": 3.5497709627847502, + "learning_rate": 1.9984357945868106e-05, + "loss": 0.5289, + "step": 597 + }, + { + "epoch": 0.047375717963953255, + "grad_norm": 3.594448280790585, + "learning_rate": 1.998421414949855e-05, + "loss": 0.6217, + "step": 598 + }, + { + "epoch": 0.04745494157258863, + "grad_norm": 4.123128545858547, + "learning_rate": 1.9984069695716534e-05, + "loss": 0.6952, + "step": 599 + }, + { + "epoch": 0.047534165181224004, + "grad_norm": 3.079482379339533, + "learning_rate": 1.998392458453157e-05, + "loss": 0.6006, + "step": 600 + }, + { + "epoch": 0.04761338878985938, + "grad_norm": 3.5932147243803314, + "learning_rate": 1.998377881595321e-05, + "loss": 0.7653, + "step": 601 + }, + { + "epoch": 0.04769261239849475, + "grad_norm": 3.925361417911679, + "learning_rate": 1.9983632389991056e-05, + "loss": 0.6688, + "step": 602 + }, + { + "epoch": 0.04777183600713013, + "grad_norm": 4.385638428834529, + "learning_rate": 1.9983485306654745e-05, + "loss": 0.642, + "step": 603 + }, + { + "epoch": 0.047851059615765495, + "grad_norm": 3.0550077881772717, + "learning_rate": 1.9983337565953968e-05, + "loss": 0.6014, + "step": 604 + }, + { + "epoch": 0.04793028322440087, + "grad_norm": 3.163657261536773, + "learning_rate": 1.9983189167898446e-05, + "loss": 0.4853, + "step": 605 + }, + { + "epoch": 0.048009506833036245, + "grad_norm": 3.6297702170718176, + "learning_rate": 1.998304011249795e-05, + "loss": 0.5661, + "step": 606 + }, + { + "epoch": 0.04808873044167162, + "grad_norm": 3.5235373603016606, + "learning_rate": 1.9982890399762303e-05, + "loss": 0.5475, + "step": 607 + }, + { + "epoch": 0.048167954050306994, + "grad_norm": 3.843292708555788, + "learning_rate": 1.9982740029701356e-05, + "loss": 0.6951, + "step": 608 + }, + { + "epoch": 0.04824717765894237, + "grad_norm": 4.3231238472977545, + "learning_rate": 1.998258900232501e-05, + "loss": 0.528, + "step": 609 + }, + { + "epoch": 0.048326401267577736, + "grad_norm": 4.391070317986208, + "learning_rate": 1.9982437317643218e-05, + "loss": 0.6609, + "step": 610 + }, + { + "epoch": 0.04840562487621311, + "grad_norm": 3.587332401529109, + "learning_rate": 1.9982284975665952e-05, + "loss": 0.485, + "step": 611 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 3.6881210400497233, + "learning_rate": 1.998213197640326e-05, + "loss": 0.658, + "step": 612 + }, + { + "epoch": 0.04856407209348386, + "grad_norm": 3.476604257152046, + "learning_rate": 1.9981978319865204e-05, + "loss": 0.4793, + "step": 613 + }, + { + "epoch": 0.048643295702119234, + "grad_norm": 4.127252418881677, + "learning_rate": 1.9981824006061904e-05, + "loss": 0.5326, + "step": 614 + }, + { + "epoch": 0.0487225193107546, + "grad_norm": 3.4441649802115095, + "learning_rate": 1.998166903500353e-05, + "loss": 0.5361, + "step": 615 + }, + { + "epoch": 0.048801742919389976, + "grad_norm": 3.3247716254900244, + "learning_rate": 1.998151340670027e-05, + "loss": 0.5887, + "step": 616 + }, + { + "epoch": 0.04888096652802535, + "grad_norm": 3.294291573099863, + "learning_rate": 1.9981357121162385e-05, + "loss": 0.5829, + "step": 617 + }, + { + "epoch": 0.048960190136660725, + "grad_norm": 3.6509061621178254, + "learning_rate": 1.998120017840016e-05, + "loss": 0.5756, + "step": 618 + }, + { + "epoch": 0.0490394137452961, + "grad_norm": 3.799488526771851, + "learning_rate": 1.998104257842393e-05, + "loss": 0.5147, + "step": 619 + }, + { + "epoch": 0.049118637353931474, + "grad_norm": 3.637657796620207, + "learning_rate": 1.9980884321244072e-05, + "loss": 0.6066, + "step": 620 + }, + { + "epoch": 0.04919786096256684, + "grad_norm": 3.5414243921427575, + "learning_rate": 1.9980725406871007e-05, + "loss": 0.6376, + "step": 621 + }, + { + "epoch": 0.049277084571202216, + "grad_norm": 5.147506741814145, + "learning_rate": 1.9980565835315196e-05, + "loss": 0.7115, + "step": 622 + }, + { + "epoch": 0.04935630817983759, + "grad_norm": 3.7449299160511225, + "learning_rate": 1.9980405606587148e-05, + "loss": 0.531, + "step": 623 + }, + { + "epoch": 0.049435531788472965, + "grad_norm": 3.1465453382891417, + "learning_rate": 1.9980244720697417e-05, + "loss": 0.5825, + "step": 624 + }, + { + "epoch": 0.04951475539710834, + "grad_norm": 3.7358004188853884, + "learning_rate": 1.9980083177656588e-05, + "loss": 0.5773, + "step": 625 + }, + { + "epoch": 0.049593979005743714, + "grad_norm": 4.8536296010364754, + "learning_rate": 1.9979920977475306e-05, + "loss": 0.6305, + "step": 626 + }, + { + "epoch": 0.04967320261437908, + "grad_norm": 2.9820039525426374, + "learning_rate": 1.9979758120164248e-05, + "loss": 0.5118, + "step": 627 + }, + { + "epoch": 0.049752426223014456, + "grad_norm": 3.56312148323265, + "learning_rate": 1.997959460573414e-05, + "loss": 0.6544, + "step": 628 + }, + { + "epoch": 0.04983164983164983, + "grad_norm": 2.8358001569516316, + "learning_rate": 1.9979430434195742e-05, + "loss": 0.5189, + "step": 629 + }, + { + "epoch": 0.049910873440285206, + "grad_norm": 3.9870526074281405, + "learning_rate": 1.9979265605559868e-05, + "loss": 0.5472, + "step": 630 + }, + { + "epoch": 0.04999009704892058, + "grad_norm": 3.645653150403932, + "learning_rate": 1.997910011983737e-05, + "loss": 0.5923, + "step": 631 + }, + { + "epoch": 0.050069320657555955, + "grad_norm": 3.6778139708866933, + "learning_rate": 1.997893397703915e-05, + "loss": 0.7483, + "step": 632 + }, + { + "epoch": 0.05014854426619132, + "grad_norm": 3.3697830000040154, + "learning_rate": 1.997876717717614e-05, + "loss": 0.6071, + "step": 633 + }, + { + "epoch": 0.0502277678748267, + "grad_norm": 3.9377510963745896, + "learning_rate": 1.9978599720259325e-05, + "loss": 0.6231, + "step": 634 + }, + { + "epoch": 0.05030699148346207, + "grad_norm": 3.049836859632232, + "learning_rate": 1.9978431606299736e-05, + "loss": 0.6054, + "step": 635 + }, + { + "epoch": 0.050386215092097446, + "grad_norm": 3.4704182926753466, + "learning_rate": 1.9978262835308437e-05, + "loss": 0.5647, + "step": 636 + }, + { + "epoch": 0.05046543870073282, + "grad_norm": 3.863271887491197, + "learning_rate": 1.997809340729654e-05, + "loss": 0.6242, + "step": 637 + }, + { + "epoch": 0.050544662309368195, + "grad_norm": 3.9731140115433927, + "learning_rate": 1.9977923322275206e-05, + "loss": 0.7247, + "step": 638 + }, + { + "epoch": 0.05062388591800356, + "grad_norm": 4.064037520685318, + "learning_rate": 1.997775258025563e-05, + "loss": 0.6728, + "step": 639 + }, + { + "epoch": 0.05070310952663894, + "grad_norm": 3.7457814493593484, + "learning_rate": 1.997758118124906e-05, + "loss": 0.6004, + "step": 640 + }, + { + "epoch": 0.05078233313527431, + "grad_norm": 3.629777712494507, + "learning_rate": 1.997740912526678e-05, + "loss": 0.552, + "step": 641 + }, + { + "epoch": 0.050861556743909686, + "grad_norm": 4.825013274018131, + "learning_rate": 1.9977236412320112e-05, + "loss": 0.6208, + "step": 642 + }, + { + "epoch": 0.05094078035254506, + "grad_norm": 3.4541197056587016, + "learning_rate": 1.9977063042420438e-05, + "loss": 0.6652, + "step": 643 + }, + { + "epoch": 0.051020003961180435, + "grad_norm": 3.3331748995754293, + "learning_rate": 1.9976889015579167e-05, + "loss": 0.7036, + "step": 644 + }, + { + "epoch": 0.0510992275698158, + "grad_norm": 3.5838952992165303, + "learning_rate": 1.997671433180776e-05, + "loss": 0.5595, + "step": 645 + }, + { + "epoch": 0.05117845117845118, + "grad_norm": 3.1556606457071186, + "learning_rate": 1.997653899111772e-05, + "loss": 0.4995, + "step": 646 + }, + { + "epoch": 0.05125767478708655, + "grad_norm": 3.760593567611662, + "learning_rate": 1.9976362993520587e-05, + "loss": 0.5686, + "step": 647 + }, + { + "epoch": 0.051336898395721926, + "grad_norm": 5.766104592551594, + "learning_rate": 1.9976186339027958e-05, + "loss": 0.6967, + "step": 648 + }, + { + "epoch": 0.0514161220043573, + "grad_norm": 4.661421527766419, + "learning_rate": 1.9976009027651463e-05, + "loss": 0.6744, + "step": 649 + }, + { + "epoch": 0.05149534561299267, + "grad_norm": 3.8041767704286142, + "learning_rate": 1.9975831059402774e-05, + "loss": 0.6685, + "step": 650 + }, + { + "epoch": 0.05157456922162804, + "grad_norm": 3.8501502113577852, + "learning_rate": 1.9975652434293607e-05, + "loss": 0.5533, + "step": 651 + }, + { + "epoch": 0.05165379283026342, + "grad_norm": 3.1331313213684826, + "learning_rate": 1.9975473152335726e-05, + "loss": 0.4754, + "step": 652 + }, + { + "epoch": 0.05173301643889879, + "grad_norm": 3.0091468362632083, + "learning_rate": 1.9975293213540942e-05, + "loss": 0.4827, + "step": 653 + }, + { + "epoch": 0.05181224004753417, + "grad_norm": 4.045039447568997, + "learning_rate": 1.9975112617921097e-05, + "loss": 0.6001, + "step": 654 + }, + { + "epoch": 0.05189146365616954, + "grad_norm": 4.096840240355661, + "learning_rate": 1.997493136548808e-05, + "loss": 0.6506, + "step": 655 + }, + { + "epoch": 0.05197068726480491, + "grad_norm": 3.57659214395994, + "learning_rate": 1.9974749456253834e-05, + "loss": 0.52, + "step": 656 + }, + { + "epoch": 0.05204991087344028, + "grad_norm": 3.044935910167429, + "learning_rate": 1.9974566890230327e-05, + "loss": 0.5726, + "step": 657 + }, + { + "epoch": 0.05212913448207566, + "grad_norm": 3.152422843320445, + "learning_rate": 1.9974383667429585e-05, + "loss": 0.4847, + "step": 658 + }, + { + "epoch": 0.05220835809071103, + "grad_norm": 3.6432092561424922, + "learning_rate": 1.9974199787863674e-05, + "loss": 0.7887, + "step": 659 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 3.2339171936264464, + "learning_rate": 1.99740152515447e-05, + "loss": 0.5148, + "step": 660 + }, + { + "epoch": 0.05236680530798178, + "grad_norm": 3.5111120243733254, + "learning_rate": 1.9973830058484813e-05, + "loss": 0.5545, + "step": 661 + }, + { + "epoch": 0.05244602891661715, + "grad_norm": 3.4524811312893906, + "learning_rate": 1.9973644208696208e-05, + "loss": 0.5938, + "step": 662 + }, + { + "epoch": 0.052525252525252523, + "grad_norm": 3.1500280594662735, + "learning_rate": 1.9973457702191123e-05, + "loss": 0.5447, + "step": 663 + }, + { + "epoch": 0.0526044761338879, + "grad_norm": 3.499106629291908, + "learning_rate": 1.9973270538981835e-05, + "loss": 0.5781, + "step": 664 + }, + { + "epoch": 0.05268369974252327, + "grad_norm": 3.3935092155293805, + "learning_rate": 1.9973082719080673e-05, + "loss": 0.6165, + "step": 665 + }, + { + "epoch": 0.05276292335115865, + "grad_norm": 3.5242328602672406, + "learning_rate": 1.9972894242499997e-05, + "loss": 0.6048, + "step": 666 + }, + { + "epoch": 0.05284214695979402, + "grad_norm": 4.19898474198839, + "learning_rate": 1.9972705109252227e-05, + "loss": 0.4411, + "step": 667 + }, + { + "epoch": 0.05292137056842939, + "grad_norm": 3.6410434507080462, + "learning_rate": 1.997251531934981e-05, + "loss": 0.6123, + "step": 668 + }, + { + "epoch": 0.053000594177064764, + "grad_norm": 3.280657444340508, + "learning_rate": 1.997232487280524e-05, + "loss": 0.5898, + "step": 669 + }, + { + "epoch": 0.05307981778570014, + "grad_norm": 4.14228533618234, + "learning_rate": 1.9972133769631065e-05, + "loss": 0.5315, + "step": 670 + }, + { + "epoch": 0.05315904139433551, + "grad_norm": 3.887517174916096, + "learning_rate": 1.9971942009839862e-05, + "loss": 0.5781, + "step": 671 + }, + { + "epoch": 0.05323826500297089, + "grad_norm": 3.6687004662067664, + "learning_rate": 1.997174959344426e-05, + "loss": 0.4738, + "step": 672 + }, + { + "epoch": 0.05331748861160626, + "grad_norm": 3.3939499875207657, + "learning_rate": 1.9971556520456928e-05, + "loss": 0.6866, + "step": 673 + }, + { + "epoch": 0.05339671222024163, + "grad_norm": 3.512062688304022, + "learning_rate": 1.997136279089058e-05, + "loss": 0.4789, + "step": 674 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 3.0518120209989705, + "learning_rate": 1.9971168404757972e-05, + "loss": 0.4802, + "step": 675 + }, + { + "epoch": 0.05355515943751238, + "grad_norm": 3.8237950433416903, + "learning_rate": 1.99709733620719e-05, + "loss": 0.5675, + "step": 676 + }, + { + "epoch": 0.05363438304614775, + "grad_norm": 3.195166886044514, + "learning_rate": 1.9970777662845212e-05, + "loss": 0.5459, + "step": 677 + }, + { + "epoch": 0.05371360665478313, + "grad_norm": 3.6631282865603247, + "learning_rate": 1.997058130709079e-05, + "loss": 0.6171, + "step": 678 + }, + { + "epoch": 0.0537928302634185, + "grad_norm": 3.538844194225932, + "learning_rate": 1.9970384294821565e-05, + "loss": 0.5658, + "step": 679 + }, + { + "epoch": 0.05387205387205387, + "grad_norm": 2.9626158006583854, + "learning_rate": 1.9970186626050507e-05, + "loss": 0.4446, + "step": 680 + }, + { + "epoch": 0.053951277480689244, + "grad_norm": 3.2659000156780404, + "learning_rate": 1.9969988300790636e-05, + "loss": 0.6007, + "step": 681 + }, + { + "epoch": 0.05403050108932462, + "grad_norm": 3.4968292044822995, + "learning_rate": 1.9969789319055007e-05, + "loss": 0.5461, + "step": 682 + }, + { + "epoch": 0.05410972469795999, + "grad_norm": 3.8729078974161215, + "learning_rate": 1.996958968085672e-05, + "loss": 0.6358, + "step": 683 + }, + { + "epoch": 0.05418894830659537, + "grad_norm": 3.2081829398888586, + "learning_rate": 1.9969389386208927e-05, + "loss": 0.534, + "step": 684 + }, + { + "epoch": 0.054268171915230735, + "grad_norm": 3.3877845870813723, + "learning_rate": 1.9969188435124812e-05, + "loss": 0.5699, + "step": 685 + }, + { + "epoch": 0.05434739552386611, + "grad_norm": 3.84551200253368, + "learning_rate": 1.9968986827617603e-05, + "loss": 0.5622, + "step": 686 + }, + { + "epoch": 0.054426619132501484, + "grad_norm": 3.4932566370322515, + "learning_rate": 1.9968784563700586e-05, + "loss": 0.5722, + "step": 687 + }, + { + "epoch": 0.05450584274113686, + "grad_norm": 3.2637182722557143, + "learning_rate": 1.9968581643387065e-05, + "loss": 0.5292, + "step": 688 + }, + { + "epoch": 0.054585066349772234, + "grad_norm": 3.5018617824780116, + "learning_rate": 1.9968378066690414e-05, + "loss": 0.5713, + "step": 689 + }, + { + "epoch": 0.05466428995840761, + "grad_norm": 3.2199561612336227, + "learning_rate": 1.996817383362403e-05, + "loss": 0.5341, + "step": 690 + }, + { + "epoch": 0.054743513567042976, + "grad_norm": 3.0004900966966797, + "learning_rate": 1.996796894420136e-05, + "loss": 0.6363, + "step": 691 + }, + { + "epoch": 0.05482273717567835, + "grad_norm": 3.17012251363694, + "learning_rate": 1.9967763398435904e-05, + "loss": 0.5366, + "step": 692 + }, + { + "epoch": 0.054901960784313725, + "grad_norm": 2.8231583747276043, + "learning_rate": 1.9967557196341184e-05, + "loss": 0.4645, + "step": 693 + }, + { + "epoch": 0.0549811843929491, + "grad_norm": 3.1533863075108672, + "learning_rate": 1.996735033793079e-05, + "loss": 0.5379, + "step": 694 + }, + { + "epoch": 0.055060408001584474, + "grad_norm": 3.3719384236897993, + "learning_rate": 1.996714282321833e-05, + "loss": 0.4714, + "step": 695 + }, + { + "epoch": 0.05513963161021985, + "grad_norm": 3.050102698974, + "learning_rate": 1.9966934652217477e-05, + "loss": 0.5152, + "step": 696 + }, + { + "epoch": 0.055218855218855216, + "grad_norm": 3.1816568381011243, + "learning_rate": 1.9966725824941933e-05, + "loss": 0.4703, + "step": 697 + }, + { + "epoch": 0.05529807882749059, + "grad_norm": 3.725553056239348, + "learning_rate": 1.9966516341405452e-05, + "loss": 0.6012, + "step": 698 + }, + { + "epoch": 0.055377302436125965, + "grad_norm": 3.340483571981605, + "learning_rate": 1.9966306201621826e-05, + "loss": 0.7178, + "step": 699 + }, + { + "epoch": 0.05545652604476134, + "grad_norm": 3.672446409460007, + "learning_rate": 1.996609540560489e-05, + "loss": 0.629, + "step": 700 + }, + { + "epoch": 0.055535749653396714, + "grad_norm": 3.2971947821087957, + "learning_rate": 1.9965883953368527e-05, + "loss": 0.5387, + "step": 701 + }, + { + "epoch": 0.05561497326203209, + "grad_norm": 3.7545851473692924, + "learning_rate": 1.9965671844926656e-05, + "loss": 0.5285, + "step": 702 + }, + { + "epoch": 0.055694196870667456, + "grad_norm": 3.7044687887216705, + "learning_rate": 1.9965459080293247e-05, + "loss": 0.6099, + "step": 703 + }, + { + "epoch": 0.05577342047930283, + "grad_norm": 2.8440632568968223, + "learning_rate": 1.9965245659482312e-05, + "loss": 0.4754, + "step": 704 + }, + { + "epoch": 0.055852644087938205, + "grad_norm": 3.3518752387605097, + "learning_rate": 1.9965031582507896e-05, + "loss": 0.5202, + "step": 705 + }, + { + "epoch": 0.05593186769657358, + "grad_norm": 3.322222655996487, + "learning_rate": 1.99648168493841e-05, + "loss": 0.5195, + "step": 706 + }, + { + "epoch": 0.056011091305208954, + "grad_norm": 3.604406890842087, + "learning_rate": 1.996460146012506e-05, + "loss": 0.7097, + "step": 707 + }, + { + "epoch": 0.05609031491384433, + "grad_norm": 3.0958758769349424, + "learning_rate": 1.996438541474496e-05, + "loss": 0.6299, + "step": 708 + }, + { + "epoch": 0.056169538522479696, + "grad_norm": 4.564409778153522, + "learning_rate": 1.996416871325803e-05, + "loss": 0.6776, + "step": 709 + }, + { + "epoch": 0.05624876213111507, + "grad_norm": 3.217151554432102, + "learning_rate": 1.9963951355678533e-05, + "loss": 0.4426, + "step": 710 + }, + { + "epoch": 0.056327985739750445, + "grad_norm": 3.893164198008113, + "learning_rate": 1.996373334202078e-05, + "loss": 0.7279, + "step": 711 + }, + { + "epoch": 0.05640720934838582, + "grad_norm": 3.483196537256004, + "learning_rate": 1.9963514672299135e-05, + "loss": 0.5573, + "step": 712 + }, + { + "epoch": 0.056486432957021195, + "grad_norm": 3.2607345588255865, + "learning_rate": 1.9963295346527984e-05, + "loss": 0.5353, + "step": 713 + }, + { + "epoch": 0.05656565656565657, + "grad_norm": 3.9139375896392177, + "learning_rate": 1.996307536472178e-05, + "loss": 0.5619, + "step": 714 + }, + { + "epoch": 0.05664488017429194, + "grad_norm": 3.442690740042697, + "learning_rate": 1.9962854726894997e-05, + "loss": 0.5902, + "step": 715 + }, + { + "epoch": 0.05672410378292731, + "grad_norm": 3.6606012908133336, + "learning_rate": 1.9962633433062174e-05, + "loss": 0.5121, + "step": 716 + }, + { + "epoch": 0.056803327391562686, + "grad_norm": 3.3497865631212824, + "learning_rate": 1.996241148323787e-05, + "loss": 0.4978, + "step": 717 + }, + { + "epoch": 0.05688255100019806, + "grad_norm": 3.4827552644308226, + "learning_rate": 1.996218887743671e-05, + "loss": 0.5625, + "step": 718 + }, + { + "epoch": 0.056961774608833435, + "grad_norm": 3.2127906527141388, + "learning_rate": 1.996196561567335e-05, + "loss": 0.5686, + "step": 719 + }, + { + "epoch": 0.0570409982174688, + "grad_norm": 3.2648095261708776, + "learning_rate": 1.996174169796248e-05, + "loss": 0.5225, + "step": 720 + }, + { + "epoch": 0.05712022182610418, + "grad_norm": 2.9951694124526034, + "learning_rate": 1.996151712431886e-05, + "loss": 0.4255, + "step": 721 + }, + { + "epoch": 0.05719944543473955, + "grad_norm": 3.442800049571805, + "learning_rate": 1.9961291894757267e-05, + "loss": 0.5549, + "step": 722 + }, + { + "epoch": 0.057278669043374926, + "grad_norm": 3.2800047603653195, + "learning_rate": 1.9961066009292532e-05, + "loss": 0.5902, + "step": 723 + }, + { + "epoch": 0.0573578926520103, + "grad_norm": 3.0687037289043166, + "learning_rate": 1.9960839467939534e-05, + "loss": 0.4605, + "step": 724 + }, + { + "epoch": 0.057437116260645675, + "grad_norm": 3.2619902936278753, + "learning_rate": 1.996061227071318e-05, + "loss": 0.599, + "step": 725 + }, + { + "epoch": 0.05751633986928104, + "grad_norm": 2.771994014100968, + "learning_rate": 1.996038441762844e-05, + "loss": 0.4764, + "step": 726 + }, + { + "epoch": 0.05759556347791642, + "grad_norm": 3.5187912842147724, + "learning_rate": 1.9960155908700306e-05, + "loss": 0.3719, + "step": 727 + }, + { + "epoch": 0.05767478708655179, + "grad_norm": 3.808869301374037, + "learning_rate": 1.9959926743943836e-05, + "loss": 0.6481, + "step": 728 + }, + { + "epoch": 0.057754010695187166, + "grad_norm": 3.1915056219699816, + "learning_rate": 1.9959696923374113e-05, + "loss": 0.4986, + "step": 729 + }, + { + "epoch": 0.05783323430382254, + "grad_norm": 3.161022277767463, + "learning_rate": 1.995946644700627e-05, + "loss": 0.3726, + "step": 730 + }, + { + "epoch": 0.057912457912457915, + "grad_norm": 3.583963080905145, + "learning_rate": 1.9959235314855485e-05, + "loss": 0.5534, + "step": 731 + }, + { + "epoch": 0.05799168152109328, + "grad_norm": 4.065214801449088, + "learning_rate": 1.9959003526936972e-05, + "loss": 0.6399, + "step": 732 + }, + { + "epoch": 0.05807090512972866, + "grad_norm": 3.6006608035634433, + "learning_rate": 1.9958771083266e-05, + "loss": 0.4556, + "step": 733 + }, + { + "epoch": 0.05815012873836403, + "grad_norm": 3.809853857364322, + "learning_rate": 1.995853798385787e-05, + "loss": 0.6825, + "step": 734 + }, + { + "epoch": 0.058229352346999406, + "grad_norm": 3.2408338330169797, + "learning_rate": 1.9958304228727928e-05, + "loss": 0.4771, + "step": 735 + }, + { + "epoch": 0.05830857595563478, + "grad_norm": 3.1505271905137024, + "learning_rate": 1.995806981789157e-05, + "loss": 0.4676, + "step": 736 + }, + { + "epoch": 0.058387799564270156, + "grad_norm": 3.7435750688790135, + "learning_rate": 1.9957834751364232e-05, + "loss": 0.5649, + "step": 737 + }, + { + "epoch": 0.05846702317290552, + "grad_norm": 3.468380145982312, + "learning_rate": 1.995759902916139e-05, + "loss": 0.5544, + "step": 738 + }, + { + "epoch": 0.0585462467815409, + "grad_norm": 3.1078384827281305, + "learning_rate": 1.995736265129856e-05, + "loss": 0.4375, + "step": 739 + }, + { + "epoch": 0.05862547039017627, + "grad_norm": 3.299000059642858, + "learning_rate": 1.9957125617791314e-05, + "loss": 0.5883, + "step": 740 + }, + { + "epoch": 0.05870469399881165, + "grad_norm": 3.1178855470679663, + "learning_rate": 1.995688792865526e-05, + "loss": 0.4847, + "step": 741 + }, + { + "epoch": 0.05878391760744702, + "grad_norm": 2.7074170027979565, + "learning_rate": 1.995664958390604e-05, + "loss": 0.6599, + "step": 742 + }, + { + "epoch": 0.058863141216082396, + "grad_norm": 3.3223520370253254, + "learning_rate": 1.995641058355936e-05, + "loss": 0.4712, + "step": 743 + }, + { + "epoch": 0.05894236482471776, + "grad_norm": 3.1110034863320193, + "learning_rate": 1.9956170927630946e-05, + "loss": 0.428, + "step": 744 + }, + { + "epoch": 0.05902158843335314, + "grad_norm": 3.102205220331527, + "learning_rate": 1.9955930616136582e-05, + "loss": 0.5479, + "step": 745 + }, + { + "epoch": 0.05910081204198851, + "grad_norm": 3.677342014539697, + "learning_rate": 1.995568964909209e-05, + "loss": 0.5976, + "step": 746 + }, + { + "epoch": 0.05918003565062389, + "grad_norm": 3.5375691834245573, + "learning_rate": 1.995544802651334e-05, + "loss": 0.557, + "step": 747 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 2.985052620237613, + "learning_rate": 1.995520574841624e-05, + "loss": 0.5389, + "step": 748 + }, + { + "epoch": 0.059338482867894636, + "grad_norm": 3.258792656618429, + "learning_rate": 1.9954962814816744e-05, + "loss": 0.595, + "step": 749 + }, + { + "epoch": 0.059417706476530004, + "grad_norm": 3.4019492829900817, + "learning_rate": 1.9954719225730847e-05, + "loss": 0.4949, + "step": 750 + }, + { + "epoch": 0.05949693008516538, + "grad_norm": 3.3758435744471678, + "learning_rate": 1.995447498117459e-05, + "loss": 0.4804, + "step": 751 + }, + { + "epoch": 0.05957615369380075, + "grad_norm": 3.248604476746306, + "learning_rate": 1.9954230081164047e-05, + "loss": 0.5321, + "step": 752 + }, + { + "epoch": 0.05965537730243613, + "grad_norm": 3.603087466697579, + "learning_rate": 1.9953984525715354e-05, + "loss": 0.6247, + "step": 753 + }, + { + "epoch": 0.0597346009110715, + "grad_norm": 3.1764205510708643, + "learning_rate": 1.9953738314844676e-05, + "loss": 0.5138, + "step": 754 + }, + { + "epoch": 0.05981382451970687, + "grad_norm": 3.785327413957289, + "learning_rate": 1.9953491448568222e-05, + "loss": 0.5865, + "step": 755 + }, + { + "epoch": 0.059893048128342244, + "grad_norm": 3.0494167171533517, + "learning_rate": 1.9953243926902254e-05, + "loss": 0.5465, + "step": 756 + }, + { + "epoch": 0.05997227173697762, + "grad_norm": 3.758585541255481, + "learning_rate": 1.995299574986306e-05, + "loss": 0.6202, + "step": 757 + }, + { + "epoch": 0.06005149534561299, + "grad_norm": 2.8546698404153905, + "learning_rate": 1.9952746917466988e-05, + "loss": 0.5589, + "step": 758 + }, + { + "epoch": 0.06013071895424837, + "grad_norm": 3.381098485240035, + "learning_rate": 1.9952497429730423e-05, + "loss": 0.5346, + "step": 759 + }, + { + "epoch": 0.06020994256288374, + "grad_norm": 4.071612969289404, + "learning_rate": 1.9952247286669787e-05, + "loss": 0.5611, + "step": 760 + }, + { + "epoch": 0.06028916617151911, + "grad_norm": 3.5487326277659577, + "learning_rate": 1.995199648830156e-05, + "loss": 0.5495, + "step": 761 + }, + { + "epoch": 0.060368389780154484, + "grad_norm": 3.929517760998896, + "learning_rate": 1.9951745034642245e-05, + "loss": 0.582, + "step": 762 + }, + { + "epoch": 0.06044761338878986, + "grad_norm": 3.572933592182264, + "learning_rate": 1.995149292570841e-05, + "loss": 0.697, + "step": 763 + }, + { + "epoch": 0.06052683699742523, + "grad_norm": 3.3905931641718583, + "learning_rate": 1.9951240161516643e-05, + "loss": 0.4269, + "step": 764 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 4.048560499346723, + "learning_rate": 1.9950986742083594e-05, + "loss": 0.7241, + "step": 765 + }, + { + "epoch": 0.06068528421469598, + "grad_norm": 3.3746790643609756, + "learning_rate": 1.9950732667425953e-05, + "loss": 0.6714, + "step": 766 + }, + { + "epoch": 0.06076450782333135, + "grad_norm": 3.0467545961335816, + "learning_rate": 1.9950477937560442e-05, + "loss": 0.5577, + "step": 767 + }, + { + "epoch": 0.060843731431966724, + "grad_norm": 5.485705362789386, + "learning_rate": 1.995022255250384e-05, + "loss": 0.5648, + "step": 768 + }, + { + "epoch": 0.0609229550406021, + "grad_norm": 2.9539704464487593, + "learning_rate": 1.9949966512272964e-05, + "loss": 0.569, + "step": 769 + }, + { + "epoch": 0.06100217864923747, + "grad_norm": 4.318479392563151, + "learning_rate": 1.994970981688466e-05, + "loss": 0.6147, + "step": 770 + }, + { + "epoch": 0.06108140225787285, + "grad_norm": 3.5923912297788547, + "learning_rate": 1.9949452466355847e-05, + "loss": 0.5515, + "step": 771 + }, + { + "epoch": 0.06116062586650822, + "grad_norm": 3.303299276932963, + "learning_rate": 1.9949194460703462e-05, + "loss": 0.5222, + "step": 772 + }, + { + "epoch": 0.06123984947514359, + "grad_norm": 3.088897216647192, + "learning_rate": 1.9948935799944492e-05, + "loss": 0.5554, + "step": 773 + }, + { + "epoch": 0.061319073083778965, + "grad_norm": 3.2686023826405126, + "learning_rate": 1.994867648409597e-05, + "loss": 0.5565, + "step": 774 + }, + { + "epoch": 0.06139829669241434, + "grad_norm": 3.2418057508706632, + "learning_rate": 1.9948416513174976e-05, + "loss": 0.6196, + "step": 775 + }, + { + "epoch": 0.061477520301049714, + "grad_norm": 3.2647364379838946, + "learning_rate": 1.994815588719862e-05, + "loss": 0.5637, + "step": 776 + }, + { + "epoch": 0.06155674390968509, + "grad_norm": 3.1972071587145496, + "learning_rate": 1.9947894606184065e-05, + "loss": 0.538, + "step": 777 + }, + { + "epoch": 0.06163596751832046, + "grad_norm": 5.0454017184462705, + "learning_rate": 1.9947632670148517e-05, + "loss": 0.6505, + "step": 778 + }, + { + "epoch": 0.06171519112695583, + "grad_norm": 4.580361567919214, + "learning_rate": 1.9947370079109224e-05, + "loss": 0.6131, + "step": 779 + }, + { + "epoch": 0.061794414735591205, + "grad_norm": 3.5160028389933253, + "learning_rate": 1.9947106833083474e-05, + "loss": 0.6069, + "step": 780 + }, + { + "epoch": 0.06187363834422658, + "grad_norm": 4.3939801486340535, + "learning_rate": 1.9946842932088603e-05, + "loss": 0.6809, + "step": 781 + }, + { + "epoch": 0.061952861952861954, + "grad_norm": 3.5523256337609013, + "learning_rate": 1.9946578376141985e-05, + "loss": 0.5473, + "step": 782 + }, + { + "epoch": 0.06203208556149733, + "grad_norm": 3.084117043302228, + "learning_rate": 1.9946313165261042e-05, + "loss": 0.4179, + "step": 783 + }, + { + "epoch": 0.062111309170132696, + "grad_norm": 3.236057231283532, + "learning_rate": 1.9946047299463234e-05, + "loss": 0.392, + "step": 784 + }, + { + "epoch": 0.06219053277876807, + "grad_norm": 4.1043521682677, + "learning_rate": 1.994578077876607e-05, + "loss": 0.546, + "step": 785 + }, + { + "epoch": 0.062269756387403445, + "grad_norm": 2.875456920695374, + "learning_rate": 1.9945513603187096e-05, + "loss": 0.5148, + "step": 786 + }, + { + "epoch": 0.06234897999603882, + "grad_norm": 3.3628937602042517, + "learning_rate": 1.994524577274391e-05, + "loss": 0.5202, + "step": 787 + }, + { + "epoch": 0.062428203604674194, + "grad_norm": 3.2304470588568073, + "learning_rate": 1.994497728745414e-05, + "loss": 0.6103, + "step": 788 + }, + { + "epoch": 0.06250742721330957, + "grad_norm": 3.6065857419719354, + "learning_rate": 1.9944708147335466e-05, + "loss": 0.5804, + "step": 789 + }, + { + "epoch": 0.06258665082194494, + "grad_norm": 4.08051276743454, + "learning_rate": 1.9944438352405614e-05, + "loss": 0.6671, + "step": 790 + }, + { + "epoch": 0.06266587443058032, + "grad_norm": 4.008142336960553, + "learning_rate": 1.9944167902682345e-05, + "loss": 0.5497, + "step": 791 + }, + { + "epoch": 0.06274509803921569, + "grad_norm": 3.1747393804679223, + "learning_rate": 1.994389679818347e-05, + "loss": 0.4977, + "step": 792 + }, + { + "epoch": 0.06282432164785105, + "grad_norm": 3.331522779491226, + "learning_rate": 1.9943625038926834e-05, + "loss": 0.6059, + "step": 793 + }, + { + "epoch": 0.06290354525648643, + "grad_norm": 2.6405997147387015, + "learning_rate": 1.9943352624930336e-05, + "loss": 0.4948, + "step": 794 + }, + { + "epoch": 0.0629827688651218, + "grad_norm": 2.873587286137695, + "learning_rate": 1.9943079556211915e-05, + "loss": 0.5497, + "step": 795 + }, + { + "epoch": 0.06306199247375718, + "grad_norm": 3.7373979810372138, + "learning_rate": 1.9942805832789548e-05, + "loss": 0.4954, + "step": 796 + }, + { + "epoch": 0.06314121608239255, + "grad_norm": 4.052674250578069, + "learning_rate": 1.9942531454681254e-05, + "loss": 0.5153, + "step": 797 + }, + { + "epoch": 0.06322043969102793, + "grad_norm": 3.265300570351352, + "learning_rate": 1.994225642190511e-05, + "loss": 0.6229, + "step": 798 + }, + { + "epoch": 0.0632996632996633, + "grad_norm": 3.5495566354263843, + "learning_rate": 1.9941980734479214e-05, + "loss": 0.643, + "step": 799 + }, + { + "epoch": 0.06337888690829867, + "grad_norm": 3.8547303759832974, + "learning_rate": 1.994170439242173e-05, + "loss": 0.4973, + "step": 800 + }, + { + "epoch": 0.06345811051693405, + "grad_norm": 4.494308152772487, + "learning_rate": 1.9941427395750844e-05, + "loss": 0.6769, + "step": 801 + }, + { + "epoch": 0.06353733412556942, + "grad_norm": 3.1210903642974483, + "learning_rate": 1.99411497444848e-05, + "loss": 0.4375, + "step": 802 + }, + { + "epoch": 0.0636165577342048, + "grad_norm": 2.9562743507116824, + "learning_rate": 1.994087143864188e-05, + "loss": 0.4864, + "step": 803 + }, + { + "epoch": 0.06369578134284017, + "grad_norm": 3.0621586910450884, + "learning_rate": 1.994059247824041e-05, + "loss": 0.5716, + "step": 804 + }, + { + "epoch": 0.06377500495147553, + "grad_norm": 3.174053051013673, + "learning_rate": 1.994031286329875e-05, + "loss": 0.4088, + "step": 805 + }, + { + "epoch": 0.06385422856011091, + "grad_norm": 2.3628598277423674, + "learning_rate": 1.9940032593835324e-05, + "loss": 0.3996, + "step": 806 + }, + { + "epoch": 0.06393345216874628, + "grad_norm": 3.5657883786748243, + "learning_rate": 1.993975166986858e-05, + "loss": 0.5882, + "step": 807 + }, + { + "epoch": 0.06401267577738166, + "grad_norm": 3.106973540018159, + "learning_rate": 1.9939470091417012e-05, + "loss": 0.5068, + "step": 808 + }, + { + "epoch": 0.06409189938601703, + "grad_norm": 2.8094472265240698, + "learning_rate": 1.9939187858499166e-05, + "loss": 0.6059, + "step": 809 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 3.0464270934163484, + "learning_rate": 1.9938904971133626e-05, + "loss": 0.5343, + "step": 810 + }, + { + "epoch": 0.06425034660328778, + "grad_norm": 3.318874701089729, + "learning_rate": 1.9938621429339012e-05, + "loss": 0.5146, + "step": 811 + }, + { + "epoch": 0.06432957021192315, + "grad_norm": 3.2667968994129364, + "learning_rate": 1.9938337233134e-05, + "loss": 0.4356, + "step": 812 + }, + { + "epoch": 0.06440879382055853, + "grad_norm": 3.3281028621426345, + "learning_rate": 1.9938052382537304e-05, + "loss": 0.4901, + "step": 813 + }, + { + "epoch": 0.0644880174291939, + "grad_norm": 3.28552622467835, + "learning_rate": 1.9937766877567676e-05, + "loss": 0.5669, + "step": 814 + }, + { + "epoch": 0.06456724103782928, + "grad_norm": 3.6196509856752423, + "learning_rate": 1.9937480718243914e-05, + "loss": 0.5781, + "step": 815 + }, + { + "epoch": 0.06464646464646465, + "grad_norm": 3.602465658372545, + "learning_rate": 1.9937193904584865e-05, + "loss": 0.6029, + "step": 816 + }, + { + "epoch": 0.06472568825510001, + "grad_norm": 3.0275404943149806, + "learning_rate": 1.9936906436609413e-05, + "loss": 0.5235, + "step": 817 + }, + { + "epoch": 0.0648049118637354, + "grad_norm": 3.6567518441982276, + "learning_rate": 1.9936618314336486e-05, + "loss": 0.5523, + "step": 818 + }, + { + "epoch": 0.06488413547237076, + "grad_norm": 3.600521347136226, + "learning_rate": 1.9936329537785054e-05, + "loss": 0.434, + "step": 819 + }, + { + "epoch": 0.06496335908100614, + "grad_norm": 2.814730311151901, + "learning_rate": 1.9936040106974132e-05, + "loss": 0.5441, + "step": 820 + }, + { + "epoch": 0.06504258268964151, + "grad_norm": 3.0705772882904383, + "learning_rate": 1.9935750021922778e-05, + "loss": 0.5583, + "step": 821 + }, + { + "epoch": 0.06512180629827688, + "grad_norm": 3.153976794235165, + "learning_rate": 1.993545928265009e-05, + "loss": 0.5385, + "step": 822 + }, + { + "epoch": 0.06520102990691226, + "grad_norm": 3.2671450413470784, + "learning_rate": 1.993516788917522e-05, + "loss": 0.5837, + "step": 823 + }, + { + "epoch": 0.06528025351554763, + "grad_norm": 3.0541618783472986, + "learning_rate": 1.9934875841517346e-05, + "loss": 0.477, + "step": 824 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 3.0794903304719106, + "learning_rate": 1.9934583139695703e-05, + "loss": 0.5423, + "step": 825 + }, + { + "epoch": 0.06543870073281838, + "grad_norm": 3.3781063720373345, + "learning_rate": 1.9934289783729564e-05, + "loss": 0.5394, + "step": 826 + }, + { + "epoch": 0.06551792434145376, + "grad_norm": 3.226646288433327, + "learning_rate": 1.993399577363824e-05, + "loss": 0.4867, + "step": 827 + }, + { + "epoch": 0.06559714795008913, + "grad_norm": 3.8081904719350765, + "learning_rate": 1.9933701109441093e-05, + "loss": 0.5372, + "step": 828 + }, + { + "epoch": 0.0656763715587245, + "grad_norm": 3.1738409548591284, + "learning_rate": 1.993340579115753e-05, + "loss": 0.5519, + "step": 829 + }, + { + "epoch": 0.06575559516735988, + "grad_norm": 3.1668036877365227, + "learning_rate": 1.993310981880699e-05, + "loss": 0.6013, + "step": 830 + }, + { + "epoch": 0.06583481877599524, + "grad_norm": 3.249838730162251, + "learning_rate": 1.9932813192408964e-05, + "loss": 0.5012, + "step": 831 + }, + { + "epoch": 0.06591404238463063, + "grad_norm": 3.1605404828049664, + "learning_rate": 1.9932515911982983e-05, + "loss": 0.5964, + "step": 832 + }, + { + "epoch": 0.06599326599326599, + "grad_norm": 3.3002585332126313, + "learning_rate": 1.993221797754862e-05, + "loss": 0.6101, + "step": 833 + }, + { + "epoch": 0.06607248960190136, + "grad_norm": 3.544986135899622, + "learning_rate": 1.9931919389125496e-05, + "loss": 0.4522, + "step": 834 + }, + { + "epoch": 0.06615171321053674, + "grad_norm": 3.1080756194695107, + "learning_rate": 1.9931620146733264e-05, + "loss": 0.555, + "step": 835 + }, + { + "epoch": 0.06623093681917211, + "grad_norm": 2.9532720023537, + "learning_rate": 1.993132025039164e-05, + "loss": 0.5338, + "step": 836 + }, + { + "epoch": 0.06631016042780749, + "grad_norm": 2.9387886959896665, + "learning_rate": 1.9931019700120363e-05, + "loss": 0.4974, + "step": 837 + }, + { + "epoch": 0.06638938403644286, + "grad_norm": 2.8833467108706756, + "learning_rate": 1.9930718495939222e-05, + "loss": 0.4894, + "step": 838 + }, + { + "epoch": 0.06646860764507824, + "grad_norm": 2.982357852199496, + "learning_rate": 1.9930416637868053e-05, + "loss": 0.4433, + "step": 839 + }, + { + "epoch": 0.06654783125371361, + "grad_norm": 3.517244070083994, + "learning_rate": 1.993011412592673e-05, + "loss": 0.5498, + "step": 840 + }, + { + "epoch": 0.06662705486234898, + "grad_norm": 3.6511980208377843, + "learning_rate": 1.992981096013517e-05, + "loss": 0.5865, + "step": 841 + }, + { + "epoch": 0.06670627847098436, + "grad_norm": 3.0976463342317753, + "learning_rate": 1.9929507140513342e-05, + "loss": 0.6326, + "step": 842 + }, + { + "epoch": 0.06678550207961972, + "grad_norm": 3.759032585907262, + "learning_rate": 1.9929202667081246e-05, + "loss": 0.4802, + "step": 843 + }, + { + "epoch": 0.0668647256882551, + "grad_norm": 3.152741725864914, + "learning_rate": 1.9928897539858926e-05, + "loss": 0.4866, + "step": 844 + }, + { + "epoch": 0.06694394929689047, + "grad_norm": 3.006762251009012, + "learning_rate": 1.992859175886648e-05, + "loss": 0.5257, + "step": 845 + }, + { + "epoch": 0.06702317290552584, + "grad_norm": 3.1440988527317493, + "learning_rate": 1.9928285324124038e-05, + "loss": 0.4722, + "step": 846 + }, + { + "epoch": 0.06710239651416122, + "grad_norm": 3.9346948563166375, + "learning_rate": 1.9927978235651782e-05, + "loss": 0.5036, + "step": 847 + }, + { + "epoch": 0.06718162012279659, + "grad_norm": 3.261745891095376, + "learning_rate": 1.992767049346993e-05, + "loss": 0.5603, + "step": 848 + }, + { + "epoch": 0.06726084373143197, + "grad_norm": 3.4979426665820537, + "learning_rate": 1.9927362097598746e-05, + "loss": 0.5236, + "step": 849 + }, + { + "epoch": 0.06734006734006734, + "grad_norm": 3.701323033633284, + "learning_rate": 1.9927053048058534e-05, + "loss": 0.53, + "step": 850 + }, + { + "epoch": 0.06741929094870272, + "grad_norm": 4.27900706688922, + "learning_rate": 1.9926743344869645e-05, + "loss": 0.5007, + "step": 851 + }, + { + "epoch": 0.06749851455733809, + "grad_norm": 3.2862540823863315, + "learning_rate": 1.992643298805247e-05, + "loss": 0.55, + "step": 852 + }, + { + "epoch": 0.06757773816597346, + "grad_norm": 4.161286276262577, + "learning_rate": 1.9926121977627447e-05, + "loss": 0.5335, + "step": 853 + }, + { + "epoch": 0.06765696177460884, + "grad_norm": 3.3258757718395535, + "learning_rate": 1.9925810313615052e-05, + "loss": 0.4599, + "step": 854 + }, + { + "epoch": 0.0677361853832442, + "grad_norm": 2.9489275773156565, + "learning_rate": 1.9925497996035807e-05, + "loss": 0.5385, + "step": 855 + }, + { + "epoch": 0.06781540899187959, + "grad_norm": 3.016060849369381, + "learning_rate": 1.992518502491028e-05, + "loss": 0.5752, + "step": 856 + }, + { + "epoch": 0.06789463260051495, + "grad_norm": 3.281559578229815, + "learning_rate": 1.9924871400259074e-05, + "loss": 0.5521, + "step": 857 + }, + { + "epoch": 0.06797385620915032, + "grad_norm": 3.3400649850747537, + "learning_rate": 1.9924557122102843e-05, + "loss": 0.6003, + "step": 858 + }, + { + "epoch": 0.0680530798177857, + "grad_norm": 3.0509399072498304, + "learning_rate": 1.9924242190462276e-05, + "loss": 0.6108, + "step": 859 + }, + { + "epoch": 0.06813230342642107, + "grad_norm": 3.789874651458056, + "learning_rate": 1.992392660535812e-05, + "loss": 0.7623, + "step": 860 + }, + { + "epoch": 0.06821152703505645, + "grad_norm": 3.2651667785669245, + "learning_rate": 1.9923610366811142e-05, + "loss": 0.695, + "step": 861 + }, + { + "epoch": 0.06829075064369182, + "grad_norm": 3.4776044999931193, + "learning_rate": 1.9923293474842175e-05, + "loss": 0.5909, + "step": 862 + }, + { + "epoch": 0.06836997425232719, + "grad_norm": 3.611249398263171, + "learning_rate": 1.9922975929472076e-05, + "loss": 0.5961, + "step": 863 + }, + { + "epoch": 0.06844919786096257, + "grad_norm": 3.506222268871883, + "learning_rate": 1.9922657730721758e-05, + "loss": 0.5333, + "step": 864 + }, + { + "epoch": 0.06852842146959794, + "grad_norm": 3.19384462797647, + "learning_rate": 1.9922338878612177e-05, + "loss": 0.6063, + "step": 865 + }, + { + "epoch": 0.06860764507823332, + "grad_norm": 3.1499780815233187, + "learning_rate": 1.9922019373164324e-05, + "loss": 0.4373, + "step": 866 + }, + { + "epoch": 0.06868686868686869, + "grad_norm": 3.220586048290242, + "learning_rate": 1.9921699214399238e-05, + "loss": 0.5253, + "step": 867 + }, + { + "epoch": 0.06876609229550407, + "grad_norm": 3.1435798789486387, + "learning_rate": 1.9921378402337996e-05, + "loss": 0.5183, + "step": 868 + }, + { + "epoch": 0.06884531590413943, + "grad_norm": 3.208414684842551, + "learning_rate": 1.9921056937001725e-05, + "loss": 0.5066, + "step": 869 + }, + { + "epoch": 0.0689245395127748, + "grad_norm": 3.2094310440452922, + "learning_rate": 1.9920734818411592e-05, + "loss": 0.4729, + "step": 870 + }, + { + "epoch": 0.06900376312141018, + "grad_norm": 3.331217135565873, + "learning_rate": 1.9920412046588807e-05, + "loss": 0.5208, + "step": 871 + }, + { + "epoch": 0.06908298673004555, + "grad_norm": 2.769705825482414, + "learning_rate": 1.992008862155462e-05, + "loss": 0.5024, + "step": 872 + }, + { + "epoch": 0.06916221033868093, + "grad_norm": 2.9648064214569754, + "learning_rate": 1.9919764543330334e-05, + "loss": 0.4394, + "step": 873 + }, + { + "epoch": 0.0692414339473163, + "grad_norm": 3.016760835507678, + "learning_rate": 1.9919439811937283e-05, + "loss": 0.5889, + "step": 874 + }, + { + "epoch": 0.06932065755595167, + "grad_norm": 4.31693011635622, + "learning_rate": 1.991911442739685e-05, + "loss": 0.5902, + "step": 875 + }, + { + "epoch": 0.06939988116458705, + "grad_norm": 3.3614015321940096, + "learning_rate": 1.9918788389730457e-05, + "loss": 0.4873, + "step": 876 + }, + { + "epoch": 0.06947910477322242, + "grad_norm": 3.405796265129666, + "learning_rate": 1.9918461698959576e-05, + "loss": 0.5877, + "step": 877 + }, + { + "epoch": 0.0695583283818578, + "grad_norm": 2.7882997771111797, + "learning_rate": 1.9918134355105717e-05, + "loss": 0.4429, + "step": 878 + }, + { + "epoch": 0.06963755199049317, + "grad_norm": 3.405965968471252, + "learning_rate": 1.9917806358190434e-05, + "loss": 0.4426, + "step": 879 + }, + { + "epoch": 0.06971677559912855, + "grad_norm": 2.8137216742036624, + "learning_rate": 1.9917477708235324e-05, + "loss": 0.5395, + "step": 880 + }, + { + "epoch": 0.06979599920776391, + "grad_norm": 3.59334653364238, + "learning_rate": 1.9917148405262027e-05, + "loss": 0.6111, + "step": 881 + }, + { + "epoch": 0.06987522281639928, + "grad_norm": 3.3384551086839624, + "learning_rate": 1.9916818449292223e-05, + "loss": 0.5071, + "step": 882 + }, + { + "epoch": 0.06995444642503466, + "grad_norm": 2.9900469898376936, + "learning_rate": 1.9916487840347644e-05, + "loss": 0.5367, + "step": 883 + }, + { + "epoch": 0.07003367003367003, + "grad_norm": 3.2517739697263717, + "learning_rate": 1.9916156578450052e-05, + "loss": 0.5962, + "step": 884 + }, + { + "epoch": 0.07011289364230541, + "grad_norm": 3.3366988670980895, + "learning_rate": 1.9915824663621267e-05, + "loss": 0.5497, + "step": 885 + }, + { + "epoch": 0.07019211725094078, + "grad_norm": 2.9955746621282087, + "learning_rate": 1.991549209588314e-05, + "loss": 0.3867, + "step": 886 + }, + { + "epoch": 0.07027134085957615, + "grad_norm": 2.896430710793849, + "learning_rate": 1.9915158875257566e-05, + "loss": 0.4654, + "step": 887 + }, + { + "epoch": 0.07035056446821153, + "grad_norm": 3.3018953564276354, + "learning_rate": 1.991482500176649e-05, + "loss": 0.4525, + "step": 888 + }, + { + "epoch": 0.0704297880768469, + "grad_norm": 3.821556077636759, + "learning_rate": 1.9914490475431892e-05, + "loss": 0.5595, + "step": 889 + }, + { + "epoch": 0.07050901168548228, + "grad_norm": 3.1537818655838143, + "learning_rate": 1.9914155296275804e-05, + "loss": 0.4943, + "step": 890 + }, + { + "epoch": 0.07058823529411765, + "grad_norm": 3.8677166672324432, + "learning_rate": 1.9913819464320295e-05, + "loss": 0.424, + "step": 891 + }, + { + "epoch": 0.07066745890275301, + "grad_norm": 3.2623026992414483, + "learning_rate": 1.9913482979587473e-05, + "loss": 0.4779, + "step": 892 + }, + { + "epoch": 0.0707466825113884, + "grad_norm": 3.8429588120053153, + "learning_rate": 1.9913145842099503e-05, + "loss": 0.5283, + "step": 893 + }, + { + "epoch": 0.07082590612002376, + "grad_norm": 3.1054141220693596, + "learning_rate": 1.9912808051878575e-05, + "loss": 0.4845, + "step": 894 + }, + { + "epoch": 0.07090512972865914, + "grad_norm": 3.2737740431901514, + "learning_rate": 1.9912469608946932e-05, + "loss": 0.4593, + "step": 895 + }, + { + "epoch": 0.07098435333729451, + "grad_norm": 2.8873162494726703, + "learning_rate": 1.9912130513326863e-05, + "loss": 0.5449, + "step": 896 + }, + { + "epoch": 0.0710635769459299, + "grad_norm": 3.5032690698395452, + "learning_rate": 1.9911790765040697e-05, + "loss": 0.6493, + "step": 897 + }, + { + "epoch": 0.07114280055456526, + "grad_norm": 3.232292158847636, + "learning_rate": 1.9911450364110798e-05, + "loss": 0.5729, + "step": 898 + }, + { + "epoch": 0.07122202416320063, + "grad_norm": 2.902377848042208, + "learning_rate": 1.9911109310559583e-05, + "loss": 0.4728, + "step": 899 + }, + { + "epoch": 0.07130124777183601, + "grad_norm": 2.743137754102563, + "learning_rate": 1.991076760440951e-05, + "loss": 0.54, + "step": 900 + }, + { + "epoch": 0.07138047138047138, + "grad_norm": 2.6029961026870536, + "learning_rate": 1.991042524568308e-05, + "loss": 0.5454, + "step": 901 + }, + { + "epoch": 0.07145969498910676, + "grad_norm": 2.9678516620106565, + "learning_rate": 1.991008223440283e-05, + "loss": 0.533, + "step": 902 + }, + { + "epoch": 0.07153891859774213, + "grad_norm": 3.3123247173520003, + "learning_rate": 1.9909738570591352e-05, + "loss": 0.4731, + "step": 903 + }, + { + "epoch": 0.0716181422063775, + "grad_norm": 2.7849369854194173, + "learning_rate": 1.990939425427127e-05, + "loss": 0.5149, + "step": 904 + }, + { + "epoch": 0.07169736581501288, + "grad_norm": 3.4621544705170715, + "learning_rate": 1.9909049285465258e-05, + "loss": 0.58, + "step": 905 + }, + { + "epoch": 0.07177658942364824, + "grad_norm": 2.8339815850791545, + "learning_rate": 1.990870366419603e-05, + "loss": 0.5151, + "step": 906 + }, + { + "epoch": 0.07185581303228362, + "grad_norm": 2.779417652718206, + "learning_rate": 1.9908357390486342e-05, + "loss": 0.5366, + "step": 907 + }, + { + "epoch": 0.07193503664091899, + "grad_norm": 3.3778322101015066, + "learning_rate": 1.9908010464358997e-05, + "loss": 0.4649, + "step": 908 + }, + { + "epoch": 0.07201426024955437, + "grad_norm": 3.7126540608521412, + "learning_rate": 1.9907662885836836e-05, + "loss": 0.6046, + "step": 909 + }, + { + "epoch": 0.07209348385818974, + "grad_norm": 3.7301662647735974, + "learning_rate": 1.9907314654942748e-05, + "loss": 0.4478, + "step": 910 + }, + { + "epoch": 0.07217270746682511, + "grad_norm": 3.505793056214211, + "learning_rate": 1.990696577169966e-05, + "loss": 0.6349, + "step": 911 + }, + { + "epoch": 0.07225193107546049, + "grad_norm": 2.981444684453008, + "learning_rate": 1.9906616236130543e-05, + "loss": 0.3976, + "step": 912 + }, + { + "epoch": 0.07233115468409586, + "grad_norm": 2.829423402072483, + "learning_rate": 1.990626604825842e-05, + "loss": 0.5443, + "step": 913 + }, + { + "epoch": 0.07241037829273124, + "grad_norm": 3.7201609741816433, + "learning_rate": 1.9905915208106342e-05, + "loss": 0.4463, + "step": 914 + }, + { + "epoch": 0.07248960190136661, + "grad_norm": 3.339315194927594, + "learning_rate": 1.990556371569741e-05, + "loss": 0.5617, + "step": 915 + }, + { + "epoch": 0.07256882551000197, + "grad_norm": 3.060217869462058, + "learning_rate": 1.990521157105477e-05, + "loss": 0.421, + "step": 916 + }, + { + "epoch": 0.07264804911863736, + "grad_norm": 2.9052077714352844, + "learning_rate": 1.990485877420161e-05, + "loss": 0.465, + "step": 917 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 2.9157033711189317, + "learning_rate": 1.990450532516116e-05, + "loss": 0.5046, + "step": 918 + }, + { + "epoch": 0.0728064963359081, + "grad_norm": 3.286068191469207, + "learning_rate": 1.9904151223956688e-05, + "loss": 0.5029, + "step": 919 + }, + { + "epoch": 0.07288571994454347, + "grad_norm": 3.935350582813888, + "learning_rate": 1.9903796470611515e-05, + "loss": 0.4149, + "step": 920 + }, + { + "epoch": 0.07296494355317884, + "grad_norm": 3.322013780640328, + "learning_rate": 1.9903441065149e-05, + "loss": 0.517, + "step": 921 + }, + { + "epoch": 0.07304416716181422, + "grad_norm": 2.9753811644754613, + "learning_rate": 1.990308500759254e-05, + "loss": 0.5145, + "step": 922 + }, + { + "epoch": 0.07312339077044959, + "grad_norm": 4.015814854715801, + "learning_rate": 1.9902728297965586e-05, + "loss": 0.6311, + "step": 923 + }, + { + "epoch": 0.07320261437908497, + "grad_norm": 3.4987862810611134, + "learning_rate": 1.990237093629162e-05, + "loss": 0.5554, + "step": 924 + }, + { + "epoch": 0.07328183798772034, + "grad_norm": 3.5241505027941304, + "learning_rate": 1.9902012922594178e-05, + "loss": 0.605, + "step": 925 + }, + { + "epoch": 0.07336106159635572, + "grad_norm": 4.3719104981238255, + "learning_rate": 1.990165425689683e-05, + "loss": 0.537, + "step": 926 + }, + { + "epoch": 0.07344028520499109, + "grad_norm": 3.69180102158796, + "learning_rate": 1.9901294939223192e-05, + "loss": 0.6398, + "step": 927 + }, + { + "epoch": 0.07351950881362646, + "grad_norm": 3.7689137373263617, + "learning_rate": 1.9900934969596925e-05, + "loss": 0.5357, + "step": 928 + }, + { + "epoch": 0.07359873242226184, + "grad_norm": 2.969961184715992, + "learning_rate": 1.9900574348041728e-05, + "loss": 0.385, + "step": 929 + }, + { + "epoch": 0.0736779560308972, + "grad_norm": 2.875534751970222, + "learning_rate": 1.990021307458135e-05, + "loss": 0.5323, + "step": 930 + }, + { + "epoch": 0.07375717963953259, + "grad_norm": 3.6147602203629576, + "learning_rate": 1.989985114923958e-05, + "loss": 0.6277, + "step": 931 + }, + { + "epoch": 0.07383640324816795, + "grad_norm": 3.328021476059367, + "learning_rate": 1.9899488572040244e-05, + "loss": 0.4645, + "step": 932 + }, + { + "epoch": 0.07391562685680332, + "grad_norm": 2.688794166016588, + "learning_rate": 1.989912534300722e-05, + "loss": 0.5001, + "step": 933 + }, + { + "epoch": 0.0739948504654387, + "grad_norm": 3.190149884710966, + "learning_rate": 1.9898761462164425e-05, + "loss": 0.6568, + "step": 934 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 2.8515037574618574, + "learning_rate": 1.989839692953581e-05, + "loss": 0.5239, + "step": 935 + }, + { + "epoch": 0.07415329768270945, + "grad_norm": 2.7269253995201077, + "learning_rate": 1.9898031745145397e-05, + "loss": 0.5187, + "step": 936 + }, + { + "epoch": 0.07423252129134482, + "grad_norm": 3.170772297887385, + "learning_rate": 1.989766590901721e-05, + "loss": 0.548, + "step": 937 + }, + { + "epoch": 0.0743117448999802, + "grad_norm": 3.5797341627772616, + "learning_rate": 1.9897299421175353e-05, + "loss": 0.6615, + "step": 938 + }, + { + "epoch": 0.07439096850861557, + "grad_norm": 3.738461780002007, + "learning_rate": 1.989693228164395e-05, + "loss": 0.6034, + "step": 939 + }, + { + "epoch": 0.07447019211725094, + "grad_norm": 3.0912636712003367, + "learning_rate": 1.989656449044718e-05, + "loss": 0.5045, + "step": 940 + }, + { + "epoch": 0.07454941572588632, + "grad_norm": 3.4284259629492757, + "learning_rate": 1.9896196047609255e-05, + "loss": 0.6413, + "step": 941 + }, + { + "epoch": 0.07462863933452168, + "grad_norm": 3.3548505539139084, + "learning_rate": 1.9895826953154437e-05, + "loss": 0.5221, + "step": 942 + }, + { + "epoch": 0.07470786294315707, + "grad_norm": 3.4262175625229734, + "learning_rate": 1.9895457207107032e-05, + "loss": 0.527, + "step": 943 + }, + { + "epoch": 0.07478708655179243, + "grad_norm": 2.9715757178923794, + "learning_rate": 1.9895086809491384e-05, + "loss": 0.5269, + "step": 944 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 3.010992141648601, + "learning_rate": 1.989471576033188e-05, + "loss": 0.5371, + "step": 945 + }, + { + "epoch": 0.07494553376906318, + "grad_norm": 3.009349473795652, + "learning_rate": 1.9894344059652953e-05, + "loss": 0.4421, + "step": 946 + }, + { + "epoch": 0.07502475737769855, + "grad_norm": 3.0029717342311972, + "learning_rate": 1.989397170747908e-05, + "loss": 0.6139, + "step": 947 + }, + { + "epoch": 0.07510398098633393, + "grad_norm": 3.099087801001541, + "learning_rate": 1.9893598703834773e-05, + "loss": 0.5155, + "step": 948 + }, + { + "epoch": 0.0751832045949693, + "grad_norm": 2.8075894256822416, + "learning_rate": 1.98932250487446e-05, + "loss": 0.5148, + "step": 949 + }, + { + "epoch": 0.07526242820360468, + "grad_norm": 3.1471239863147398, + "learning_rate": 1.989285074223316e-05, + "loss": 0.532, + "step": 950 + }, + { + "epoch": 0.07534165181224005, + "grad_norm": 2.91333039066608, + "learning_rate": 1.98924757843251e-05, + "loss": 0.5576, + "step": 951 + }, + { + "epoch": 0.07542087542087542, + "grad_norm": 3.717685257657358, + "learning_rate": 1.989210017504511e-05, + "loss": 0.4792, + "step": 952 + }, + { + "epoch": 0.0755000990295108, + "grad_norm": 2.969620364429761, + "learning_rate": 1.989172391441792e-05, + "loss": 0.5467, + "step": 953 + }, + { + "epoch": 0.07557932263814617, + "grad_norm": 3.287101332916613, + "learning_rate": 1.9891347002468307e-05, + "loss": 0.6465, + "step": 954 + }, + { + "epoch": 0.07565854624678155, + "grad_norm": 2.6125475489364436, + "learning_rate": 1.9890969439221086e-05, + "loss": 0.3559, + "step": 955 + }, + { + "epoch": 0.07573776985541691, + "grad_norm": 3.097947753883366, + "learning_rate": 1.989059122470112e-05, + "loss": 0.5442, + "step": 956 + }, + { + "epoch": 0.07581699346405228, + "grad_norm": 2.6040975305283522, + "learning_rate": 1.9890212358933316e-05, + "loss": 0.4987, + "step": 957 + }, + { + "epoch": 0.07589621707268766, + "grad_norm": 3.3980837162093196, + "learning_rate": 1.9889832841942613e-05, + "loss": 0.5733, + "step": 958 + }, + { + "epoch": 0.07597544068132303, + "grad_norm": 4.572176746607006, + "learning_rate": 1.988945267375401e-05, + "loss": 0.5469, + "step": 959 + }, + { + "epoch": 0.07605466428995841, + "grad_norm": 3.4094797998905575, + "learning_rate": 1.9889071854392528e-05, + "loss": 0.4718, + "step": 960 + }, + { + "epoch": 0.07613388789859378, + "grad_norm": 3.241185505042529, + "learning_rate": 1.9888690383883247e-05, + "loss": 0.4224, + "step": 961 + }, + { + "epoch": 0.07621311150722915, + "grad_norm": 2.9933607711239905, + "learning_rate": 1.9888308262251286e-05, + "loss": 0.4875, + "step": 962 + }, + { + "epoch": 0.07629233511586453, + "grad_norm": 2.8730272004508803, + "learning_rate": 1.988792548952181e-05, + "loss": 0.4977, + "step": 963 + }, + { + "epoch": 0.0763715587244999, + "grad_norm": 3.362786263610291, + "learning_rate": 1.9887542065720013e-05, + "loss": 0.575, + "step": 964 + }, + { + "epoch": 0.07645078233313528, + "grad_norm": 3.051133597509086, + "learning_rate": 1.988715799087115e-05, + "loss": 0.496, + "step": 965 + }, + { + "epoch": 0.07653000594177065, + "grad_norm": 2.8379262902364992, + "learning_rate": 1.9886773265000502e-05, + "loss": 0.4711, + "step": 966 + }, + { + "epoch": 0.07660922955040603, + "grad_norm": 2.662204304813247, + "learning_rate": 1.9886387888133413e-05, + "loss": 0.384, + "step": 967 + }, + { + "epoch": 0.0766884531590414, + "grad_norm": 2.820729426169255, + "learning_rate": 1.988600186029525e-05, + "loss": 0.4521, + "step": 968 + }, + { + "epoch": 0.07676767676767676, + "grad_norm": 3.0382179891294, + "learning_rate": 1.988561518151143e-05, + "loss": 0.4529, + "step": 969 + }, + { + "epoch": 0.07684690037631214, + "grad_norm": 2.957282539174493, + "learning_rate": 1.988522785180742e-05, + "loss": 0.5485, + "step": 970 + }, + { + "epoch": 0.07692612398494751, + "grad_norm": 3.005716280610578, + "learning_rate": 1.9884839871208717e-05, + "loss": 0.5376, + "step": 971 + }, + { + "epoch": 0.07700534759358289, + "grad_norm": 2.6369837769277096, + "learning_rate": 1.9884451239740877e-05, + "loss": 0.4363, + "step": 972 + }, + { + "epoch": 0.07708457120221826, + "grad_norm": 2.987326978594906, + "learning_rate": 1.988406195742948e-05, + "loss": 0.4927, + "step": 973 + }, + { + "epoch": 0.07716379481085363, + "grad_norm": 2.9301629653764256, + "learning_rate": 1.9883672024300163e-05, + "loss": 0.422, + "step": 974 + }, + { + "epoch": 0.07724301841948901, + "grad_norm": 2.8506892740950818, + "learning_rate": 1.98832814403786e-05, + "loss": 0.5169, + "step": 975 + }, + { + "epoch": 0.07732224202812438, + "grad_norm": 3.4548599576908745, + "learning_rate": 1.988289020569051e-05, + "loss": 0.6048, + "step": 976 + }, + { + "epoch": 0.07740146563675976, + "grad_norm": 3.522206007638793, + "learning_rate": 1.9882498320261652e-05, + "loss": 0.5529, + "step": 977 + }, + { + "epoch": 0.07748068924539513, + "grad_norm": 3.465933933430114, + "learning_rate": 1.9882105784117835e-05, + "loss": 0.4822, + "step": 978 + }, + { + "epoch": 0.07755991285403051, + "grad_norm": 4.146962473415494, + "learning_rate": 1.98817125972849e-05, + "loss": 0.5807, + "step": 979 + }, + { + "epoch": 0.07763913646266588, + "grad_norm": 3.238348667770255, + "learning_rate": 1.9881318759788738e-05, + "loss": 0.5667, + "step": 980 + }, + { + "epoch": 0.07771836007130124, + "grad_norm": 3.200935033788777, + "learning_rate": 1.988092427165528e-05, + "loss": 0.4223, + "step": 981 + }, + { + "epoch": 0.07779758367993662, + "grad_norm": 3.677610045711899, + "learning_rate": 1.98805291329105e-05, + "loss": 0.6305, + "step": 982 + }, + { + "epoch": 0.07787680728857199, + "grad_norm": 2.41384432750428, + "learning_rate": 1.9880133343580423e-05, + "loss": 0.5119, + "step": 983 + }, + { + "epoch": 0.07795603089720737, + "grad_norm": 2.9954557863928466, + "learning_rate": 1.9879736903691107e-05, + "loss": 0.5795, + "step": 984 + }, + { + "epoch": 0.07803525450584274, + "grad_norm": 2.962993648836908, + "learning_rate": 1.9879339813268653e-05, + "loss": 0.6063, + "step": 985 + }, + { + "epoch": 0.07811447811447811, + "grad_norm": 2.9711648085899753, + "learning_rate": 1.9878942072339208e-05, + "loss": 0.4009, + "step": 986 + }, + { + "epoch": 0.07819370172311349, + "grad_norm": 3.9742903438818415, + "learning_rate": 1.987854368092896e-05, + "loss": 0.6467, + "step": 987 + }, + { + "epoch": 0.07827292533174886, + "grad_norm": 2.4292974257509137, + "learning_rate": 1.9878144639064145e-05, + "loss": 0.4372, + "step": 988 + }, + { + "epoch": 0.07835214894038424, + "grad_norm": 2.8007721191078137, + "learning_rate": 1.9877744946771034e-05, + "loss": 0.3379, + "step": 989 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 2.5542649721521156, + "learning_rate": 1.987734460407595e-05, + "loss": 0.4264, + "step": 990 + }, + { + "epoch": 0.07851059615765497, + "grad_norm": 3.1371018904129824, + "learning_rate": 1.9876943611005252e-05, + "loss": 0.4664, + "step": 991 + }, + { + "epoch": 0.07858981976629036, + "grad_norm": 3.1621062811895775, + "learning_rate": 1.9876541967585337e-05, + "loss": 0.5031, + "step": 992 + }, + { + "epoch": 0.07866904337492572, + "grad_norm": 3.2344934277000315, + "learning_rate": 1.987613967384266e-05, + "loss": 0.4734, + "step": 993 + }, + { + "epoch": 0.0787482669835611, + "grad_norm": 2.7557556922391435, + "learning_rate": 1.9875736729803705e-05, + "loss": 0.5568, + "step": 994 + }, + { + "epoch": 0.07882749059219647, + "grad_norm": 3.567627748685407, + "learning_rate": 1.9875333135495e-05, + "loss": 0.5808, + "step": 995 + }, + { + "epoch": 0.07890671420083185, + "grad_norm": 6.11760922392571, + "learning_rate": 1.9874928890943134e-05, + "loss": 0.6194, + "step": 996 + }, + { + "epoch": 0.07898593780946722, + "grad_norm": 2.828729150471876, + "learning_rate": 1.9874523996174714e-05, + "loss": 0.5218, + "step": 997 + }, + { + "epoch": 0.07906516141810259, + "grad_norm": 3.21582827951103, + "learning_rate": 1.98741184512164e-05, + "loss": 0.617, + "step": 998 + }, + { + "epoch": 0.07914438502673797, + "grad_norm": 2.8138075594820897, + "learning_rate": 1.9873712256094898e-05, + "loss": 0.5541, + "step": 999 + }, + { + "epoch": 0.07922360863537334, + "grad_norm": 2.8557096380024474, + "learning_rate": 1.987330541083695e-05, + "loss": 0.5396, + "step": 1000 + }, + { + "epoch": 0.07930283224400872, + "grad_norm": 3.3567608766462964, + "learning_rate": 1.9872897915469353e-05, + "loss": 0.4047, + "step": 1001 + }, + { + "epoch": 0.07938205585264409, + "grad_norm": 3.0708985217595823, + "learning_rate": 1.987248977001893e-05, + "loss": 0.5186, + "step": 1002 + }, + { + "epoch": 0.07946127946127945, + "grad_norm": 3.3865895807018807, + "learning_rate": 1.987208097451256e-05, + "loss": 0.5101, + "step": 1003 + }, + { + "epoch": 0.07954050306991484, + "grad_norm": 3.3070447899063113, + "learning_rate": 1.987167152897716e-05, + "loss": 0.535, + "step": 1004 + }, + { + "epoch": 0.0796197266785502, + "grad_norm": 2.8231662483860056, + "learning_rate": 1.987126143343969e-05, + "loss": 0.4462, + "step": 1005 + }, + { + "epoch": 0.07969895028718559, + "grad_norm": 3.316521112845417, + "learning_rate": 1.987085068792715e-05, + "loss": 0.5232, + "step": 1006 + }, + { + "epoch": 0.07977817389582095, + "grad_norm": 3.540571595742248, + "learning_rate": 1.9870439292466587e-05, + "loss": 0.5254, + "step": 1007 + }, + { + "epoch": 0.07985739750445633, + "grad_norm": 3.3575150202003354, + "learning_rate": 1.9870027247085093e-05, + "loss": 0.6666, + "step": 1008 + }, + { + "epoch": 0.0799366211130917, + "grad_norm": 2.967612030596316, + "learning_rate": 1.9869614551809793e-05, + "loss": 0.4186, + "step": 1009 + }, + { + "epoch": 0.08001584472172707, + "grad_norm": 3.2481661369813275, + "learning_rate": 1.986920120666787e-05, + "loss": 0.471, + "step": 1010 + }, + { + "epoch": 0.08009506833036245, + "grad_norm": 3.1669031408264807, + "learning_rate": 1.986878721168653e-05, + "loss": 0.5178, + "step": 1011 + }, + { + "epoch": 0.08017429193899782, + "grad_norm": 11.351114085480855, + "learning_rate": 1.986837256689304e-05, + "loss": 0.5286, + "step": 1012 + }, + { + "epoch": 0.0802535155476332, + "grad_norm": 3.354968889104633, + "learning_rate": 1.98679572723147e-05, + "loss": 0.48, + "step": 1013 + }, + { + "epoch": 0.08033273915626857, + "grad_norm": 4.615116408425043, + "learning_rate": 1.9867541327978853e-05, + "loss": 0.6214, + "step": 1014 + }, + { + "epoch": 0.08041196276490394, + "grad_norm": 3.6421750123568346, + "learning_rate": 1.986712473391289e-05, + "loss": 0.5059, + "step": 1015 + }, + { + "epoch": 0.08049118637353932, + "grad_norm": 2.861637563480363, + "learning_rate": 1.986670749014424e-05, + "loss": 0.5235, + "step": 1016 + }, + { + "epoch": 0.08057040998217468, + "grad_norm": 3.306106734822648, + "learning_rate": 1.9866289596700383e-05, + "loss": 0.4339, + "step": 1017 + }, + { + "epoch": 0.08064963359081007, + "grad_norm": 3.6107048802811463, + "learning_rate": 1.9865871053608823e-05, + "loss": 0.4927, + "step": 1018 + }, + { + "epoch": 0.08072885719944543, + "grad_norm": 3.4573315071224844, + "learning_rate": 1.9865451860897126e-05, + "loss": 0.5167, + "step": 1019 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 3.2852436051003857, + "learning_rate": 1.98650320185929e-05, + "loss": 0.5141, + "step": 1020 + }, + { + "epoch": 0.08088730441671618, + "grad_norm": 3.5349466410761363, + "learning_rate": 1.986461152672378e-05, + "loss": 0.5828, + "step": 1021 + }, + { + "epoch": 0.08096652802535155, + "grad_norm": 3.395641606236405, + "learning_rate": 1.986419038531745e-05, + "loss": 0.5388, + "step": 1022 + }, + { + "epoch": 0.08104575163398693, + "grad_norm": 3.1130354848169786, + "learning_rate": 1.9863768594401654e-05, + "loss": 0.4576, + "step": 1023 + }, + { + "epoch": 0.0811249752426223, + "grad_norm": 3.124040096105198, + "learning_rate": 1.9863346154004155e-05, + "loss": 0.5245, + "step": 1024 + }, + { + "epoch": 0.08120419885125768, + "grad_norm": 2.7410260102914563, + "learning_rate": 1.986292306415277e-05, + "loss": 0.5119, + "step": 1025 + }, + { + "epoch": 0.08128342245989305, + "grad_norm": 2.7015648118243254, + "learning_rate": 1.9862499324875362e-05, + "loss": 0.447, + "step": 1026 + }, + { + "epoch": 0.08136264606852842, + "grad_norm": 3.0420700324094114, + "learning_rate": 1.9862074936199827e-05, + "loss": 0.4277, + "step": 1027 + }, + { + "epoch": 0.0814418696771638, + "grad_norm": 3.2072228570001777, + "learning_rate": 1.9861649898154107e-05, + "loss": 0.5813, + "step": 1028 + }, + { + "epoch": 0.08152109328579916, + "grad_norm": 3.350451358578708, + "learning_rate": 1.98612242107662e-05, + "loss": 0.4792, + "step": 1029 + }, + { + "epoch": 0.08160031689443455, + "grad_norm": 2.788663558206002, + "learning_rate": 1.9860797874064123e-05, + "loss": 0.3798, + "step": 1030 + }, + { + "epoch": 0.08167954050306991, + "grad_norm": 3.4858638818553125, + "learning_rate": 1.9860370888075954e-05, + "loss": 0.5046, + "step": 1031 + }, + { + "epoch": 0.08175876411170528, + "grad_norm": 3.1440843840329604, + "learning_rate": 1.9859943252829804e-05, + "loss": 0.5357, + "step": 1032 + }, + { + "epoch": 0.08183798772034066, + "grad_norm": 3.1891244014042974, + "learning_rate": 1.9859514968353836e-05, + "loss": 0.6391, + "step": 1033 + }, + { + "epoch": 0.08191721132897603, + "grad_norm": 3.4270411698823375, + "learning_rate": 1.985908603467625e-05, + "loss": 0.4699, + "step": 1034 + }, + { + "epoch": 0.08199643493761141, + "grad_norm": 3.169767999676323, + "learning_rate": 1.985865645182529e-05, + "loss": 0.4575, + "step": 1035 + }, + { + "epoch": 0.08207565854624678, + "grad_norm": 3.568540642474215, + "learning_rate": 1.9858226219829234e-05, + "loss": 0.4753, + "step": 1036 + }, + { + "epoch": 0.08215488215488216, + "grad_norm": 3.033743569390355, + "learning_rate": 1.985779533871642e-05, + "loss": 0.52, + "step": 1037 + }, + { + "epoch": 0.08223410576351753, + "grad_norm": 3.0604077852827256, + "learning_rate": 1.985736380851521e-05, + "loss": 0.4779, + "step": 1038 + }, + { + "epoch": 0.0823133293721529, + "grad_norm": 3.5317220145185138, + "learning_rate": 1.9856931629254032e-05, + "loss": 0.3752, + "step": 1039 + }, + { + "epoch": 0.08239255298078828, + "grad_norm": 3.1526157342567362, + "learning_rate": 1.9856498800961328e-05, + "loss": 0.4543, + "step": 1040 + }, + { + "epoch": 0.08247177658942365, + "grad_norm": 3.1841713648545813, + "learning_rate": 1.9856065323665606e-05, + "loss": 0.5466, + "step": 1041 + }, + { + "epoch": 0.08255100019805903, + "grad_norm": 3.2133769632346745, + "learning_rate": 1.9855631197395406e-05, + "loss": 0.5136, + "step": 1042 + }, + { + "epoch": 0.0826302238066944, + "grad_norm": 3.0391147020087055, + "learning_rate": 1.985519642217932e-05, + "loss": 0.4877, + "step": 1043 + }, + { + "epoch": 0.08270944741532976, + "grad_norm": 3.6858143921540405, + "learning_rate": 1.9854760998045964e-05, + "loss": 0.5523, + "step": 1044 + }, + { + "epoch": 0.08278867102396514, + "grad_norm": 4.016601076910318, + "learning_rate": 1.9854324925024017e-05, + "loss": 0.5545, + "step": 1045 + }, + { + "epoch": 0.08286789463260051, + "grad_norm": 2.903257820892303, + "learning_rate": 1.9853888203142184e-05, + "loss": 0.4813, + "step": 1046 + }, + { + "epoch": 0.08294711824123589, + "grad_norm": 2.840563751868195, + "learning_rate": 1.9853450832429234e-05, + "loss": 0.4203, + "step": 1047 + }, + { + "epoch": 0.08302634184987126, + "grad_norm": 2.7881233364286544, + "learning_rate": 1.9853012812913956e-05, + "loss": 0.3934, + "step": 1048 + }, + { + "epoch": 0.08310556545850664, + "grad_norm": 3.449584601555392, + "learning_rate": 1.9852574144625193e-05, + "loss": 0.5277, + "step": 1049 + }, + { + "epoch": 0.08318478906714201, + "grad_norm": 3.157646155820125, + "learning_rate": 1.985213482759183e-05, + "loss": 0.4682, + "step": 1050 + }, + { + "epoch": 0.08326401267577738, + "grad_norm": 3.27493690159151, + "learning_rate": 1.9851694861842795e-05, + "loss": 0.58, + "step": 1051 + }, + { + "epoch": 0.08334323628441276, + "grad_norm": 2.866102262051014, + "learning_rate": 1.9851254247407053e-05, + "loss": 0.3921, + "step": 1052 + }, + { + "epoch": 0.08342245989304813, + "grad_norm": 3.155110350052891, + "learning_rate": 1.9850812984313626e-05, + "loss": 0.5557, + "step": 1053 + }, + { + "epoch": 0.08350168350168351, + "grad_norm": 2.8562844113253276, + "learning_rate": 1.985037107259156e-05, + "loss": 0.5239, + "step": 1054 + }, + { + "epoch": 0.08358090711031887, + "grad_norm": 2.871566456820682, + "learning_rate": 1.984992851226996e-05, + "loss": 0.5293, + "step": 1055 + }, + { + "epoch": 0.08366013071895424, + "grad_norm": 3.4374142642811276, + "learning_rate": 1.9849485303377955e-05, + "loss": 0.4608, + "step": 1056 + }, + { + "epoch": 0.08373935432758962, + "grad_norm": 3.724586501248896, + "learning_rate": 1.984904144594474e-05, + "loss": 0.6747, + "step": 1057 + }, + { + "epoch": 0.08381857793622499, + "grad_norm": 3.6301441190979298, + "learning_rate": 1.9848596939999534e-05, + "loss": 0.6223, + "step": 1058 + }, + { + "epoch": 0.08389780154486037, + "grad_norm": 2.3854111436717997, + "learning_rate": 1.984815178557161e-05, + "loss": 0.398, + "step": 1059 + }, + { + "epoch": 0.08397702515349574, + "grad_norm": 3.0993186048035857, + "learning_rate": 1.9847705982690275e-05, + "loss": 0.5106, + "step": 1060 + }, + { + "epoch": 0.08405624876213111, + "grad_norm": 2.66489757437417, + "learning_rate": 1.984725953138489e-05, + "loss": 0.5062, + "step": 1061 + }, + { + "epoch": 0.08413547237076649, + "grad_norm": 2.8364142644307138, + "learning_rate": 1.9846812431684843e-05, + "loss": 0.4424, + "step": 1062 + }, + { + "epoch": 0.08421469597940186, + "grad_norm": 2.6253931479370594, + "learning_rate": 1.9846364683619575e-05, + "loss": 0.5658, + "step": 1063 + }, + { + "epoch": 0.08429391958803724, + "grad_norm": 3.0014747590276283, + "learning_rate": 1.9845916287218575e-05, + "loss": 0.5445, + "step": 1064 + }, + { + "epoch": 0.0843731431966726, + "grad_norm": 2.989609827158304, + "learning_rate": 1.9845467242511362e-05, + "loss": 0.5067, + "step": 1065 + }, + { + "epoch": 0.08445236680530799, + "grad_norm": 3.3724948216425337, + "learning_rate": 1.9845017549527502e-05, + "loss": 0.5039, + "step": 1066 + }, + { + "epoch": 0.08453159041394336, + "grad_norm": 2.945638085051838, + "learning_rate": 1.984456720829661e-05, + "loss": 0.4895, + "step": 1067 + }, + { + "epoch": 0.08461081402257872, + "grad_norm": 3.845012491238431, + "learning_rate": 1.9844116218848335e-05, + "loss": 0.4923, + "step": 1068 + }, + { + "epoch": 0.0846900376312141, + "grad_norm": 3.5786710825964576, + "learning_rate": 1.9843664581212374e-05, + "loss": 0.4495, + "step": 1069 + }, + { + "epoch": 0.08476926123984947, + "grad_norm": 3.4840226581121625, + "learning_rate": 1.9843212295418464e-05, + "loss": 0.4915, + "step": 1070 + }, + { + "epoch": 0.08484848484848485, + "grad_norm": 3.183119557311288, + "learning_rate": 1.984275936149639e-05, + "loss": 0.4792, + "step": 1071 + }, + { + "epoch": 0.08492770845712022, + "grad_norm": 3.0949654824978765, + "learning_rate": 1.984230577947597e-05, + "loss": 0.4828, + "step": 1072 + }, + { + "epoch": 0.08500693206575559, + "grad_norm": 2.80538534011389, + "learning_rate": 1.9841851549387074e-05, + "loss": 0.5201, + "step": 1073 + }, + { + "epoch": 0.08508615567439097, + "grad_norm": 2.5706908572715985, + "learning_rate": 1.9841396671259606e-05, + "loss": 0.4986, + "step": 1074 + }, + { + "epoch": 0.08516537928302634, + "grad_norm": 3.149705891963666, + "learning_rate": 1.9840941145123524e-05, + "loss": 0.4996, + "step": 1075 + }, + { + "epoch": 0.08524460289166172, + "grad_norm": 2.873645897622677, + "learning_rate": 1.984048497100882e-05, + "loss": 0.5226, + "step": 1076 + }, + { + "epoch": 0.08532382650029709, + "grad_norm": 2.5030867999637207, + "learning_rate": 1.9840028148945526e-05, + "loss": 0.5205, + "step": 1077 + }, + { + "epoch": 0.08540305010893247, + "grad_norm": 3.18791226303514, + "learning_rate": 1.983957067896373e-05, + "loss": 0.3476, + "step": 1078 + }, + { + "epoch": 0.08548227371756784, + "grad_norm": 2.787518424590658, + "learning_rate": 1.9839112561093548e-05, + "loss": 0.3596, + "step": 1079 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 3.378661751130679, + "learning_rate": 1.983865379536515e-05, + "loss": 0.5327, + "step": 1080 + }, + { + "epoch": 0.08564072093483859, + "grad_norm": 3.4768398111991488, + "learning_rate": 1.9838194381808737e-05, + "loss": 0.5766, + "step": 1081 + }, + { + "epoch": 0.08571994454347395, + "grad_norm": 2.85385584536561, + "learning_rate": 1.983773432045456e-05, + "loss": 0.4246, + "step": 1082 + }, + { + "epoch": 0.08579916815210933, + "grad_norm": 3.6371951984749638, + "learning_rate": 1.9837273611332918e-05, + "loss": 0.5942, + "step": 1083 + }, + { + "epoch": 0.0858783917607447, + "grad_norm": 2.677380059161476, + "learning_rate": 1.983681225447414e-05, + "loss": 0.5097, + "step": 1084 + }, + { + "epoch": 0.08595761536938007, + "grad_norm": 3.498117904952886, + "learning_rate": 1.9836350249908606e-05, + "loss": 0.6546, + "step": 1085 + }, + { + "epoch": 0.08603683897801545, + "grad_norm": 2.3158569524200354, + "learning_rate": 1.983588759766674e-05, + "loss": 0.2811, + "step": 1086 + }, + { + "epoch": 0.08611606258665082, + "grad_norm": 3.407167796845734, + "learning_rate": 1.9835424297779002e-05, + "loss": 0.4563, + "step": 1087 + }, + { + "epoch": 0.0861952861952862, + "grad_norm": 2.969757801601191, + "learning_rate": 1.98349603502759e-05, + "loss": 0.4085, + "step": 1088 + }, + { + "epoch": 0.08627450980392157, + "grad_norm": 2.9950622769790582, + "learning_rate": 1.983449575518798e-05, + "loss": 0.3886, + "step": 1089 + }, + { + "epoch": 0.08635373341255695, + "grad_norm": 2.5152208402796448, + "learning_rate": 1.983403051254584e-05, + "loss": 0.3387, + "step": 1090 + }, + { + "epoch": 0.08643295702119232, + "grad_norm": 3.004682481820772, + "learning_rate": 1.9833564622380105e-05, + "loss": 0.4687, + "step": 1091 + }, + { + "epoch": 0.08651218062982768, + "grad_norm": 5.160912022023601, + "learning_rate": 1.9833098084721455e-05, + "loss": 0.4735, + "step": 1092 + }, + { + "epoch": 0.08659140423846307, + "grad_norm": 2.810539005726432, + "learning_rate": 1.9832630899600607e-05, + "loss": 0.4921, + "step": 1093 + }, + { + "epoch": 0.08667062784709843, + "grad_norm": 3.575300057152066, + "learning_rate": 1.9832163067048335e-05, + "loss": 0.4792, + "step": 1094 + }, + { + "epoch": 0.08674985145573381, + "grad_norm": 3.2766962093931724, + "learning_rate": 1.9831694587095428e-05, + "loss": 0.4362, + "step": 1095 + }, + { + "epoch": 0.08682907506436918, + "grad_norm": 3.771766802577481, + "learning_rate": 1.983122545977274e-05, + "loss": 0.5149, + "step": 1096 + }, + { + "epoch": 0.08690829867300455, + "grad_norm": 2.91395083259566, + "learning_rate": 1.983075568511116e-05, + "loss": 0.5302, + "step": 1097 + }, + { + "epoch": 0.08698752228163993, + "grad_norm": 3.0171089968982585, + "learning_rate": 1.983028526314162e-05, + "loss": 0.5589, + "step": 1098 + }, + { + "epoch": 0.0870667458902753, + "grad_norm": 3.3110004246895763, + "learning_rate": 1.98298141938951e-05, + "loss": 0.5231, + "step": 1099 + }, + { + "epoch": 0.08714596949891068, + "grad_norm": 2.4694917990492797, + "learning_rate": 1.982934247740261e-05, + "loss": 0.4208, + "step": 1100 + }, + { + "epoch": 0.08722519310754605, + "grad_norm": 3.350969968362005, + "learning_rate": 1.9828870113695217e-05, + "loss": 0.5676, + "step": 1101 + }, + { + "epoch": 0.08730441671618142, + "grad_norm": 3.409640407416842, + "learning_rate": 1.9828397102804016e-05, + "loss": 0.3566, + "step": 1102 + }, + { + "epoch": 0.0873836403248168, + "grad_norm": 2.9697707973618397, + "learning_rate": 1.982792344476016e-05, + "loss": 0.4851, + "step": 1103 + }, + { + "epoch": 0.08746286393345216, + "grad_norm": 3.348477739289384, + "learning_rate": 1.982744913959483e-05, + "loss": 0.5918, + "step": 1104 + }, + { + "epoch": 0.08754208754208755, + "grad_norm": 2.8133201376495847, + "learning_rate": 1.9826974187339267e-05, + "loss": 0.5041, + "step": 1105 + }, + { + "epoch": 0.08762131115072291, + "grad_norm": 2.917907605356377, + "learning_rate": 1.9826498588024738e-05, + "loss": 0.4162, + "step": 1106 + }, + { + "epoch": 0.0877005347593583, + "grad_norm": 3.2671158388845085, + "learning_rate": 1.982602234168255e-05, + "loss": 0.5697, + "step": 1107 + }, + { + "epoch": 0.08777975836799366, + "grad_norm": 3.184169428945255, + "learning_rate": 1.9825545448344078e-05, + "loss": 0.4887, + "step": 1108 + }, + { + "epoch": 0.08785898197662903, + "grad_norm": 4.657450898572146, + "learning_rate": 1.9825067908040716e-05, + "loss": 0.4193, + "step": 1109 + }, + { + "epoch": 0.08793820558526441, + "grad_norm": 3.2471336756698554, + "learning_rate": 1.9824589720803906e-05, + "loss": 0.4368, + "step": 1110 + }, + { + "epoch": 0.08801742919389978, + "grad_norm": 3.113129707170449, + "learning_rate": 1.9824110886665137e-05, + "loss": 0.5021, + "step": 1111 + }, + { + "epoch": 0.08809665280253516, + "grad_norm": 3.556680450494994, + "learning_rate": 1.9823631405655933e-05, + "loss": 0.458, + "step": 1112 + }, + { + "epoch": 0.08817587641117053, + "grad_norm": 3.7669716447199537, + "learning_rate": 1.9823151277807873e-05, + "loss": 0.4883, + "step": 1113 + }, + { + "epoch": 0.0882551000198059, + "grad_norm": 2.9182702928130095, + "learning_rate": 1.9822670503152567e-05, + "loss": 0.3853, + "step": 1114 + }, + { + "epoch": 0.08833432362844128, + "grad_norm": 2.8541517457360435, + "learning_rate": 1.982218908172167e-05, + "loss": 0.3169, + "step": 1115 + }, + { + "epoch": 0.08841354723707665, + "grad_norm": 3.7973525542670665, + "learning_rate": 1.9821707013546885e-05, + "loss": 0.6661, + "step": 1116 + }, + { + "epoch": 0.08849277084571203, + "grad_norm": 2.5836900278744346, + "learning_rate": 1.9821224298659953e-05, + "loss": 0.455, + "step": 1117 + }, + { + "epoch": 0.0885719944543474, + "grad_norm": 4.9682610621552685, + "learning_rate": 1.9820740937092656e-05, + "loss": 0.5624, + "step": 1118 + }, + { + "epoch": 0.08865121806298278, + "grad_norm": 3.364726466311942, + "learning_rate": 1.982025692887682e-05, + "loss": 0.5158, + "step": 1119 + }, + { + "epoch": 0.08873044167161814, + "grad_norm": 2.9285130067546015, + "learning_rate": 1.9819772274044323e-05, + "loss": 0.3909, + "step": 1120 + }, + { + "epoch": 0.08880966528025351, + "grad_norm": 2.5780257901984354, + "learning_rate": 1.9819286972627066e-05, + "loss": 0.377, + "step": 1121 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.7393603295608515, + "learning_rate": 1.9818801024657014e-05, + "loss": 0.476, + "step": 1122 + }, + { + "epoch": 0.08896811249752426, + "grad_norm": 2.667074727959154, + "learning_rate": 1.9818314430166158e-05, + "loss": 0.4401, + "step": 1123 + }, + { + "epoch": 0.08904733610615964, + "grad_norm": 3.07080161359432, + "learning_rate": 1.981782718918654e-05, + "loss": 0.4849, + "step": 1124 + }, + { + "epoch": 0.08912655971479501, + "grad_norm": 2.693848335134863, + "learning_rate": 1.981733930175024e-05, + "loss": 0.3686, + "step": 1125 + }, + { + "epoch": 0.08920578332343038, + "grad_norm": 2.790447841814945, + "learning_rate": 1.9816850767889387e-05, + "loss": 0.4372, + "step": 1126 + }, + { + "epoch": 0.08928500693206576, + "grad_norm": 3.188075691790254, + "learning_rate": 1.9816361587636143e-05, + "loss": 0.5137, + "step": 1127 + }, + { + "epoch": 0.08936423054070113, + "grad_norm": 3.4018445055391227, + "learning_rate": 1.9815871761022727e-05, + "loss": 0.5543, + "step": 1128 + }, + { + "epoch": 0.08944345414933651, + "grad_norm": 2.9845546392497972, + "learning_rate": 1.9815381288081382e-05, + "loss": 0.4302, + "step": 1129 + }, + { + "epoch": 0.08952267775797187, + "grad_norm": 3.962935201548869, + "learning_rate": 1.9814890168844412e-05, + "loss": 0.559, + "step": 1130 + }, + { + "epoch": 0.08960190136660724, + "grad_norm": 2.690738336027175, + "learning_rate": 1.981439840334415e-05, + "loss": 0.4805, + "step": 1131 + }, + { + "epoch": 0.08968112497524262, + "grad_norm": 3.009150897335099, + "learning_rate": 1.9813905991612974e-05, + "loss": 0.5308, + "step": 1132 + }, + { + "epoch": 0.08976034858387799, + "grad_norm": 3.0239670839507853, + "learning_rate": 1.9813412933683312e-05, + "loss": 0.4036, + "step": 1133 + }, + { + "epoch": 0.08983957219251337, + "grad_norm": 2.907786408112091, + "learning_rate": 1.9812919229587626e-05, + "loss": 0.3564, + "step": 1134 + }, + { + "epoch": 0.08991879580114874, + "grad_norm": 3.0029111382877915, + "learning_rate": 1.9812424879358424e-05, + "loss": 0.4467, + "step": 1135 + }, + { + "epoch": 0.08999801940978412, + "grad_norm": 3.1882872194797196, + "learning_rate": 1.981192988302826e-05, + "loss": 0.5825, + "step": 1136 + }, + { + "epoch": 0.09007724301841949, + "grad_norm": 2.789041952722907, + "learning_rate": 1.981143424062973e-05, + "loss": 0.3841, + "step": 1137 + }, + { + "epoch": 0.09015646662705486, + "grad_norm": 3.575546504062069, + "learning_rate": 1.981093795219546e-05, + "loss": 0.5407, + "step": 1138 + }, + { + "epoch": 0.09023569023569024, + "grad_norm": 3.308899760154985, + "learning_rate": 1.9810441017758132e-05, + "loss": 0.4473, + "step": 1139 + }, + { + "epoch": 0.0903149138443256, + "grad_norm": 2.7232902149269957, + "learning_rate": 1.980994343735047e-05, + "loss": 0.4297, + "step": 1140 + }, + { + "epoch": 0.09039413745296099, + "grad_norm": 3.480343543569543, + "learning_rate": 1.9809445211005235e-05, + "loss": 0.4717, + "step": 1141 + }, + { + "epoch": 0.09047336106159636, + "grad_norm": 2.9985786572621125, + "learning_rate": 1.980894633875523e-05, + "loss": 0.488, + "step": 1142 + }, + { + "epoch": 0.09055258467023172, + "grad_norm": 2.875454160577531, + "learning_rate": 1.980844682063331e-05, + "loss": 0.4453, + "step": 1143 + }, + { + "epoch": 0.0906318082788671, + "grad_norm": 2.991050857390452, + "learning_rate": 1.980794665667236e-05, + "loss": 0.5236, + "step": 1144 + }, + { + "epoch": 0.09071103188750247, + "grad_norm": 2.7301328562405707, + "learning_rate": 1.9807445846905316e-05, + "loss": 0.5338, + "step": 1145 + }, + { + "epoch": 0.09079025549613785, + "grad_norm": 2.8157463123758677, + "learning_rate": 1.980694439136515e-05, + "loss": 0.5377, + "step": 1146 + }, + { + "epoch": 0.09086947910477322, + "grad_norm": 2.551010685786562, + "learning_rate": 1.980644229008489e-05, + "loss": 0.4777, + "step": 1147 + }, + { + "epoch": 0.0909487027134086, + "grad_norm": 2.759879934873259, + "learning_rate": 1.9805939543097586e-05, + "loss": 0.4694, + "step": 1148 + }, + { + "epoch": 0.09102792632204397, + "grad_norm": 2.8045017592137285, + "learning_rate": 1.9805436150436352e-05, + "loss": 0.4082, + "step": 1149 + }, + { + "epoch": 0.09110714993067934, + "grad_norm": 2.6590287712701115, + "learning_rate": 1.9804932112134323e-05, + "loss": 0.416, + "step": 1150 + }, + { + "epoch": 0.09118637353931472, + "grad_norm": 3.3508584458774107, + "learning_rate": 1.9804427428224696e-05, + "loss": 0.4654, + "step": 1151 + }, + { + "epoch": 0.09126559714795009, + "grad_norm": 3.6605674652380555, + "learning_rate": 1.9803922098740696e-05, + "loss": 0.4626, + "step": 1152 + }, + { + "epoch": 0.09134482075658547, + "grad_norm": 2.8853471445108805, + "learning_rate": 1.98034161237156e-05, + "loss": 0.5001, + "step": 1153 + }, + { + "epoch": 0.09142404436522084, + "grad_norm": 2.805697083500706, + "learning_rate": 1.9802909503182722e-05, + "loss": 0.4854, + "step": 1154 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 3.210169831192453, + "learning_rate": 1.9802402237175426e-05, + "loss": 0.4608, + "step": 1155 + }, + { + "epoch": 0.09158249158249158, + "grad_norm": 2.4652169232103653, + "learning_rate": 1.9801894325727104e-05, + "loss": 0.3638, + "step": 1156 + }, + { + "epoch": 0.09166171519112695, + "grad_norm": 3.13335446589247, + "learning_rate": 1.980138576887121e-05, + "loss": 0.6432, + "step": 1157 + }, + { + "epoch": 0.09174093879976233, + "grad_norm": 2.499935202609419, + "learning_rate": 1.980087656664122e-05, + "loss": 0.3888, + "step": 1158 + }, + { + "epoch": 0.0918201624083977, + "grad_norm": 2.911496172934588, + "learning_rate": 1.9800366719070668e-05, + "loss": 0.5239, + "step": 1159 + }, + { + "epoch": 0.09189938601703307, + "grad_norm": 3.309044976537077, + "learning_rate": 1.9799856226193125e-05, + "loss": 0.4401, + "step": 1160 + }, + { + "epoch": 0.09197860962566845, + "grad_norm": 3.003601119925414, + "learning_rate": 1.97993450880422e-05, + "loss": 0.4165, + "step": 1161 + }, + { + "epoch": 0.09205783323430382, + "grad_norm": 2.949334265018345, + "learning_rate": 1.9798833304651555e-05, + "loss": 0.4954, + "step": 1162 + }, + { + "epoch": 0.0921370568429392, + "grad_norm": 3.6785178209257974, + "learning_rate": 1.9798320876054882e-05, + "loss": 0.42, + "step": 1163 + }, + { + "epoch": 0.09221628045157457, + "grad_norm": 3.235641668418051, + "learning_rate": 1.9797807802285933e-05, + "loss": 0.5237, + "step": 1164 + }, + { + "epoch": 0.09229550406020995, + "grad_norm": 2.7879452355058136, + "learning_rate": 1.979729408337848e-05, + "loss": 0.4528, + "step": 1165 + }, + { + "epoch": 0.09237472766884532, + "grad_norm": 3.081500480265576, + "learning_rate": 1.9796779719366355e-05, + "loss": 0.4927, + "step": 1166 + }, + { + "epoch": 0.09245395127748068, + "grad_norm": 3.3545869585421304, + "learning_rate": 1.9796264710283425e-05, + "loss": 0.6186, + "step": 1167 + }, + { + "epoch": 0.09253317488611607, + "grad_norm": 3.4452432551640264, + "learning_rate": 1.9795749056163595e-05, + "loss": 0.48, + "step": 1168 + }, + { + "epoch": 0.09261239849475143, + "grad_norm": 2.5107690026929324, + "learning_rate": 1.9795232757040827e-05, + "loss": 0.3089, + "step": 1169 + }, + { + "epoch": 0.09269162210338681, + "grad_norm": 2.96025874370993, + "learning_rate": 1.9794715812949117e-05, + "loss": 0.3997, + "step": 1170 + }, + { + "epoch": 0.09277084571202218, + "grad_norm": 3.041303371007694, + "learning_rate": 1.9794198223922496e-05, + "loss": 0.5336, + "step": 1171 + }, + { + "epoch": 0.09285006932065755, + "grad_norm": 2.6198516778156384, + "learning_rate": 1.979367998999505e-05, + "loss": 0.3426, + "step": 1172 + }, + { + "epoch": 0.09292929292929293, + "grad_norm": 3.42753698324827, + "learning_rate": 1.97931611112009e-05, + "loss": 0.4556, + "step": 1173 + }, + { + "epoch": 0.0930085165379283, + "grad_norm": 2.6469552661719633, + "learning_rate": 1.9792641587574212e-05, + "loss": 0.3358, + "step": 1174 + }, + { + "epoch": 0.09308774014656368, + "grad_norm": 2.6983144734015796, + "learning_rate": 1.9792121419149196e-05, + "loss": 0.4735, + "step": 1175 + }, + { + "epoch": 0.09316696375519905, + "grad_norm": 2.863517246247922, + "learning_rate": 1.97916006059601e-05, + "loss": 0.4365, + "step": 1176 + }, + { + "epoch": 0.09324618736383443, + "grad_norm": 3.3008308287626313, + "learning_rate": 1.979107914804122e-05, + "loss": 0.4891, + "step": 1177 + }, + { + "epoch": 0.0933254109724698, + "grad_norm": 3.0788316380030936, + "learning_rate": 1.979055704542689e-05, + "loss": 0.5466, + "step": 1178 + }, + { + "epoch": 0.09340463458110516, + "grad_norm": 2.7879878542582306, + "learning_rate": 1.9790034298151486e-05, + "loss": 0.3846, + "step": 1179 + }, + { + "epoch": 0.09348385818974055, + "grad_norm": 2.6921605687371724, + "learning_rate": 1.9789510906249432e-05, + "loss": 0.418, + "step": 1180 + }, + { + "epoch": 0.09356308179837591, + "grad_norm": 3.339369111681325, + "learning_rate": 1.9788986869755187e-05, + "loss": 0.4613, + "step": 1181 + }, + { + "epoch": 0.0936423054070113, + "grad_norm": 2.862343906737892, + "learning_rate": 1.978846218870326e-05, + "loss": 0.7313, + "step": 1182 + }, + { + "epoch": 0.09372152901564666, + "grad_norm": 2.787712227607083, + "learning_rate": 1.9787936863128195e-05, + "loss": 0.3882, + "step": 1183 + }, + { + "epoch": 0.09380075262428203, + "grad_norm": 2.988853974362241, + "learning_rate": 1.9787410893064584e-05, + "loss": 0.5426, + "step": 1184 + }, + { + "epoch": 0.09387997623291741, + "grad_norm": 2.5836090903970055, + "learning_rate": 1.978688427854706e-05, + "loss": 0.4528, + "step": 1185 + }, + { + "epoch": 0.09395919984155278, + "grad_norm": 2.9320522894001892, + "learning_rate": 1.97863570196103e-05, + "loss": 0.4741, + "step": 1186 + }, + { + "epoch": 0.09403842345018816, + "grad_norm": 3.2130123005940767, + "learning_rate": 1.9785829116289017e-05, + "loss": 0.5473, + "step": 1187 + }, + { + "epoch": 0.09411764705882353, + "grad_norm": 2.792945193691272, + "learning_rate": 1.9785300568617973e-05, + "loss": 0.4636, + "step": 1188 + }, + { + "epoch": 0.09419687066745891, + "grad_norm": 2.7664361018067676, + "learning_rate": 1.978477137663197e-05, + "loss": 0.5127, + "step": 1189 + }, + { + "epoch": 0.09427609427609428, + "grad_norm": 3.2046791554771996, + "learning_rate": 1.9784241540365856e-05, + "loss": 0.5447, + "step": 1190 + }, + { + "epoch": 0.09435531788472964, + "grad_norm": 2.5187704934378017, + "learning_rate": 1.9783711059854514e-05, + "loss": 0.4364, + "step": 1191 + }, + { + "epoch": 0.09443454149336503, + "grad_norm": 2.955054836243262, + "learning_rate": 1.9783179935132874e-05, + "loss": 0.4794, + "step": 1192 + }, + { + "epoch": 0.0945137651020004, + "grad_norm": 3.305740028331367, + "learning_rate": 1.978264816623591e-05, + "loss": 0.4308, + "step": 1193 + }, + { + "epoch": 0.09459298871063578, + "grad_norm": 2.519206562418568, + "learning_rate": 1.9782115753198633e-05, + "loss": 0.4597, + "step": 1194 + }, + { + "epoch": 0.09467221231927114, + "grad_norm": 2.823003234561981, + "learning_rate": 1.9781582696056105e-05, + "loss": 0.4541, + "step": 1195 + }, + { + "epoch": 0.09475143592790651, + "grad_norm": 2.997973325370504, + "learning_rate": 1.9781048994843423e-05, + "loss": 0.5152, + "step": 1196 + }, + { + "epoch": 0.09483065953654189, + "grad_norm": 2.7588941737240047, + "learning_rate": 1.9780514649595727e-05, + "loss": 0.3504, + "step": 1197 + }, + { + "epoch": 0.09490988314517726, + "grad_norm": 2.962805872853815, + "learning_rate": 1.97799796603482e-05, + "loss": 0.5334, + "step": 1198 + }, + { + "epoch": 0.09498910675381264, + "grad_norm": 2.986469273864763, + "learning_rate": 1.9779444027136075e-05, + "loss": 0.5183, + "step": 1199 + }, + { + "epoch": 0.09506833036244801, + "grad_norm": 2.693312992978399, + "learning_rate": 1.977890774999461e-05, + "loss": 0.4658, + "step": 1200 + }, + { + "epoch": 0.09514755397108338, + "grad_norm": 2.933876446691811, + "learning_rate": 1.977837082895913e-05, + "loss": 0.4742, + "step": 1201 + }, + { + "epoch": 0.09522677757971876, + "grad_norm": 3.0448273571919056, + "learning_rate": 1.9777833264064977e-05, + "loss": 0.3984, + "step": 1202 + }, + { + "epoch": 0.09530600118835413, + "grad_norm": 2.919121030526456, + "learning_rate": 1.9777295055347553e-05, + "loss": 0.3911, + "step": 1203 + }, + { + "epoch": 0.0953852247969895, + "grad_norm": 2.7335837293134273, + "learning_rate": 1.9776756202842297e-05, + "loss": 0.3838, + "step": 1204 + }, + { + "epoch": 0.09546444840562487, + "grad_norm": 3.9222950670949848, + "learning_rate": 1.9776216706584682e-05, + "loss": 0.5776, + "step": 1205 + }, + { + "epoch": 0.09554367201426026, + "grad_norm": 2.808846082632548, + "learning_rate": 1.977567656661024e-05, + "loss": 0.4147, + "step": 1206 + }, + { + "epoch": 0.09562289562289562, + "grad_norm": 2.6027134733234436, + "learning_rate": 1.9775135782954534e-05, + "loss": 0.3953, + "step": 1207 + }, + { + "epoch": 0.09570211923153099, + "grad_norm": 2.802753304304745, + "learning_rate": 1.9774594355653175e-05, + "loss": 0.4492, + "step": 1208 + }, + { + "epoch": 0.09578134284016637, + "grad_norm": 2.9894080991048577, + "learning_rate": 1.9774052284741804e-05, + "loss": 0.4058, + "step": 1209 + }, + { + "epoch": 0.09586056644880174, + "grad_norm": 3.2959667997472355, + "learning_rate": 1.9773509570256124e-05, + "loss": 0.5794, + "step": 1210 + }, + { + "epoch": 0.09593979005743712, + "grad_norm": 3.4460465410906234, + "learning_rate": 1.9772966212231863e-05, + "loss": 0.4913, + "step": 1211 + }, + { + "epoch": 0.09601901366607249, + "grad_norm": 3.2370611897191317, + "learning_rate": 1.9772422210704803e-05, + "loss": 0.495, + "step": 1212 + }, + { + "epoch": 0.09609823727470786, + "grad_norm": 2.510565167325101, + "learning_rate": 1.977187756571076e-05, + "loss": 0.4402, + "step": 1213 + }, + { + "epoch": 0.09617746088334324, + "grad_norm": 3.3067656355834982, + "learning_rate": 1.9771332277285603e-05, + "loss": 0.4603, + "step": 1214 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 2.9415589484257945, + "learning_rate": 1.977078634546523e-05, + "loss": 0.5573, + "step": 1215 + }, + { + "epoch": 0.09633590810061399, + "grad_norm": 3.0337215428094932, + "learning_rate": 1.977023977028559e-05, + "loss": 0.5792, + "step": 1216 + }, + { + "epoch": 0.09641513170924935, + "grad_norm": 2.717568010051802, + "learning_rate": 1.9769692551782672e-05, + "loss": 0.5404, + "step": 1217 + }, + { + "epoch": 0.09649435531788474, + "grad_norm": 2.851477156302294, + "learning_rate": 1.976914468999251e-05, + "loss": 0.4395, + "step": 1218 + }, + { + "epoch": 0.0965735789265201, + "grad_norm": 3.739586339973208, + "learning_rate": 1.9768596184951174e-05, + "loss": 0.576, + "step": 1219 + }, + { + "epoch": 0.09665280253515547, + "grad_norm": 2.9827582120586724, + "learning_rate": 1.9768047036694785e-05, + "loss": 0.565, + "step": 1220 + }, + { + "epoch": 0.09673202614379085, + "grad_norm": 3.3495330648470376, + "learning_rate": 1.9767497245259496e-05, + "loss": 0.4631, + "step": 1221 + }, + { + "epoch": 0.09681124975242622, + "grad_norm": 2.7190958499253717, + "learning_rate": 1.9766946810681517e-05, + "loss": 0.4555, + "step": 1222 + }, + { + "epoch": 0.0968904733610616, + "grad_norm": 2.7463302295366, + "learning_rate": 1.9766395732997082e-05, + "loss": 0.4362, + "step": 1223 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 3.0146532239628328, + "learning_rate": 1.9765844012242482e-05, + "loss": 0.4235, + "step": 1224 + }, + { + "epoch": 0.09704892057833234, + "grad_norm": 3.3067482693186787, + "learning_rate": 1.9765291648454042e-05, + "loss": 0.4175, + "step": 1225 + }, + { + "epoch": 0.09712814418696772, + "grad_norm": 3.043234149394331, + "learning_rate": 1.9764738641668137e-05, + "loss": 0.49, + "step": 1226 + }, + { + "epoch": 0.09720736779560309, + "grad_norm": 3.091418048606579, + "learning_rate": 1.9764184991921178e-05, + "loss": 0.4488, + "step": 1227 + }, + { + "epoch": 0.09728659140423847, + "grad_norm": 2.882853625788255, + "learning_rate": 1.9763630699249615e-05, + "loss": 0.3688, + "step": 1228 + }, + { + "epoch": 0.09736581501287384, + "grad_norm": 3.0998745117146336, + "learning_rate": 1.9763075763689956e-05, + "loss": 0.3627, + "step": 1229 + }, + { + "epoch": 0.0974450386215092, + "grad_norm": 2.5904061856839493, + "learning_rate": 1.9762520185278734e-05, + "loss": 0.3482, + "step": 1230 + }, + { + "epoch": 0.09752426223014458, + "grad_norm": 3.017908058183374, + "learning_rate": 1.9761963964052528e-05, + "loss": 0.4379, + "step": 1231 + }, + { + "epoch": 0.09760348583877995, + "grad_norm": 2.8388266105162, + "learning_rate": 1.976140710004797e-05, + "loss": 0.5507, + "step": 1232 + }, + { + "epoch": 0.09768270944741533, + "grad_norm": 2.596324747927256, + "learning_rate": 1.976084959330172e-05, + "loss": 0.4643, + "step": 1233 + }, + { + "epoch": 0.0977619330560507, + "grad_norm": 2.7009960126368, + "learning_rate": 1.9760291443850496e-05, + "loss": 0.4732, + "step": 1234 + }, + { + "epoch": 0.09784115666468608, + "grad_norm": 3.1240752238483327, + "learning_rate": 1.9759732651731037e-05, + "loss": 0.4105, + "step": 1235 + }, + { + "epoch": 0.09792038027332145, + "grad_norm": 3.356670884619534, + "learning_rate": 1.975917321698015e-05, + "loss": 0.4793, + "step": 1236 + }, + { + "epoch": 0.09799960388195682, + "grad_norm": 2.720612851275538, + "learning_rate": 1.9758613139634662e-05, + "loss": 0.4929, + "step": 1237 + }, + { + "epoch": 0.0980788274905922, + "grad_norm": 3.127615023122187, + "learning_rate": 1.975805241973145e-05, + "loss": 0.5664, + "step": 1238 + }, + { + "epoch": 0.09815805109922757, + "grad_norm": 3.1065795642149774, + "learning_rate": 1.9757491057307448e-05, + "loss": 0.5476, + "step": 1239 + }, + { + "epoch": 0.09823727470786295, + "grad_norm": 2.4365780324468327, + "learning_rate": 1.9756929052399606e-05, + "loss": 0.4285, + "step": 1240 + }, + { + "epoch": 0.09831649831649832, + "grad_norm": 2.6205445849463302, + "learning_rate": 1.9756366405044928e-05, + "loss": 0.4204, + "step": 1241 + }, + { + "epoch": 0.09839572192513368, + "grad_norm": 2.5536730629831377, + "learning_rate": 1.9755803115280476e-05, + "loss": 0.4855, + "step": 1242 + }, + { + "epoch": 0.09847494553376906, + "grad_norm": 2.6571849723312058, + "learning_rate": 1.9755239183143323e-05, + "loss": 0.4306, + "step": 1243 + }, + { + "epoch": 0.09855416914240443, + "grad_norm": 2.7918639991751464, + "learning_rate": 1.9754674608670613e-05, + "loss": 0.5285, + "step": 1244 + }, + { + "epoch": 0.09863339275103981, + "grad_norm": 2.6216234041412934, + "learning_rate": 1.9754109391899514e-05, + "loss": 0.4769, + "step": 1245 + }, + { + "epoch": 0.09871261635967518, + "grad_norm": 2.6346465405630406, + "learning_rate": 1.975354353286725e-05, + "loss": 0.4246, + "step": 1246 + }, + { + "epoch": 0.09879183996831056, + "grad_norm": 2.6617286777674134, + "learning_rate": 1.9752977031611072e-05, + "loss": 0.4167, + "step": 1247 + }, + { + "epoch": 0.09887106357694593, + "grad_norm": 3.3104649516555256, + "learning_rate": 1.9752409888168285e-05, + "loss": 0.5252, + "step": 1248 + }, + { + "epoch": 0.0989502871855813, + "grad_norm": 2.6979592352141184, + "learning_rate": 1.975184210257623e-05, + "loss": 0.4282, + "step": 1249 + }, + { + "epoch": 0.09902951079421668, + "grad_norm": 2.558992126951326, + "learning_rate": 1.97512736748723e-05, + "loss": 0.3617, + "step": 1250 + }, + { + "epoch": 0.09910873440285205, + "grad_norm": 2.9826455564027654, + "learning_rate": 1.975070460509392e-05, + "loss": 0.4462, + "step": 1251 + }, + { + "epoch": 0.09918795801148743, + "grad_norm": 2.7119674793199007, + "learning_rate": 1.9750134893278553e-05, + "loss": 0.4626, + "step": 1252 + }, + { + "epoch": 0.0992671816201228, + "grad_norm": 2.737371496650108, + "learning_rate": 1.974956453946372e-05, + "loss": 0.3724, + "step": 1253 + }, + { + "epoch": 0.09934640522875816, + "grad_norm": 2.9481517962377763, + "learning_rate": 1.9748993543686973e-05, + "loss": 0.4287, + "step": 1254 + }, + { + "epoch": 0.09942562883739355, + "grad_norm": 2.7451605587656234, + "learning_rate": 1.9748421905985915e-05, + "loss": 0.461, + "step": 1255 + }, + { + "epoch": 0.09950485244602891, + "grad_norm": 3.0646336125877243, + "learning_rate": 1.9747849626398176e-05, + "loss": 0.4566, + "step": 1256 + }, + { + "epoch": 0.0995840760546643, + "grad_norm": 2.501200872654823, + "learning_rate": 1.9747276704961447e-05, + "loss": 0.3885, + "step": 1257 + }, + { + "epoch": 0.09966329966329966, + "grad_norm": 2.876388194153617, + "learning_rate": 1.9746703141713444e-05, + "loss": 0.4522, + "step": 1258 + }, + { + "epoch": 0.09974252327193504, + "grad_norm": 2.489857036328931, + "learning_rate": 1.974612893669194e-05, + "loss": 0.4146, + "step": 1259 + }, + { + "epoch": 0.09982174688057041, + "grad_norm": 3.3931939435239302, + "learning_rate": 1.974555408993474e-05, + "loss": 0.4765, + "step": 1260 + }, + { + "epoch": 0.09990097048920578, + "grad_norm": 3.257095038887568, + "learning_rate": 1.9744978601479693e-05, + "loss": 0.4967, + "step": 1261 + }, + { + "epoch": 0.09998019409784116, + "grad_norm": 2.7452639496150155, + "learning_rate": 1.97444024713647e-05, + "loss": 0.4455, + "step": 1262 + }, + { + "epoch": 0.10005941770647653, + "grad_norm": 2.7224516279761803, + "learning_rate": 1.9743825699627687e-05, + "loss": 0.4726, + "step": 1263 + }, + { + "epoch": 0.10013864131511191, + "grad_norm": 2.8141386805136372, + "learning_rate": 1.974324828630664e-05, + "loss": 0.5204, + "step": 1264 + }, + { + "epoch": 0.10021786492374728, + "grad_norm": 2.8844951089038564, + "learning_rate": 1.974267023143957e-05, + "loss": 0.4311, + "step": 1265 + }, + { + "epoch": 0.10029708853238264, + "grad_norm": 3.543302427401054, + "learning_rate": 1.974209153506455e-05, + "loss": 0.6045, + "step": 1266 + }, + { + "epoch": 0.10037631214101803, + "grad_norm": 3.1033939236310184, + "learning_rate": 1.9741512197219675e-05, + "loss": 0.4325, + "step": 1267 + }, + { + "epoch": 0.1004555357496534, + "grad_norm": 3.004199142849806, + "learning_rate": 1.9740932217943095e-05, + "loss": 0.4457, + "step": 1268 + }, + { + "epoch": 0.10053475935828877, + "grad_norm": 2.7598379335663807, + "learning_rate": 1.9740351597272998e-05, + "loss": 0.4384, + "step": 1269 + }, + { + "epoch": 0.10061398296692414, + "grad_norm": 3.3023441102956275, + "learning_rate": 1.9739770335247616e-05, + "loss": 0.5227, + "step": 1270 + }, + { + "epoch": 0.10069320657555951, + "grad_norm": 2.8851054191049954, + "learning_rate": 1.9739188431905223e-05, + "loss": 0.5745, + "step": 1271 + }, + { + "epoch": 0.10077243018419489, + "grad_norm": 2.6875763903633914, + "learning_rate": 1.9738605887284134e-05, + "loss": 0.4829, + "step": 1272 + }, + { + "epoch": 0.10085165379283026, + "grad_norm": 2.7641889321808772, + "learning_rate": 1.9738022701422705e-05, + "loss": 0.5405, + "step": 1273 + }, + { + "epoch": 0.10093087740146564, + "grad_norm": 2.4921301903870243, + "learning_rate": 1.973743887435934e-05, + "loss": 0.3679, + "step": 1274 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 2.951184153446708, + "learning_rate": 1.9736854406132476e-05, + "loss": 0.3986, + "step": 1275 + }, + { + "epoch": 0.10108932461873639, + "grad_norm": 3.0518140539278154, + "learning_rate": 1.9736269296780603e-05, + "loss": 0.4344, + "step": 1276 + }, + { + "epoch": 0.10116854822737176, + "grad_norm": 2.8974286033719823, + "learning_rate": 1.9735683546342243e-05, + "loss": 0.4743, + "step": 1277 + }, + { + "epoch": 0.10124777183600712, + "grad_norm": 3.306914067951883, + "learning_rate": 1.9735097154855968e-05, + "loss": 0.4694, + "step": 1278 + }, + { + "epoch": 0.1013269954446425, + "grad_norm": 2.835824077886425, + "learning_rate": 1.9734510122360383e-05, + "loss": 0.6355, + "step": 1279 + }, + { + "epoch": 0.10140621905327787, + "grad_norm": 2.9306969134881222, + "learning_rate": 1.973392244889415e-05, + "loss": 0.4893, + "step": 1280 + }, + { + "epoch": 0.10148544266191326, + "grad_norm": 2.634497960105454, + "learning_rate": 1.9733334134495963e-05, + "loss": 0.4683, + "step": 1281 + }, + { + "epoch": 0.10156466627054862, + "grad_norm": 2.436570027417308, + "learning_rate": 1.9732745179204553e-05, + "loss": 0.4734, + "step": 1282 + }, + { + "epoch": 0.10164388987918399, + "grad_norm": 2.6218428136824006, + "learning_rate": 1.9732155583058705e-05, + "loss": 0.4703, + "step": 1283 + }, + { + "epoch": 0.10172311348781937, + "grad_norm": 3.512924796677076, + "learning_rate": 1.973156534609724e-05, + "loss": 0.484, + "step": 1284 + }, + { + "epoch": 0.10180233709645474, + "grad_norm": 2.6546275957915006, + "learning_rate": 1.973097446835902e-05, + "loss": 0.4453, + "step": 1285 + }, + { + "epoch": 0.10188156070509012, + "grad_norm": 2.4609997246849944, + "learning_rate": 1.9730382949882955e-05, + "loss": 0.3135, + "step": 1286 + }, + { + "epoch": 0.10196078431372549, + "grad_norm": 3.120760233598349, + "learning_rate": 1.9729790790707995e-05, + "loss": 0.4991, + "step": 1287 + }, + { + "epoch": 0.10204000792236087, + "grad_norm": 3.866788013434518, + "learning_rate": 1.9729197990873127e-05, + "loss": 0.3917, + "step": 1288 + }, + { + "epoch": 0.10211923153099624, + "grad_norm": 2.554138894135903, + "learning_rate": 1.9728604550417385e-05, + "loss": 0.4203, + "step": 1289 + }, + { + "epoch": 0.1021984551396316, + "grad_norm": 2.7615173897506966, + "learning_rate": 1.9728010469379844e-05, + "loss": 0.3992, + "step": 1290 + }, + { + "epoch": 0.10227767874826699, + "grad_norm": 3.2799484048313845, + "learning_rate": 1.972741574779962e-05, + "loss": 0.5083, + "step": 1291 + }, + { + "epoch": 0.10235690235690235, + "grad_norm": 2.53808381770956, + "learning_rate": 1.9726820385715877e-05, + "loss": 0.3927, + "step": 1292 + }, + { + "epoch": 0.10243612596553774, + "grad_norm": 3.3855451710657025, + "learning_rate": 1.9726224383167815e-05, + "loss": 0.4632, + "step": 1293 + }, + { + "epoch": 0.1025153495741731, + "grad_norm": 2.5786065014178354, + "learning_rate": 1.9725627740194673e-05, + "loss": 0.396, + "step": 1294 + }, + { + "epoch": 0.10259457318280847, + "grad_norm": 2.5489295481440664, + "learning_rate": 1.9725030456835745e-05, + "loss": 0.3758, + "step": 1295 + }, + { + "epoch": 0.10267379679144385, + "grad_norm": 3.414059813835967, + "learning_rate": 1.9724432533130355e-05, + "loss": 0.5763, + "step": 1296 + }, + { + "epoch": 0.10275302040007922, + "grad_norm": 3.114689554867816, + "learning_rate": 1.972383396911787e-05, + "loss": 0.5306, + "step": 1297 + }, + { + "epoch": 0.1028322440087146, + "grad_norm": 3.4382850312425535, + "learning_rate": 1.9723234764837708e-05, + "loss": 0.4194, + "step": 1298 + }, + { + "epoch": 0.10291146761734997, + "grad_norm": 2.756221532025514, + "learning_rate": 1.9722634920329323e-05, + "loss": 0.4148, + "step": 1299 + }, + { + "epoch": 0.10299069122598534, + "grad_norm": 2.6681005484047198, + "learning_rate": 1.9722034435632207e-05, + "loss": 0.5743, + "step": 1300 + }, + { + "epoch": 0.10306991483462072, + "grad_norm": 3.221298897992381, + "learning_rate": 1.972143331078591e-05, + "loss": 0.5955, + "step": 1301 + }, + { + "epoch": 0.10314913844325609, + "grad_norm": 2.768257942623405, + "learning_rate": 1.972083154583e-05, + "loss": 0.4405, + "step": 1302 + }, + { + "epoch": 0.10322836205189147, + "grad_norm": 2.7263801265215974, + "learning_rate": 1.972022914080411e-05, + "loss": 0.3928, + "step": 1303 + }, + { + "epoch": 0.10330758566052684, + "grad_norm": 2.7670407144370515, + "learning_rate": 1.9719626095747897e-05, + "loss": 0.4937, + "step": 1304 + }, + { + "epoch": 0.10338680926916222, + "grad_norm": 2.2903908378339004, + "learning_rate": 1.971902241070108e-05, + "loss": 0.4047, + "step": 1305 + }, + { + "epoch": 0.10346603287779758, + "grad_norm": 3.3008979763050204, + "learning_rate": 1.9718418085703397e-05, + "loss": 0.5761, + "step": 1306 + }, + { + "epoch": 0.10354525648643295, + "grad_norm": 3.2484549345584584, + "learning_rate": 1.971781312079465e-05, + "loss": 0.4477, + "step": 1307 + }, + { + "epoch": 0.10362448009506833, + "grad_norm": 2.3328468828519147, + "learning_rate": 1.9717207516014664e-05, + "loss": 0.4287, + "step": 1308 + }, + { + "epoch": 0.1037037037037037, + "grad_norm": 2.6045631411131014, + "learning_rate": 1.9716601271403322e-05, + "loss": 0.3855, + "step": 1309 + }, + { + "epoch": 0.10378292731233908, + "grad_norm": 2.8979261277228665, + "learning_rate": 1.9715994387000537e-05, + "loss": 0.5173, + "step": 1310 + }, + { + "epoch": 0.10386215092097445, + "grad_norm": 2.890078085384356, + "learning_rate": 1.9715386862846272e-05, + "loss": 0.5025, + "step": 1311 + }, + { + "epoch": 0.10394137452960982, + "grad_norm": 3.6272945895360142, + "learning_rate": 1.971477869898053e-05, + "loss": 0.464, + "step": 1312 + }, + { + "epoch": 0.1040205981382452, + "grad_norm": 2.944361639139367, + "learning_rate": 1.9714169895443357e-05, + "loss": 0.4089, + "step": 1313 + }, + { + "epoch": 0.10409982174688057, + "grad_norm": 3.3656323936878616, + "learning_rate": 1.971356045227484e-05, + "loss": 0.5242, + "step": 1314 + }, + { + "epoch": 0.10417904535551595, + "grad_norm": 3.4605313243018108, + "learning_rate": 1.97129503695151e-05, + "loss": 0.4686, + "step": 1315 + }, + { + "epoch": 0.10425826896415132, + "grad_norm": 3.0248871789387266, + "learning_rate": 1.9712339647204313e-05, + "loss": 0.4329, + "step": 1316 + }, + { + "epoch": 0.1043374925727867, + "grad_norm": 2.287099410442511, + "learning_rate": 1.97117282853827e-05, + "loss": 0.2722, + "step": 1317 + }, + { + "epoch": 0.10441671618142206, + "grad_norm": 3.4545845525213936, + "learning_rate": 1.9711116284090506e-05, + "loss": 0.5384, + "step": 1318 + }, + { + "epoch": 0.10449593979005743, + "grad_norm": 3.2054629517779527, + "learning_rate": 1.971050364336803e-05, + "loss": 0.5123, + "step": 1319 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 3.140498249635979, + "learning_rate": 1.9709890363255617e-05, + "loss": 0.3961, + "step": 1320 + }, + { + "epoch": 0.10465438700732818, + "grad_norm": 2.8254917972564324, + "learning_rate": 1.9709276443793638e-05, + "loss": 0.5797, + "step": 1321 + }, + { + "epoch": 0.10473361061596356, + "grad_norm": 2.8340757617909627, + "learning_rate": 1.970866188502253e-05, + "loss": 0.6164, + "step": 1322 + }, + { + "epoch": 0.10481283422459893, + "grad_norm": 2.947562598547281, + "learning_rate": 1.970804668698275e-05, + "loss": 0.3962, + "step": 1323 + }, + { + "epoch": 0.1048920578332343, + "grad_norm": 2.6833697600873663, + "learning_rate": 1.970743084971481e-05, + "loss": 0.5205, + "step": 1324 + }, + { + "epoch": 0.10497128144186968, + "grad_norm": 2.2107233355507048, + "learning_rate": 1.970681437325925e-05, + "loss": 0.3524, + "step": 1325 + }, + { + "epoch": 0.10505050505050505, + "grad_norm": 2.8771146522101234, + "learning_rate": 1.9706197257656675e-05, + "loss": 0.5506, + "step": 1326 + }, + { + "epoch": 0.10512972865914043, + "grad_norm": 2.956514573657737, + "learning_rate": 1.9705579502947712e-05, + "loss": 0.3825, + "step": 1327 + }, + { + "epoch": 0.1052089522677758, + "grad_norm": 2.991217178321492, + "learning_rate": 1.9704961109173042e-05, + "loss": 0.5809, + "step": 1328 + }, + { + "epoch": 0.10528817587641116, + "grad_norm": 2.7483954060197235, + "learning_rate": 1.9704342076373378e-05, + "loss": 0.4556, + "step": 1329 + }, + { + "epoch": 0.10536739948504655, + "grad_norm": 2.739167393451204, + "learning_rate": 1.9703722404589484e-05, + "loss": 0.401, + "step": 1330 + }, + { + "epoch": 0.10544662309368191, + "grad_norm": 2.563836171318864, + "learning_rate": 1.970310209386216e-05, + "loss": 0.4047, + "step": 1331 + }, + { + "epoch": 0.1055258467023173, + "grad_norm": 2.9131518200373416, + "learning_rate": 1.9702481144232253e-05, + "loss": 0.4113, + "step": 1332 + }, + { + "epoch": 0.10560507031095266, + "grad_norm": 2.6988588195233487, + "learning_rate": 1.9701859555740647e-05, + "loss": 0.3987, + "step": 1333 + }, + { + "epoch": 0.10568429391958804, + "grad_norm": 2.771397946408388, + "learning_rate": 1.9701237328428272e-05, + "loss": 0.4994, + "step": 1334 + }, + { + "epoch": 0.10576351752822341, + "grad_norm": 3.0871542899897806, + "learning_rate": 1.9700614462336096e-05, + "loss": 0.5022, + "step": 1335 + }, + { + "epoch": 0.10584274113685878, + "grad_norm": 2.934758055292221, + "learning_rate": 1.9699990957505136e-05, + "loss": 0.4916, + "step": 1336 + }, + { + "epoch": 0.10592196474549416, + "grad_norm": 2.690723191029154, + "learning_rate": 1.9699366813976443e-05, + "loss": 0.4011, + "step": 1337 + }, + { + "epoch": 0.10600118835412953, + "grad_norm": 2.7481322324087953, + "learning_rate": 1.9698742031791118e-05, + "loss": 0.5052, + "step": 1338 + }, + { + "epoch": 0.10608041196276491, + "grad_norm": 2.933957788297982, + "learning_rate": 1.96981166109903e-05, + "loss": 0.3777, + "step": 1339 + }, + { + "epoch": 0.10615963557140028, + "grad_norm": 2.6683514820307073, + "learning_rate": 1.9697490551615162e-05, + "loss": 0.4573, + "step": 1340 + }, + { + "epoch": 0.10623885918003564, + "grad_norm": 2.9824026178728955, + "learning_rate": 1.9696863853706937e-05, + "loss": 0.3636, + "step": 1341 + }, + { + "epoch": 0.10631808278867103, + "grad_norm": 2.7216684954457997, + "learning_rate": 1.969623651730688e-05, + "loss": 0.3133, + "step": 1342 + }, + { + "epoch": 0.1063973063973064, + "grad_norm": 4.250271251525364, + "learning_rate": 1.969560854245631e-05, + "loss": 0.5336, + "step": 1343 + }, + { + "epoch": 0.10647653000594177, + "grad_norm": 2.902405821987489, + "learning_rate": 1.9694979929196566e-05, + "loss": 0.403, + "step": 1344 + }, + { + "epoch": 0.10655575361457714, + "grad_norm": 2.574935133721008, + "learning_rate": 1.9694350677569043e-05, + "loss": 0.3917, + "step": 1345 + }, + { + "epoch": 0.10663497722321252, + "grad_norm": 3.1614680037023533, + "learning_rate": 1.9693720787615174e-05, + "loss": 0.4921, + "step": 1346 + }, + { + "epoch": 0.10671420083184789, + "grad_norm": 3.552061231809605, + "learning_rate": 1.9693090259376436e-05, + "loss": 0.3219, + "step": 1347 + }, + { + "epoch": 0.10679342444048326, + "grad_norm": 2.5293648415989236, + "learning_rate": 1.9692459092894343e-05, + "loss": 0.467, + "step": 1348 + }, + { + "epoch": 0.10687264804911864, + "grad_norm": 2.895155207518888, + "learning_rate": 1.969182728821046e-05, + "loss": 0.5326, + "step": 1349 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 2.8137066537947257, + "learning_rate": 1.969119484536638e-05, + "loss": 0.3978, + "step": 1350 + }, + { + "epoch": 0.10703109526638939, + "grad_norm": 3.159236741602848, + "learning_rate": 1.969056176440375e-05, + "loss": 0.3872, + "step": 1351 + }, + { + "epoch": 0.10711031887502476, + "grad_norm": 2.854905581531992, + "learning_rate": 1.9689928045364258e-05, + "loss": 0.3368, + "step": 1352 + }, + { + "epoch": 0.10718954248366012, + "grad_norm": 2.7140487020464974, + "learning_rate": 1.9689293688289627e-05, + "loss": 0.4743, + "step": 1353 + }, + { + "epoch": 0.1072687660922955, + "grad_norm": 2.934282122513982, + "learning_rate": 1.968865869322163e-05, + "loss": 0.3888, + "step": 1354 + }, + { + "epoch": 0.10734798970093087, + "grad_norm": 3.0119871931050435, + "learning_rate": 1.968802306020208e-05, + "loss": 0.51, + "step": 1355 + }, + { + "epoch": 0.10742721330956626, + "grad_norm": 2.8227366406959433, + "learning_rate": 1.968738678927282e-05, + "loss": 0.5394, + "step": 1356 + }, + { + "epoch": 0.10750643691820162, + "grad_norm": 3.6029885179202052, + "learning_rate": 1.9686749880475756e-05, + "loss": 0.4084, + "step": 1357 + }, + { + "epoch": 0.107585660526837, + "grad_norm": 3.034326363589282, + "learning_rate": 1.9686112333852826e-05, + "loss": 0.4646, + "step": 1358 + }, + { + "epoch": 0.10766488413547237, + "grad_norm": 2.664156739194555, + "learning_rate": 1.9685474149446e-05, + "loss": 0.5023, + "step": 1359 + }, + { + "epoch": 0.10774410774410774, + "grad_norm": 2.9981760489161426, + "learning_rate": 1.9684835327297306e-05, + "loss": 0.4371, + "step": 1360 + }, + { + "epoch": 0.10782333135274312, + "grad_norm": 2.8022547037558785, + "learning_rate": 1.9684195867448806e-05, + "loss": 0.4122, + "step": 1361 + }, + { + "epoch": 0.10790255496137849, + "grad_norm": 2.9128267225258946, + "learning_rate": 1.9683555769942608e-05, + "loss": 0.4876, + "step": 1362 + }, + { + "epoch": 0.10798177857001387, + "grad_norm": 2.7603908869200477, + "learning_rate": 1.968291503482086e-05, + "loss": 0.3496, + "step": 1363 + }, + { + "epoch": 0.10806100217864924, + "grad_norm": 2.980576722482925, + "learning_rate": 1.968227366212574e-05, + "loss": 0.4767, + "step": 1364 + }, + { + "epoch": 0.1081402257872846, + "grad_norm": 2.6897521455535105, + "learning_rate": 1.968163165189949e-05, + "loss": 0.4636, + "step": 1365 + }, + { + "epoch": 0.10821944939591999, + "grad_norm": 3.186484192734618, + "learning_rate": 1.9680989004184383e-05, + "loss": 0.3885, + "step": 1366 + }, + { + "epoch": 0.10829867300455535, + "grad_norm": 2.8663366302588806, + "learning_rate": 1.968034571902273e-05, + "loss": 0.5326, + "step": 1367 + }, + { + "epoch": 0.10837789661319074, + "grad_norm": 3.067361813403921, + "learning_rate": 1.967970179645689e-05, + "loss": 0.2948, + "step": 1368 + }, + { + "epoch": 0.1084571202218261, + "grad_norm": 3.101527575569622, + "learning_rate": 1.9679057236529266e-05, + "loss": 0.4802, + "step": 1369 + }, + { + "epoch": 0.10853634383046147, + "grad_norm": 3.1289935421466004, + "learning_rate": 1.9678412039282292e-05, + "loss": 0.587, + "step": 1370 + }, + { + "epoch": 0.10861556743909685, + "grad_norm": 2.539607321168991, + "learning_rate": 1.967776620475846e-05, + "loss": 0.3966, + "step": 1371 + }, + { + "epoch": 0.10869479104773222, + "grad_norm": 3.0787692872874817, + "learning_rate": 1.9677119733000283e-05, + "loss": 0.4989, + "step": 1372 + }, + { + "epoch": 0.1087740146563676, + "grad_norm": 3.1317016650621956, + "learning_rate": 1.967647262405034e-05, + "loss": 0.4847, + "step": 1373 + }, + { + "epoch": 0.10885323826500297, + "grad_norm": 2.705818741295503, + "learning_rate": 1.967582487795123e-05, + "loss": 0.3081, + "step": 1374 + }, + { + "epoch": 0.10893246187363835, + "grad_norm": 2.670554827304836, + "learning_rate": 1.967517649474561e-05, + "loss": 0.3429, + "step": 1375 + }, + { + "epoch": 0.10901168548227372, + "grad_norm": 3.007658030824364, + "learning_rate": 1.9674527474476175e-05, + "loss": 0.4141, + "step": 1376 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 2.548967813017338, + "learning_rate": 1.9673877817185656e-05, + "loss": 0.3844, + "step": 1377 + }, + { + "epoch": 0.10917013269954447, + "grad_norm": 2.9694557965998007, + "learning_rate": 1.9673227522916827e-05, + "loss": 0.4681, + "step": 1378 + }, + { + "epoch": 0.10924935630817983, + "grad_norm": 2.5642802366939788, + "learning_rate": 1.9672576591712517e-05, + "loss": 0.4044, + "step": 1379 + }, + { + "epoch": 0.10932857991681522, + "grad_norm": 2.508676900501366, + "learning_rate": 1.9671925023615572e-05, + "loss": 0.3884, + "step": 1380 + }, + { + "epoch": 0.10940780352545058, + "grad_norm": 2.3718081104156945, + "learning_rate": 1.9671272818668906e-05, + "loss": 0.4117, + "step": 1381 + }, + { + "epoch": 0.10948702713408595, + "grad_norm": 2.849798967159578, + "learning_rate": 1.967061997691546e-05, + "loss": 0.3316, + "step": 1382 + }, + { + "epoch": 0.10956625074272133, + "grad_norm": 3.4316362850203928, + "learning_rate": 1.966996649839822e-05, + "loss": 0.4964, + "step": 1383 + }, + { + "epoch": 0.1096454743513567, + "grad_norm": 2.7490391210747593, + "learning_rate": 1.9669312383160217e-05, + "loss": 0.468, + "step": 1384 + }, + { + "epoch": 0.10972469795999208, + "grad_norm": 2.9271533233930738, + "learning_rate": 1.966865763124452e-05, + "loss": 0.5332, + "step": 1385 + }, + { + "epoch": 0.10980392156862745, + "grad_norm": 2.8859706306788584, + "learning_rate": 1.966800224269424e-05, + "loss": 0.4232, + "step": 1386 + }, + { + "epoch": 0.10988314517726283, + "grad_norm": 2.2786571318011406, + "learning_rate": 1.9667346217552528e-05, + "loss": 0.4454, + "step": 1387 + }, + { + "epoch": 0.1099623687858982, + "grad_norm": 3.1330257578407377, + "learning_rate": 1.9666689555862586e-05, + "loss": 0.4288, + "step": 1388 + }, + { + "epoch": 0.11004159239453357, + "grad_norm": 2.2193116635216708, + "learning_rate": 1.966603225766765e-05, + "loss": 0.3197, + "step": 1389 + }, + { + "epoch": 0.11012081600316895, + "grad_norm": 2.503762279641449, + "learning_rate": 1.9665374323011002e-05, + "loss": 0.4297, + "step": 1390 + }, + { + "epoch": 0.11020003961180432, + "grad_norm": 2.882519747928288, + "learning_rate": 1.9664715751935958e-05, + "loss": 0.3071, + "step": 1391 + }, + { + "epoch": 0.1102792632204397, + "grad_norm": 2.7560061587342037, + "learning_rate": 1.9664056544485887e-05, + "loss": 0.4895, + "step": 1392 + }, + { + "epoch": 0.11035848682907506, + "grad_norm": 3.029556398712317, + "learning_rate": 1.9663396700704195e-05, + "loss": 0.4292, + "step": 1393 + }, + { + "epoch": 0.11043771043771043, + "grad_norm": 2.5938390277030217, + "learning_rate": 1.9662736220634325e-05, + "loss": 0.395, + "step": 1394 + }, + { + "epoch": 0.11051693404634581, + "grad_norm": 2.8757926968452336, + "learning_rate": 1.966207510431977e-05, + "loss": 0.3962, + "step": 1395 + }, + { + "epoch": 0.11059615765498118, + "grad_norm": 2.9582498728870465, + "learning_rate": 1.966141335180406e-05, + "loss": 0.3903, + "step": 1396 + }, + { + "epoch": 0.11067538126361656, + "grad_norm": 2.559628817192077, + "learning_rate": 1.966075096313077e-05, + "loss": 0.4029, + "step": 1397 + }, + { + "epoch": 0.11075460487225193, + "grad_norm": 2.8927067368568204, + "learning_rate": 1.966008793834351e-05, + "loss": 0.5218, + "step": 1398 + }, + { + "epoch": 0.1108338284808873, + "grad_norm": 2.9418809327128352, + "learning_rate": 1.9659424277485943e-05, + "loss": 0.4159, + "step": 1399 + }, + { + "epoch": 0.11091305208952268, + "grad_norm": 2.4917968755479225, + "learning_rate": 1.9658759980601766e-05, + "loss": 0.3447, + "step": 1400 + }, + { + "epoch": 0.11099227569815805, + "grad_norm": 2.531783157451545, + "learning_rate": 1.9658095047734718e-05, + "loss": 0.4312, + "step": 1401 + }, + { + "epoch": 0.11107149930679343, + "grad_norm": 2.253388942254369, + "learning_rate": 1.965742947892858e-05, + "loss": 0.2864, + "step": 1402 + }, + { + "epoch": 0.1111507229154288, + "grad_norm": 3.0106414612026287, + "learning_rate": 1.9656763274227188e-05, + "loss": 0.3669, + "step": 1403 + }, + { + "epoch": 0.11122994652406418, + "grad_norm": 2.7566736990307357, + "learning_rate": 1.9656096433674393e-05, + "loss": 0.3761, + "step": 1404 + }, + { + "epoch": 0.11130917013269954, + "grad_norm": 2.815809055562609, + "learning_rate": 1.965542895731411e-05, + "loss": 0.4547, + "step": 1405 + }, + { + "epoch": 0.11138839374133491, + "grad_norm": 2.900163182335524, + "learning_rate": 1.965476084519029e-05, + "loss": 0.434, + "step": 1406 + }, + { + "epoch": 0.1114676173499703, + "grad_norm": 2.672358056016173, + "learning_rate": 1.9654092097346925e-05, + "loss": 0.475, + "step": 1407 + }, + { + "epoch": 0.11154684095860566, + "grad_norm": 2.764266974356445, + "learning_rate": 1.965342271382805e-05, + "loss": 0.4598, + "step": 1408 + }, + { + "epoch": 0.11162606456724104, + "grad_norm": 2.798055913816958, + "learning_rate": 1.9652752694677735e-05, + "loss": 0.3977, + "step": 1409 + }, + { + "epoch": 0.11170528817587641, + "grad_norm": 2.753817231232736, + "learning_rate": 1.9652082039940102e-05, + "loss": 0.3721, + "step": 1410 + }, + { + "epoch": 0.11178451178451178, + "grad_norm": 2.8710377716268978, + "learning_rate": 1.965141074965931e-05, + "loss": 0.542, + "step": 1411 + }, + { + "epoch": 0.11186373539314716, + "grad_norm": 2.8060099901292803, + "learning_rate": 1.965073882387956e-05, + "loss": 0.4729, + "step": 1412 + }, + { + "epoch": 0.11194295900178253, + "grad_norm": 2.628235033010763, + "learning_rate": 1.9650066262645097e-05, + "loss": 0.3152, + "step": 1413 + }, + { + "epoch": 0.11202218261041791, + "grad_norm": 2.4459879445174417, + "learning_rate": 1.96493930660002e-05, + "loss": 0.4459, + "step": 1414 + }, + { + "epoch": 0.11210140621905328, + "grad_norm": 2.963065102384047, + "learning_rate": 1.9648719233989202e-05, + "loss": 0.4337, + "step": 1415 + }, + { + "epoch": 0.11218062982768866, + "grad_norm": 3.2144249945801198, + "learning_rate": 1.9648044766656466e-05, + "loss": 0.4277, + "step": 1416 + }, + { + "epoch": 0.11225985343632403, + "grad_norm": 2.5725117331658365, + "learning_rate": 1.9647369664046407e-05, + "loss": 0.3982, + "step": 1417 + }, + { + "epoch": 0.11233907704495939, + "grad_norm": 2.9849598213319606, + "learning_rate": 1.9646693926203477e-05, + "loss": 0.5583, + "step": 1418 + }, + { + "epoch": 0.11241830065359477, + "grad_norm": 2.9912189161815195, + "learning_rate": 1.964601755317217e-05, + "loss": 0.3894, + "step": 1419 + }, + { + "epoch": 0.11249752426223014, + "grad_norm": 3.536987419440896, + "learning_rate": 1.9645340544997017e-05, + "loss": 0.4096, + "step": 1420 + }, + { + "epoch": 0.11257674787086552, + "grad_norm": 2.244945625177073, + "learning_rate": 1.9644662901722603e-05, + "loss": 0.4485, + "step": 1421 + }, + { + "epoch": 0.11265597147950089, + "grad_norm": 2.80778540466449, + "learning_rate": 1.9643984623393542e-05, + "loss": 0.3135, + "step": 1422 + }, + { + "epoch": 0.11273519508813626, + "grad_norm": 2.9179584183705236, + "learning_rate": 1.96433057100545e-05, + "loss": 0.4784, + "step": 1423 + }, + { + "epoch": 0.11281441869677164, + "grad_norm": 3.0869873158919185, + "learning_rate": 1.9642626161750176e-05, + "loss": 0.4835, + "step": 1424 + }, + { + "epoch": 0.11289364230540701, + "grad_norm": 3.3458155387685053, + "learning_rate": 1.9641945978525318e-05, + "loss": 0.3864, + "step": 1425 + }, + { + "epoch": 0.11297286591404239, + "grad_norm": 2.888818373390241, + "learning_rate": 1.9641265160424705e-05, + "loss": 0.4938, + "step": 1426 + }, + { + "epoch": 0.11305208952267776, + "grad_norm": 3.1532745532807893, + "learning_rate": 1.9640583707493176e-05, + "loss": 0.401, + "step": 1427 + }, + { + "epoch": 0.11313131313131314, + "grad_norm": 2.3329892328999984, + "learning_rate": 1.96399016197756e-05, + "loss": 0.3788, + "step": 1428 + }, + { + "epoch": 0.1132105367399485, + "grad_norm": 3.4879416768328237, + "learning_rate": 1.9639218897316885e-05, + "loss": 0.5169, + "step": 1429 + }, + { + "epoch": 0.11328976034858387, + "grad_norm": 2.799181262316917, + "learning_rate": 1.9638535540161988e-05, + "loss": 0.4039, + "step": 1430 + }, + { + "epoch": 0.11336898395721925, + "grad_norm": 2.6342776358790414, + "learning_rate": 1.96378515483559e-05, + "loss": 0.4094, + "step": 1431 + }, + { + "epoch": 0.11344820756585462, + "grad_norm": 3.1289878899691264, + "learning_rate": 1.9637166921943663e-05, + "loss": 0.4729, + "step": 1432 + }, + { + "epoch": 0.11352743117449, + "grad_norm": 2.573218788917751, + "learning_rate": 1.963648166097036e-05, + "loss": 0.3954, + "step": 1433 + }, + { + "epoch": 0.11360665478312537, + "grad_norm": 2.7620209010719012, + "learning_rate": 1.9635795765481102e-05, + "loss": 0.4299, + "step": 1434 + }, + { + "epoch": 0.11368587839176074, + "grad_norm": 2.626312333477079, + "learning_rate": 1.9635109235521057e-05, + "loss": 0.5158, + "step": 1435 + }, + { + "epoch": 0.11376510200039612, + "grad_norm": 3.2288689351127853, + "learning_rate": 1.963442207113543e-05, + "loss": 0.5113, + "step": 1436 + }, + { + "epoch": 0.11384432560903149, + "grad_norm": 2.47591228365586, + "learning_rate": 1.9633734272369473e-05, + "loss": 0.5116, + "step": 1437 + }, + { + "epoch": 0.11392354921766687, + "grad_norm": 2.8861142580119985, + "learning_rate": 1.9633045839268464e-05, + "loss": 0.415, + "step": 1438 + }, + { + "epoch": 0.11400277282630224, + "grad_norm": 2.420946925143882, + "learning_rate": 1.9632356771877735e-05, + "loss": 0.4016, + "step": 1439 + }, + { + "epoch": 0.1140819964349376, + "grad_norm": 2.4014167852764605, + "learning_rate": 1.9631667070242667e-05, + "loss": 0.402, + "step": 1440 + }, + { + "epoch": 0.11416122004357299, + "grad_norm": 2.9566102577274544, + "learning_rate": 1.963097673440866e-05, + "loss": 0.4598, + "step": 1441 + }, + { + "epoch": 0.11424044365220835, + "grad_norm": 2.7312277270750154, + "learning_rate": 1.9630285764421183e-05, + "loss": 0.4335, + "step": 1442 + }, + { + "epoch": 0.11431966726084374, + "grad_norm": 2.3040746735708577, + "learning_rate": 1.9629594160325725e-05, + "loss": 0.3639, + "step": 1443 + }, + { + "epoch": 0.1143988908694791, + "grad_norm": 2.620302294334293, + "learning_rate": 1.9628901922167823e-05, + "loss": 0.4921, + "step": 1444 + }, + { + "epoch": 0.11447811447811448, + "grad_norm": 2.911555999784591, + "learning_rate": 1.9628209049993064e-05, + "loss": 0.4379, + "step": 1445 + }, + { + "epoch": 0.11455733808674985, + "grad_norm": 2.405405100917414, + "learning_rate": 1.9627515543847068e-05, + "loss": 0.4453, + "step": 1446 + }, + { + "epoch": 0.11463656169538522, + "grad_norm": 2.9958855340989547, + "learning_rate": 1.9626821403775494e-05, + "loss": 0.3848, + "step": 1447 + }, + { + "epoch": 0.1147157853040206, + "grad_norm": 2.8002215684340372, + "learning_rate": 1.9626126629824056e-05, + "loss": 0.5275, + "step": 1448 + }, + { + "epoch": 0.11479500891265597, + "grad_norm": 2.4953018630760733, + "learning_rate": 1.9625431222038494e-05, + "loss": 0.4933, + "step": 1449 + }, + { + "epoch": 0.11487423252129135, + "grad_norm": 2.6107891181250578, + "learning_rate": 1.9624735180464602e-05, + "loss": 0.5438, + "step": 1450 + }, + { + "epoch": 0.11495345612992672, + "grad_norm": 2.687200084638096, + "learning_rate": 1.962403850514821e-05, + "loss": 0.5916, + "step": 1451 + }, + { + "epoch": 0.11503267973856209, + "grad_norm": 2.4708752560975555, + "learning_rate": 1.962334119613519e-05, + "loss": 0.3683, + "step": 1452 + }, + { + "epoch": 0.11511190334719747, + "grad_norm": 2.623431885926357, + "learning_rate": 1.9622643253471457e-05, + "loss": 0.3992, + "step": 1453 + }, + { + "epoch": 0.11519112695583283, + "grad_norm": 2.4053221775577085, + "learning_rate": 1.9621944677202966e-05, + "loss": 0.4795, + "step": 1454 + }, + { + "epoch": 0.11527035056446822, + "grad_norm": 2.173221512238794, + "learning_rate": 1.9621245467375715e-05, + "loss": 0.3732, + "step": 1455 + }, + { + "epoch": 0.11534957417310358, + "grad_norm": 3.1533565901733893, + "learning_rate": 1.9620545624035748e-05, + "loss": 0.6043, + "step": 1456 + }, + { + "epoch": 0.11542879778173896, + "grad_norm": 2.8528162712667067, + "learning_rate": 1.961984514722914e-05, + "loss": 0.3557, + "step": 1457 + }, + { + "epoch": 0.11550802139037433, + "grad_norm": 2.701074418393308, + "learning_rate": 1.9619144037002015e-05, + "loss": 0.3548, + "step": 1458 + }, + { + "epoch": 0.1155872449990097, + "grad_norm": 2.7501869065389246, + "learning_rate": 1.9618442293400544e-05, + "loss": 0.4194, + "step": 1459 + }, + { + "epoch": 0.11566646860764508, + "grad_norm": 2.410240153815325, + "learning_rate": 1.9617739916470926e-05, + "loss": 0.4575, + "step": 1460 + }, + { + "epoch": 0.11574569221628045, + "grad_norm": 3.6046265194177494, + "learning_rate": 1.9617036906259416e-05, + "loss": 0.5225, + "step": 1461 + }, + { + "epoch": 0.11582491582491583, + "grad_norm": 2.550163598794831, + "learning_rate": 1.9616333262812298e-05, + "loss": 0.4023, + "step": 1462 + }, + { + "epoch": 0.1159041394335512, + "grad_norm": 2.5970453273011374, + "learning_rate": 1.9615628986175902e-05, + "loss": 0.3944, + "step": 1463 + }, + { + "epoch": 0.11598336304218657, + "grad_norm": 2.808728434483951, + "learning_rate": 1.9614924076396605e-05, + "loss": 0.3999, + "step": 1464 + }, + { + "epoch": 0.11606258665082195, + "grad_norm": 2.173829958604981, + "learning_rate": 1.9614218533520827e-05, + "loss": 0.3606, + "step": 1465 + }, + { + "epoch": 0.11614181025945731, + "grad_norm": 2.624440721747477, + "learning_rate": 1.9613512357595014e-05, + "loss": 0.336, + "step": 1466 + }, + { + "epoch": 0.1162210338680927, + "grad_norm": 2.667638287547519, + "learning_rate": 1.9612805548665673e-05, + "loss": 0.4324, + "step": 1467 + }, + { + "epoch": 0.11630025747672806, + "grad_norm": 2.576089489561868, + "learning_rate": 1.961209810677934e-05, + "loss": 0.4475, + "step": 1468 + }, + { + "epoch": 0.11637948108536343, + "grad_norm": 2.370723185882638, + "learning_rate": 1.9611390031982595e-05, + "loss": 0.4733, + "step": 1469 + }, + { + "epoch": 0.11645870469399881, + "grad_norm": 2.4566831658223487, + "learning_rate": 1.9610681324322068e-05, + "loss": 0.3275, + "step": 1470 + }, + { + "epoch": 0.11653792830263418, + "grad_norm": 2.656934667342211, + "learning_rate": 1.9609971983844412e-05, + "loss": 0.4251, + "step": 1471 + }, + { + "epoch": 0.11661715191126956, + "grad_norm": 2.3176020192483584, + "learning_rate": 1.9609262010596346e-05, + "loss": 0.3214, + "step": 1472 + }, + { + "epoch": 0.11669637551990493, + "grad_norm": 3.0402397686574827, + "learning_rate": 1.9608551404624613e-05, + "loss": 0.442, + "step": 1473 + }, + { + "epoch": 0.11677559912854031, + "grad_norm": 2.7463527142119166, + "learning_rate": 1.9607840165976003e-05, + "loss": 0.4034, + "step": 1474 + }, + { + "epoch": 0.11685482273717568, + "grad_norm": 2.7781366558517346, + "learning_rate": 1.960712829469735e-05, + "loss": 0.4868, + "step": 1475 + }, + { + "epoch": 0.11693404634581105, + "grad_norm": 2.4108285495916157, + "learning_rate": 1.9606415790835523e-05, + "loss": 0.4523, + "step": 1476 + }, + { + "epoch": 0.11701326995444643, + "grad_norm": 2.5225314182628664, + "learning_rate": 1.9605702654437438e-05, + "loss": 0.3294, + "step": 1477 + }, + { + "epoch": 0.1170924935630818, + "grad_norm": 3.102802518620715, + "learning_rate": 1.9604988885550056e-05, + "loss": 0.3664, + "step": 1478 + }, + { + "epoch": 0.11717171717171718, + "grad_norm": 2.6643491304090507, + "learning_rate": 1.960427448422037e-05, + "loss": 0.4698, + "step": 1479 + }, + { + "epoch": 0.11725094078035254, + "grad_norm": 2.440914712785853, + "learning_rate": 1.9603559450495423e-05, + "loss": 0.3997, + "step": 1480 + }, + { + "epoch": 0.11733016438898791, + "grad_norm": 2.1709628354591963, + "learning_rate": 1.9602843784422297e-05, + "loss": 0.3029, + "step": 1481 + }, + { + "epoch": 0.1174093879976233, + "grad_norm": 2.938762120514386, + "learning_rate": 1.9602127486048112e-05, + "loss": 0.366, + "step": 1482 + }, + { + "epoch": 0.11748861160625866, + "grad_norm": 2.7088157343684562, + "learning_rate": 1.9601410555420035e-05, + "loss": 0.3394, + "step": 1483 + }, + { + "epoch": 0.11756783521489404, + "grad_norm": 2.7259827114603365, + "learning_rate": 1.9600692992585275e-05, + "loss": 0.3933, + "step": 1484 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 2.7765140482764608, + "learning_rate": 1.959997479759107e-05, + "loss": 0.3928, + "step": 1485 + }, + { + "epoch": 0.11772628243216479, + "grad_norm": 2.9128258085526997, + "learning_rate": 1.959925597048472e-05, + "loss": 0.4938, + "step": 1486 + }, + { + "epoch": 0.11780550604080016, + "grad_norm": 2.4718269763967533, + "learning_rate": 1.9598536511313553e-05, + "loss": 0.4091, + "step": 1487 + }, + { + "epoch": 0.11788472964943553, + "grad_norm": 2.6090546215990957, + "learning_rate": 1.9597816420124945e-05, + "loss": 0.5564, + "step": 1488 + }, + { + "epoch": 0.11796395325807091, + "grad_norm": 2.819158025579447, + "learning_rate": 1.95970956969663e-05, + "loss": 0.4879, + "step": 1489 + }, + { + "epoch": 0.11804317686670628, + "grad_norm": 2.4412115292987266, + "learning_rate": 1.9596374341885093e-05, + "loss": 0.4954, + "step": 1490 + }, + { + "epoch": 0.11812240047534166, + "grad_norm": 2.412572553373767, + "learning_rate": 1.95956523549288e-05, + "loss": 0.4117, + "step": 1491 + }, + { + "epoch": 0.11820162408397702, + "grad_norm": 2.686183097520435, + "learning_rate": 1.9594929736144978e-05, + "loss": 0.393, + "step": 1492 + }, + { + "epoch": 0.11828084769261239, + "grad_norm": 2.8484783873782673, + "learning_rate": 1.9594206485581196e-05, + "loss": 0.4983, + "step": 1493 + }, + { + "epoch": 0.11836007130124777, + "grad_norm": 2.156358279266858, + "learning_rate": 1.959348260328508e-05, + "loss": 0.3206, + "step": 1494 + }, + { + "epoch": 0.11843929490988314, + "grad_norm": 3.0164801487012616, + "learning_rate": 1.95927580893043e-05, + "loss": 0.543, + "step": 1495 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 3.0554858763887154, + "learning_rate": 1.9592032943686554e-05, + "loss": 0.4883, + "step": 1496 + }, + { + "epoch": 0.11859774212715389, + "grad_norm": 2.8308645228412845, + "learning_rate": 1.9591307166479595e-05, + "loss": 0.4305, + "step": 1497 + }, + { + "epoch": 0.11867696573578927, + "grad_norm": 2.9918225036819264, + "learning_rate": 1.959058075773121e-05, + "loss": 0.4614, + "step": 1498 + }, + { + "epoch": 0.11875618934442464, + "grad_norm": 2.5533857011462917, + "learning_rate": 1.9589853717489228e-05, + "loss": 0.4102, + "step": 1499 + }, + { + "epoch": 0.11883541295306001, + "grad_norm": 2.4311466966052957, + "learning_rate": 1.958912604580152e-05, + "loss": 0.3459, + "step": 1500 + }, + { + "epoch": 0.11891463656169539, + "grad_norm": 2.5739636174761595, + "learning_rate": 1.9588397742716004e-05, + "loss": 0.4279, + "step": 1501 + }, + { + "epoch": 0.11899386017033076, + "grad_norm": 2.9141249303258627, + "learning_rate": 1.9587668808280632e-05, + "loss": 0.5002, + "step": 1502 + }, + { + "epoch": 0.11907308377896614, + "grad_norm": 2.2176890890593217, + "learning_rate": 1.9586939242543402e-05, + "loss": 0.4051, + "step": 1503 + }, + { + "epoch": 0.1191523073876015, + "grad_norm": 2.6294707532205988, + "learning_rate": 1.9586209045552355e-05, + "loss": 0.3959, + "step": 1504 + }, + { + "epoch": 0.11923153099623687, + "grad_norm": 2.897063341641378, + "learning_rate": 1.9585478217355563e-05, + "loss": 0.4248, + "step": 1505 + }, + { + "epoch": 0.11931075460487225, + "grad_norm": 2.704944763166334, + "learning_rate": 1.9584746758001156e-05, + "loss": 0.5395, + "step": 1506 + }, + { + "epoch": 0.11938997821350762, + "grad_norm": 2.5999050620912527, + "learning_rate": 1.9584014667537293e-05, + "loss": 0.4858, + "step": 1507 + }, + { + "epoch": 0.119469201822143, + "grad_norm": 2.8910020273790193, + "learning_rate": 1.9583281946012183e-05, + "loss": 0.5208, + "step": 1508 + }, + { + "epoch": 0.11954842543077837, + "grad_norm": 2.632441685311661, + "learning_rate": 1.9582548593474064e-05, + "loss": 0.5518, + "step": 1509 + }, + { + "epoch": 0.11962764903941374, + "grad_norm": 2.436644670108184, + "learning_rate": 1.9581814609971232e-05, + "loss": 0.3156, + "step": 1510 + }, + { + "epoch": 0.11970687264804912, + "grad_norm": 2.5432163617969645, + "learning_rate": 1.958107999555201e-05, + "loss": 0.3777, + "step": 1511 + }, + { + "epoch": 0.11978609625668449, + "grad_norm": 2.8925782162268394, + "learning_rate": 1.958034475026477e-05, + "loss": 0.4169, + "step": 1512 + }, + { + "epoch": 0.11986531986531987, + "grad_norm": 2.6287630641747377, + "learning_rate": 1.957960887415793e-05, + "loss": 0.3504, + "step": 1513 + }, + { + "epoch": 0.11994454347395524, + "grad_norm": 2.454226161394671, + "learning_rate": 1.9578872367279937e-05, + "loss": 0.4155, + "step": 1514 + }, + { + "epoch": 0.12002376708259062, + "grad_norm": 3.0506713653642277, + "learning_rate": 1.957813522967929e-05, + "loss": 0.4063, + "step": 1515 + }, + { + "epoch": 0.12010299069122599, + "grad_norm": 2.5683044099764687, + "learning_rate": 1.9577397461404527e-05, + "loss": 0.277, + "step": 1516 + }, + { + "epoch": 0.12018221429986135, + "grad_norm": 2.972139986830908, + "learning_rate": 1.957665906250422e-05, + "loss": 0.5399, + "step": 1517 + }, + { + "epoch": 0.12026143790849673, + "grad_norm": 2.110434332360107, + "learning_rate": 1.9575920033027002e-05, + "loss": 0.3962, + "step": 1518 + }, + { + "epoch": 0.1203406615171321, + "grad_norm": 2.5344095372311695, + "learning_rate": 1.9575180373021516e-05, + "loss": 0.4118, + "step": 1519 + }, + { + "epoch": 0.12041988512576748, + "grad_norm": 2.4862295925655236, + "learning_rate": 1.9574440082536482e-05, + "loss": 0.3404, + "step": 1520 + }, + { + "epoch": 0.12049910873440285, + "grad_norm": 2.783541127818214, + "learning_rate": 1.9573699161620635e-05, + "loss": 0.489, + "step": 1521 + }, + { + "epoch": 0.12057833234303822, + "grad_norm": 2.5850462976897415, + "learning_rate": 1.9572957610322766e-05, + "loss": 0.2879, + "step": 1522 + }, + { + "epoch": 0.1206575559516736, + "grad_norm": 2.824212076990093, + "learning_rate": 1.95722154286917e-05, + "loss": 0.383, + "step": 1523 + }, + { + "epoch": 0.12073677956030897, + "grad_norm": 2.3777158598526493, + "learning_rate": 1.9571472616776304e-05, + "loss": 0.4038, + "step": 1524 + }, + { + "epoch": 0.12081600316894435, + "grad_norm": 3.14402479540086, + "learning_rate": 1.9570729174625493e-05, + "loss": 0.4047, + "step": 1525 + }, + { + "epoch": 0.12089522677757972, + "grad_norm": 2.613857130982444, + "learning_rate": 1.956998510228822e-05, + "loss": 0.3779, + "step": 1526 + }, + { + "epoch": 0.1209744503862151, + "grad_norm": 2.774024440056841, + "learning_rate": 1.956924039981347e-05, + "loss": 0.4627, + "step": 1527 + }, + { + "epoch": 0.12105367399485047, + "grad_norm": 3.0863234715010406, + "learning_rate": 1.956849506725029e-05, + "loss": 0.4621, + "step": 1528 + }, + { + "epoch": 0.12113289760348583, + "grad_norm": 2.6450002690119665, + "learning_rate": 1.9567749104647746e-05, + "loss": 0.4696, + "step": 1529 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 2.7470851536957377, + "learning_rate": 1.9567002512054964e-05, + "loss": 0.3729, + "step": 1530 + }, + { + "epoch": 0.12129134482075658, + "grad_norm": 2.684730202510088, + "learning_rate": 1.9566255289521096e-05, + "loss": 0.4945, + "step": 1531 + }, + { + "epoch": 0.12137056842939196, + "grad_norm": 2.2363869955224485, + "learning_rate": 1.956550743709535e-05, + "loss": 0.2615, + "step": 1532 + }, + { + "epoch": 0.12144979203802733, + "grad_norm": 2.9806460645100357, + "learning_rate": 1.9564758954826964e-05, + "loss": 0.3809, + "step": 1533 + }, + { + "epoch": 0.1215290156466627, + "grad_norm": 2.5849828693392407, + "learning_rate": 1.9564009842765225e-05, + "loss": 0.3779, + "step": 1534 + }, + { + "epoch": 0.12160823925529808, + "grad_norm": 2.643059762973339, + "learning_rate": 1.956326010095946e-05, + "loss": 0.5008, + "step": 1535 + }, + { + "epoch": 0.12168746286393345, + "grad_norm": 2.6165263974282422, + "learning_rate": 1.9562509729459024e-05, + "loss": 0.3935, + "step": 1536 + }, + { + "epoch": 0.12176668647256883, + "grad_norm": 2.753884796295939, + "learning_rate": 1.956175872831334e-05, + "loss": 0.5977, + "step": 1537 + }, + { + "epoch": 0.1218459100812042, + "grad_norm": 2.5690423000474807, + "learning_rate": 1.9561007097571853e-05, + "loss": 0.4264, + "step": 1538 + }, + { + "epoch": 0.12192513368983957, + "grad_norm": 2.6789834265427874, + "learning_rate": 1.9560254837284053e-05, + "loss": 0.3393, + "step": 1539 + }, + { + "epoch": 0.12200435729847495, + "grad_norm": 2.9376042031960847, + "learning_rate": 1.955950194749947e-05, + "loss": 0.4775, + "step": 1540 + }, + { + "epoch": 0.12208358090711031, + "grad_norm": 2.815447831132903, + "learning_rate": 1.9558748428267682e-05, + "loss": 0.4363, + "step": 1541 + }, + { + "epoch": 0.1221628045157457, + "grad_norm": 2.5318662855694347, + "learning_rate": 1.9557994279638307e-05, + "loss": 0.3841, + "step": 1542 + }, + { + "epoch": 0.12224202812438106, + "grad_norm": 2.4755649117929472, + "learning_rate": 1.9557239501660995e-05, + "loss": 0.4157, + "step": 1543 + }, + { + "epoch": 0.12232125173301645, + "grad_norm": 2.7443704456283284, + "learning_rate": 1.955648409438545e-05, + "loss": 0.4058, + "step": 1544 + }, + { + "epoch": 0.12240047534165181, + "grad_norm": 2.8926291368017867, + "learning_rate": 1.955572805786141e-05, + "loss": 0.4513, + "step": 1545 + }, + { + "epoch": 0.12247969895028718, + "grad_norm": 2.5296947102822434, + "learning_rate": 1.9554971392138655e-05, + "loss": 0.4033, + "step": 1546 + }, + { + "epoch": 0.12255892255892256, + "grad_norm": 3.137970896321621, + "learning_rate": 1.955421409726701e-05, + "loss": 0.4648, + "step": 1547 + }, + { + "epoch": 0.12263814616755793, + "grad_norm": 2.598940583766311, + "learning_rate": 1.9553456173296342e-05, + "loss": 0.5123, + "step": 1548 + }, + { + "epoch": 0.12271736977619331, + "grad_norm": 2.845098533377211, + "learning_rate": 1.9552697620276547e-05, + "loss": 0.4893, + "step": 1549 + }, + { + "epoch": 0.12279659338482868, + "grad_norm": 2.171640858053366, + "learning_rate": 1.9551938438257583e-05, + "loss": 0.2753, + "step": 1550 + }, + { + "epoch": 0.12287581699346405, + "grad_norm": 2.8101364938004822, + "learning_rate": 1.9551178627289436e-05, + "loss": 0.413, + "step": 1551 + }, + { + "epoch": 0.12295504060209943, + "grad_norm": 2.4082972044518405, + "learning_rate": 1.9550418187422127e-05, + "loss": 0.2552, + "step": 1552 + }, + { + "epoch": 0.1230342642107348, + "grad_norm": 2.979666631985347, + "learning_rate": 1.954965711870574e-05, + "loss": 0.4414, + "step": 1553 + }, + { + "epoch": 0.12311348781937018, + "grad_norm": 2.395588349726931, + "learning_rate": 1.954889542119038e-05, + "loss": 0.4616, + "step": 1554 + }, + { + "epoch": 0.12319271142800554, + "grad_norm": 2.7724609630018824, + "learning_rate": 1.9548133094926203e-05, + "loss": 0.3376, + "step": 1555 + }, + { + "epoch": 0.12327193503664093, + "grad_norm": 3.9366385584127586, + "learning_rate": 1.9547370139963406e-05, + "loss": 0.4891, + "step": 1556 + }, + { + "epoch": 0.1233511586452763, + "grad_norm": 2.9095013900698232, + "learning_rate": 1.954660655635222e-05, + "loss": 0.4781, + "step": 1557 + }, + { + "epoch": 0.12343038225391166, + "grad_norm": 3.298047953183691, + "learning_rate": 1.954584234414293e-05, + "loss": 0.5337, + "step": 1558 + }, + { + "epoch": 0.12350960586254704, + "grad_norm": 2.8024751242673442, + "learning_rate": 1.954507750338585e-05, + "loss": 0.3895, + "step": 1559 + }, + { + "epoch": 0.12358882947118241, + "grad_norm": 2.6821293799796058, + "learning_rate": 1.954431203413135e-05, + "loss": 0.4845, + "step": 1560 + }, + { + "epoch": 0.12366805307981779, + "grad_norm": 2.780344685065403, + "learning_rate": 1.9543545936429824e-05, + "loss": 0.4859, + "step": 1561 + }, + { + "epoch": 0.12374727668845316, + "grad_norm": 2.9414187280778115, + "learning_rate": 1.954277921033172e-05, + "loss": 0.5408, + "step": 1562 + }, + { + "epoch": 0.12382650029708853, + "grad_norm": 2.788911029571584, + "learning_rate": 1.954201185588752e-05, + "loss": 0.5358, + "step": 1563 + }, + { + "epoch": 0.12390572390572391, + "grad_norm": 2.6190080183801183, + "learning_rate": 1.9541243873147752e-05, + "loss": 0.3719, + "step": 1564 + }, + { + "epoch": 0.12398494751435928, + "grad_norm": 2.886293663927001, + "learning_rate": 1.9540475262162988e-05, + "loss": 0.6145, + "step": 1565 + }, + { + "epoch": 0.12406417112299466, + "grad_norm": 2.496323677689286, + "learning_rate": 1.9539706022983827e-05, + "loss": 0.3729, + "step": 1566 + }, + { + "epoch": 0.12414339473163002, + "grad_norm": 2.9856334209437323, + "learning_rate": 1.9538936155660934e-05, + "loss": 0.4992, + "step": 1567 + }, + { + "epoch": 0.12422261834026539, + "grad_norm": 2.4619775946089995, + "learning_rate": 1.953816566024499e-05, + "loss": 0.5855, + "step": 1568 + }, + { + "epoch": 0.12430184194890077, + "grad_norm": 2.9789060122626276, + "learning_rate": 1.9537394536786734e-05, + "loss": 0.4244, + "step": 1569 + }, + { + "epoch": 0.12438106555753614, + "grad_norm": 2.9107892936738478, + "learning_rate": 1.9536622785336936e-05, + "loss": 0.429, + "step": 1570 + }, + { + "epoch": 0.12446028916617152, + "grad_norm": 2.575065227488331, + "learning_rate": 1.953585040594642e-05, + "loss": 0.3026, + "step": 1571 + }, + { + "epoch": 0.12453951277480689, + "grad_norm": 3.122138835238216, + "learning_rate": 1.9535077398666034e-05, + "loss": 0.3784, + "step": 1572 + }, + { + "epoch": 0.12461873638344227, + "grad_norm": 2.742184685925629, + "learning_rate": 1.953430376354668e-05, + "loss": 0.3151, + "step": 1573 + }, + { + "epoch": 0.12469795999207764, + "grad_norm": 2.9297185263841867, + "learning_rate": 1.9533529500639302e-05, + "loss": 0.4937, + "step": 1574 + }, + { + "epoch": 0.12477718360071301, + "grad_norm": 2.4532738059092907, + "learning_rate": 1.9532754609994878e-05, + "loss": 0.2943, + "step": 1575 + }, + { + "epoch": 0.12485640720934839, + "grad_norm": 2.899836302875545, + "learning_rate": 1.953197909166443e-05, + "loss": 0.4488, + "step": 1576 + }, + { + "epoch": 0.12493563081798376, + "grad_norm": 2.4029920674779537, + "learning_rate": 1.9531202945699027e-05, + "loss": 0.2845, + "step": 1577 + }, + { + "epoch": 0.12501485442661914, + "grad_norm": 2.8428512910017134, + "learning_rate": 1.953042617214977e-05, + "loss": 0.3614, + "step": 1578 + }, + { + "epoch": 0.12509407803525452, + "grad_norm": 2.798713468090119, + "learning_rate": 1.9529648771067805e-05, + "loss": 0.3813, + "step": 1579 + }, + { + "epoch": 0.12517330164388987, + "grad_norm": 2.556467631821324, + "learning_rate": 1.9528870742504328e-05, + "loss": 0.3817, + "step": 1580 + }, + { + "epoch": 0.12525252525252525, + "grad_norm": 3.1781166725157557, + "learning_rate": 1.9528092086510556e-05, + "loss": 0.4631, + "step": 1581 + }, + { + "epoch": 0.12533174886116064, + "grad_norm": 2.9061494862759623, + "learning_rate": 1.9527312803137767e-05, + "loss": 0.4058, + "step": 1582 + }, + { + "epoch": 0.125410972469796, + "grad_norm": 2.8860317624467933, + "learning_rate": 1.9526532892437275e-05, + "loss": 0.5292, + "step": 1583 + }, + { + "epoch": 0.12549019607843137, + "grad_norm": 3.1236873627040262, + "learning_rate": 1.9525752354460433e-05, + "loss": 0.542, + "step": 1584 + }, + { + "epoch": 0.12556941968706675, + "grad_norm": 2.779541996887809, + "learning_rate": 1.9524971189258627e-05, + "loss": 0.4801, + "step": 1585 + }, + { + "epoch": 0.1256486432957021, + "grad_norm": 2.9289218259670027, + "learning_rate": 1.9524189396883307e-05, + "loss": 0.3938, + "step": 1586 + }, + { + "epoch": 0.1257278669043375, + "grad_norm": 2.745061426545248, + "learning_rate": 1.9523406977385937e-05, + "loss": 0.4475, + "step": 1587 + }, + { + "epoch": 0.12580709051297287, + "grad_norm": 2.036368753501112, + "learning_rate": 1.9522623930818043e-05, + "loss": 0.3534, + "step": 1588 + }, + { + "epoch": 0.12588631412160825, + "grad_norm": 2.6342558060261028, + "learning_rate": 1.9521840257231183e-05, + "loss": 0.4457, + "step": 1589 + }, + { + "epoch": 0.1259655377302436, + "grad_norm": 2.7610086518946564, + "learning_rate": 1.9521055956676956e-05, + "loss": 0.4416, + "step": 1590 + }, + { + "epoch": 0.12604476133887899, + "grad_norm": 2.237689734924548, + "learning_rate": 1.9520271029207008e-05, + "loss": 0.3523, + "step": 1591 + }, + { + "epoch": 0.12612398494751437, + "grad_norm": 2.6380819504247723, + "learning_rate": 1.9519485474873027e-05, + "loss": 0.4576, + "step": 1592 + }, + { + "epoch": 0.12620320855614972, + "grad_norm": 2.599429333634746, + "learning_rate": 1.9518699293726727e-05, + "loss": 0.4203, + "step": 1593 + }, + { + "epoch": 0.1262824321647851, + "grad_norm": 2.9331156090809025, + "learning_rate": 1.9517912485819878e-05, + "loss": 0.3572, + "step": 1594 + }, + { + "epoch": 0.12636165577342048, + "grad_norm": 2.6930673687534266, + "learning_rate": 1.9517125051204292e-05, + "loss": 0.4401, + "step": 1595 + }, + { + "epoch": 0.12644087938205587, + "grad_norm": 2.9577575468001593, + "learning_rate": 1.9516336989931813e-05, + "loss": 0.5192, + "step": 1596 + }, + { + "epoch": 0.12652010299069122, + "grad_norm": 3.1088600669992417, + "learning_rate": 1.9515548302054335e-05, + "loss": 0.463, + "step": 1597 + }, + { + "epoch": 0.1265993265993266, + "grad_norm": 2.2750800320333333, + "learning_rate": 1.9514758987623784e-05, + "loss": 0.2897, + "step": 1598 + }, + { + "epoch": 0.12667855020796198, + "grad_norm": 2.7515800863371873, + "learning_rate": 1.9513969046692137e-05, + "loss": 0.4902, + "step": 1599 + }, + { + "epoch": 0.12675777381659734, + "grad_norm": 3.2977570850013342, + "learning_rate": 1.951317847931141e-05, + "loss": 0.4044, + "step": 1600 + }, + { + "epoch": 0.12683699742523272, + "grad_norm": 2.928104060953201, + "learning_rate": 1.9512387285533655e-05, + "loss": 0.3933, + "step": 1601 + }, + { + "epoch": 0.1269162210338681, + "grad_norm": 3.2747184680547807, + "learning_rate": 1.951159546541096e-05, + "loss": 0.5765, + "step": 1602 + }, + { + "epoch": 0.12699544464250345, + "grad_norm": 2.3706795108547953, + "learning_rate": 1.9510803018995477e-05, + "loss": 0.3136, + "step": 1603 + }, + { + "epoch": 0.12707466825113883, + "grad_norm": 2.9030221606700337, + "learning_rate": 1.9510009946339377e-05, + "loss": 0.4393, + "step": 1604 + }, + { + "epoch": 0.12715389185977422, + "grad_norm": 2.3529714777259283, + "learning_rate": 1.9509216247494882e-05, + "loss": 0.3389, + "step": 1605 + }, + { + "epoch": 0.1272331154684096, + "grad_norm": 3.25468941664847, + "learning_rate": 1.950842192251425e-05, + "loss": 0.4556, + "step": 1606 + }, + { + "epoch": 0.12731233907704495, + "grad_norm": 2.170992913070096, + "learning_rate": 1.950762697144979e-05, + "loss": 0.332, + "step": 1607 + }, + { + "epoch": 0.12739156268568033, + "grad_norm": 2.2553042551492033, + "learning_rate": 1.950683139435384e-05, + "loss": 0.2714, + "step": 1608 + }, + { + "epoch": 0.1274707862943157, + "grad_norm": 2.9878863394480524, + "learning_rate": 1.9506035191278784e-05, + "loss": 0.5143, + "step": 1609 + }, + { + "epoch": 0.12755000990295107, + "grad_norm": 2.3776207347682705, + "learning_rate": 1.9505238362277054e-05, + "loss": 0.3995, + "step": 1610 + }, + { + "epoch": 0.12762923351158645, + "grad_norm": 2.510010197043507, + "learning_rate": 1.9504440907401113e-05, + "loss": 0.3674, + "step": 1611 + }, + { + "epoch": 0.12770845712022183, + "grad_norm": 2.5749818492694234, + "learning_rate": 1.9503642826703468e-05, + "loss": 0.3398, + "step": 1612 + }, + { + "epoch": 0.1277876807288572, + "grad_norm": 2.403313331210437, + "learning_rate": 1.950284412023668e-05, + "loss": 0.3517, + "step": 1613 + }, + { + "epoch": 0.12786690433749257, + "grad_norm": 3.0503242942353617, + "learning_rate": 1.9502044788053322e-05, + "loss": 0.4798, + "step": 1614 + }, + { + "epoch": 0.12794612794612795, + "grad_norm": 2.7721261153916323, + "learning_rate": 1.9501244830206037e-05, + "loss": 0.4057, + "step": 1615 + }, + { + "epoch": 0.12802535155476333, + "grad_norm": 2.8444677105161924, + "learning_rate": 1.9500444246747502e-05, + "loss": 0.3998, + "step": 1616 + }, + { + "epoch": 0.12810457516339868, + "grad_norm": 2.6802089568829945, + "learning_rate": 1.9499643037730422e-05, + "loss": 0.409, + "step": 1617 + }, + { + "epoch": 0.12818379877203406, + "grad_norm": 2.8102940931801452, + "learning_rate": 1.949884120320756e-05, + "loss": 0.3982, + "step": 1618 + }, + { + "epoch": 0.12826302238066944, + "grad_norm": 3.231899657674601, + "learning_rate": 1.949803874323171e-05, + "loss": 0.3676, + "step": 1619 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 2.92260270105415, + "learning_rate": 1.949723565785571e-05, + "loss": 0.4368, + "step": 1620 + }, + { + "epoch": 0.12842146959794018, + "grad_norm": 2.5592811402494506, + "learning_rate": 1.9496431947132438e-05, + "loss": 0.4609, + "step": 1621 + }, + { + "epoch": 0.12850069320657556, + "grad_norm": 2.6221869318504423, + "learning_rate": 1.9495627611114817e-05, + "loss": 0.3139, + "step": 1622 + }, + { + "epoch": 0.12857991681521094, + "grad_norm": 3.663064416554614, + "learning_rate": 1.949482264985581e-05, + "loss": 0.5551, + "step": 1623 + }, + { + "epoch": 0.1286591404238463, + "grad_norm": 2.67366172507629, + "learning_rate": 1.9494017063408415e-05, + "loss": 0.4868, + "step": 1624 + }, + { + "epoch": 0.12873836403248168, + "grad_norm": 2.689543415717479, + "learning_rate": 1.9493210851825682e-05, + "loss": 0.3989, + "step": 1625 + }, + { + "epoch": 0.12881758764111706, + "grad_norm": 3.3765266523424255, + "learning_rate": 1.949240401516069e-05, + "loss": 0.4312, + "step": 1626 + }, + { + "epoch": 0.1288968112497524, + "grad_norm": 2.1715257505230205, + "learning_rate": 1.9491596553466568e-05, + "loss": 0.3475, + "step": 1627 + }, + { + "epoch": 0.1289760348583878, + "grad_norm": 2.6979449124694304, + "learning_rate": 1.9490788466796483e-05, + "loss": 0.4145, + "step": 1628 + }, + { + "epoch": 0.12905525846702318, + "grad_norm": 2.2746830664393527, + "learning_rate": 1.9489979755203646e-05, + "loss": 0.3342, + "step": 1629 + }, + { + "epoch": 0.12913448207565856, + "grad_norm": 2.648442571428701, + "learning_rate": 1.9489170418741306e-05, + "loss": 0.4784, + "step": 1630 + }, + { + "epoch": 0.1292137056842939, + "grad_norm": 2.603482192866092, + "learning_rate": 1.948836045746275e-05, + "loss": 0.3643, + "step": 1631 + }, + { + "epoch": 0.1292929292929293, + "grad_norm": 2.7804665993177875, + "learning_rate": 1.9487549871421316e-05, + "loss": 0.4675, + "step": 1632 + }, + { + "epoch": 0.12937215290156467, + "grad_norm": 3.347643506302992, + "learning_rate": 1.9486738660670373e-05, + "loss": 0.4955, + "step": 1633 + }, + { + "epoch": 0.12945137651020003, + "grad_norm": 2.382435796668833, + "learning_rate": 1.9485926825263334e-05, + "loss": 0.3684, + "step": 1634 + }, + { + "epoch": 0.1295306001188354, + "grad_norm": 2.8124658289034428, + "learning_rate": 1.948511436525366e-05, + "loss": 0.4708, + "step": 1635 + }, + { + "epoch": 0.1296098237274708, + "grad_norm": 2.397738474960787, + "learning_rate": 1.9484301280694845e-05, + "loss": 0.3054, + "step": 1636 + }, + { + "epoch": 0.12968904733610617, + "grad_norm": 2.901551190259369, + "learning_rate": 1.9483487571640424e-05, + "loss": 0.5474, + "step": 1637 + }, + { + "epoch": 0.12976827094474153, + "grad_norm": 2.511534172202647, + "learning_rate": 1.948267323814398e-05, + "loss": 0.3418, + "step": 1638 + }, + { + "epoch": 0.1298474945533769, + "grad_norm": 2.636660392767567, + "learning_rate": 1.948185828025913e-05, + "loss": 0.3109, + "step": 1639 + }, + { + "epoch": 0.1299267181620123, + "grad_norm": 2.4929883609443038, + "learning_rate": 1.9481042698039534e-05, + "loss": 0.389, + "step": 1640 + }, + { + "epoch": 0.13000594177064764, + "grad_norm": 2.4043485541317664, + "learning_rate": 1.94802264915389e-05, + "loss": 0.3345, + "step": 1641 + }, + { + "epoch": 0.13008516537928302, + "grad_norm": 3.0446967598026498, + "learning_rate": 1.9479409660810965e-05, + "loss": 0.4212, + "step": 1642 + }, + { + "epoch": 0.1301643889879184, + "grad_norm": 2.4660348140404094, + "learning_rate": 1.9478592205909517e-05, + "loss": 0.3405, + "step": 1643 + }, + { + "epoch": 0.13024361259655376, + "grad_norm": 2.9326105002394227, + "learning_rate": 1.947777412688838e-05, + "loss": 0.3936, + "step": 1644 + }, + { + "epoch": 0.13032283620518914, + "grad_norm": 3.3642632036398368, + "learning_rate": 1.947695542380142e-05, + "loss": 0.3913, + "step": 1645 + }, + { + "epoch": 0.13040205981382452, + "grad_norm": 2.902126061765541, + "learning_rate": 1.9476136096702546e-05, + "loss": 0.3567, + "step": 1646 + }, + { + "epoch": 0.1304812834224599, + "grad_norm": 3.505122731530886, + "learning_rate": 1.9475316145645706e-05, + "loss": 0.4399, + "step": 1647 + }, + { + "epoch": 0.13056050703109526, + "grad_norm": 3.362321420296482, + "learning_rate": 1.947449557068489e-05, + "loss": 0.3739, + "step": 1648 + }, + { + "epoch": 0.13063973063973064, + "grad_norm": 2.7043004382627194, + "learning_rate": 1.947367437187413e-05, + "loss": 0.4691, + "step": 1649 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 3.09994203153213, + "learning_rate": 1.9472852549267496e-05, + "loss": 0.484, + "step": 1650 + }, + { + "epoch": 0.13079817785700137, + "grad_norm": 3.3495132778616186, + "learning_rate": 1.9472030102919102e-05, + "loss": 0.4564, + "step": 1651 + }, + { + "epoch": 0.13087740146563676, + "grad_norm": 2.334095499135991, + "learning_rate": 1.9471207032883103e-05, + "loss": 0.4249, + "step": 1652 + }, + { + "epoch": 0.13095662507427214, + "grad_norm": 2.5976780141713265, + "learning_rate": 1.9470383339213693e-05, + "loss": 0.4537, + "step": 1653 + }, + { + "epoch": 0.13103584868290752, + "grad_norm": 2.855276637433223, + "learning_rate": 1.946955902196511e-05, + "loss": 0.3884, + "step": 1654 + }, + { + "epoch": 0.13111507229154287, + "grad_norm": 2.2793206621657167, + "learning_rate": 1.9468734081191627e-05, + "loss": 0.3887, + "step": 1655 + }, + { + "epoch": 0.13119429590017825, + "grad_norm": 2.7978508259665733, + "learning_rate": 1.9467908516947568e-05, + "loss": 0.5249, + "step": 1656 + }, + { + "epoch": 0.13127351950881364, + "grad_norm": 2.9427997156011787, + "learning_rate": 1.946708232928729e-05, + "loss": 0.459, + "step": 1657 + }, + { + "epoch": 0.131352743117449, + "grad_norm": 2.577970086491782, + "learning_rate": 1.9466255518265193e-05, + "loss": 0.4381, + "step": 1658 + }, + { + "epoch": 0.13143196672608437, + "grad_norm": 3.2167569523486668, + "learning_rate": 1.946542808393572e-05, + "loss": 0.4795, + "step": 1659 + }, + { + "epoch": 0.13151119033471975, + "grad_norm": 2.842328280690265, + "learning_rate": 1.946460002635335e-05, + "loss": 0.401, + "step": 1660 + }, + { + "epoch": 0.13159041394335513, + "grad_norm": 2.381339253336541, + "learning_rate": 1.946377134557261e-05, + "loss": 0.4599, + "step": 1661 + }, + { + "epoch": 0.1316696375519905, + "grad_norm": 2.4829991752808667, + "learning_rate": 1.9462942041648062e-05, + "loss": 0.343, + "step": 1662 + }, + { + "epoch": 0.13174886116062587, + "grad_norm": 2.564416558503891, + "learning_rate": 1.9462112114634316e-05, + "loss": 0.4138, + "step": 1663 + }, + { + "epoch": 0.13182808476926125, + "grad_norm": 2.587157250168123, + "learning_rate": 1.9461281564586014e-05, + "loss": 0.3455, + "step": 1664 + }, + { + "epoch": 0.1319073083778966, + "grad_norm": 2.2235393603415066, + "learning_rate": 1.9460450391557847e-05, + "loss": 0.3886, + "step": 1665 + }, + { + "epoch": 0.13198653198653199, + "grad_norm": 2.272000375557651, + "learning_rate": 1.945961859560454e-05, + "loss": 0.3728, + "step": 1666 + }, + { + "epoch": 0.13206575559516737, + "grad_norm": 2.737644215712852, + "learning_rate": 1.9458786176780868e-05, + "loss": 0.4289, + "step": 1667 + }, + { + "epoch": 0.13214497920380272, + "grad_norm": 2.477519689542186, + "learning_rate": 1.945795313514164e-05, + "loss": 0.3687, + "step": 1668 + }, + { + "epoch": 0.1322242028124381, + "grad_norm": 3.315891742066584, + "learning_rate": 1.9457119470741707e-05, + "loss": 0.4817, + "step": 1669 + }, + { + "epoch": 0.13230342642107348, + "grad_norm": 3.751583010646154, + "learning_rate": 1.9456285183635958e-05, + "loss": 0.4694, + "step": 1670 + }, + { + "epoch": 0.13238265002970886, + "grad_norm": 2.4207879338478726, + "learning_rate": 1.9455450273879332e-05, + "loss": 0.3309, + "step": 1671 + }, + { + "epoch": 0.13246187363834422, + "grad_norm": 3.62041416957926, + "learning_rate": 1.94546147415268e-05, + "loss": 0.4837, + "step": 1672 + }, + { + "epoch": 0.1325410972469796, + "grad_norm": 2.518393835869722, + "learning_rate": 1.9453778586633386e-05, + "loss": 0.4374, + "step": 1673 + }, + { + "epoch": 0.13262032085561498, + "grad_norm": 2.972761889764143, + "learning_rate": 1.9452941809254136e-05, + "loss": 0.515, + "step": 1674 + }, + { + "epoch": 0.13269954446425034, + "grad_norm": 2.4852762300429463, + "learning_rate": 1.9452104409444153e-05, + "loss": 0.4269, + "step": 1675 + }, + { + "epoch": 0.13277876807288572, + "grad_norm": 2.691225002242367, + "learning_rate": 1.9451266387258576e-05, + "loss": 0.3798, + "step": 1676 + }, + { + "epoch": 0.1328579916815211, + "grad_norm": 2.66315078919408, + "learning_rate": 1.9450427742752583e-05, + "loss": 0.444, + "step": 1677 + }, + { + "epoch": 0.13293721529015648, + "grad_norm": 2.172922479297124, + "learning_rate": 1.9449588475981394e-05, + "loss": 0.4054, + "step": 1678 + }, + { + "epoch": 0.13301643889879183, + "grad_norm": 2.3766279143702116, + "learning_rate": 1.9448748587000277e-05, + "loss": 0.3742, + "step": 1679 + }, + { + "epoch": 0.13309566250742721, + "grad_norm": 2.659481511765234, + "learning_rate": 1.944790807586453e-05, + "loss": 0.4284, + "step": 1680 + }, + { + "epoch": 0.1331748861160626, + "grad_norm": 2.403462541581427, + "learning_rate": 1.9447066942629495e-05, + "loss": 0.2833, + "step": 1681 + }, + { + "epoch": 0.13325410972469795, + "grad_norm": 2.4337194417494956, + "learning_rate": 1.9446225187350558e-05, + "loss": 0.4756, + "step": 1682 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 2.6078413867928507, + "learning_rate": 1.9445382810083143e-05, + "loss": 0.4559, + "step": 1683 + }, + { + "epoch": 0.1334125569419687, + "grad_norm": 1.9835587571797169, + "learning_rate": 1.944453981088272e-05, + "loss": 0.3094, + "step": 1684 + }, + { + "epoch": 0.13349178055060407, + "grad_norm": 2.356588714443372, + "learning_rate": 1.9443696189804793e-05, + "loss": 0.4032, + "step": 1685 + }, + { + "epoch": 0.13357100415923945, + "grad_norm": 2.2430736075244946, + "learning_rate": 1.9442851946904914e-05, + "loss": 0.3803, + "step": 1686 + }, + { + "epoch": 0.13365022776787483, + "grad_norm": 3.0731427134840255, + "learning_rate": 1.9442007082238673e-05, + "loss": 0.3625, + "step": 1687 + }, + { + "epoch": 0.1337294513765102, + "grad_norm": 2.3297474213002585, + "learning_rate": 1.944116159586169e-05, + "loss": 0.3436, + "step": 1688 + }, + { + "epoch": 0.13380867498514556, + "grad_norm": 2.431890377062831, + "learning_rate": 1.944031548782965e-05, + "loss": 0.3478, + "step": 1689 + }, + { + "epoch": 0.13388789859378095, + "grad_norm": 2.849904722734544, + "learning_rate": 1.9439468758198258e-05, + "loss": 0.352, + "step": 1690 + }, + { + "epoch": 0.13396712220241633, + "grad_norm": 3.39934604143279, + "learning_rate": 1.943862140702327e-05, + "loss": 0.3997, + "step": 1691 + }, + { + "epoch": 0.13404634581105168, + "grad_norm": 2.562701418400086, + "learning_rate": 1.9437773434360476e-05, + "loss": 0.3949, + "step": 1692 + }, + { + "epoch": 0.13412556941968706, + "grad_norm": 2.909763695066894, + "learning_rate": 1.943692484026571e-05, + "loss": 0.4224, + "step": 1693 + }, + { + "epoch": 0.13420479302832244, + "grad_norm": 3.0401964946470614, + "learning_rate": 1.9436075624794853e-05, + "loss": 0.3896, + "step": 1694 + }, + { + "epoch": 0.13428401663695783, + "grad_norm": 3.884386828365048, + "learning_rate": 1.9435225788003822e-05, + "loss": 0.4134, + "step": 1695 + }, + { + "epoch": 0.13436324024559318, + "grad_norm": 2.933286761944894, + "learning_rate": 1.943437532994857e-05, + "loss": 0.4776, + "step": 1696 + }, + { + "epoch": 0.13444246385422856, + "grad_norm": 2.787306440849998, + "learning_rate": 1.9433524250685098e-05, + "loss": 0.5181, + "step": 1697 + }, + { + "epoch": 0.13452168746286394, + "grad_norm": 2.509240646798951, + "learning_rate": 1.9432672550269446e-05, + "loss": 0.3262, + "step": 1698 + }, + { + "epoch": 0.1346009110714993, + "grad_norm": 2.7167072597122974, + "learning_rate": 1.943182022875769e-05, + "loss": 0.3762, + "step": 1699 + }, + { + "epoch": 0.13468013468013468, + "grad_norm": 2.669643460412162, + "learning_rate": 1.9430967286205962e-05, + "loss": 0.3957, + "step": 1700 + }, + { + "epoch": 0.13475935828877006, + "grad_norm": 2.4270850822620114, + "learning_rate": 1.9430113722670412e-05, + "loss": 0.3467, + "step": 1701 + }, + { + "epoch": 0.13483858189740544, + "grad_norm": 2.7138244790146815, + "learning_rate": 1.942925953820725e-05, + "loss": 0.403, + "step": 1702 + }, + { + "epoch": 0.1349178055060408, + "grad_norm": 3.0766988709084515, + "learning_rate": 1.9428404732872716e-05, + "loss": 0.3518, + "step": 1703 + }, + { + "epoch": 0.13499702911467618, + "grad_norm": 2.9382266391598213, + "learning_rate": 1.94275493067231e-05, + "loss": 0.5382, + "step": 1704 + }, + { + "epoch": 0.13507625272331156, + "grad_norm": 2.9597048824604255, + "learning_rate": 1.9426693259814725e-05, + "loss": 0.4127, + "step": 1705 + }, + { + "epoch": 0.1351554763319469, + "grad_norm": 2.5322215454521766, + "learning_rate": 1.9425836592203954e-05, + "loss": 0.3423, + "step": 1706 + }, + { + "epoch": 0.1352346999405823, + "grad_norm": 2.7049015822029037, + "learning_rate": 1.94249793039472e-05, + "loss": 0.4701, + "step": 1707 + }, + { + "epoch": 0.13531392354921767, + "grad_norm": 2.4246078231505903, + "learning_rate": 1.9424121395100907e-05, + "loss": 0.3667, + "step": 1708 + }, + { + "epoch": 0.13539314715785303, + "grad_norm": 2.6928624533661862, + "learning_rate": 1.9423262865721567e-05, + "loss": 0.3661, + "step": 1709 + }, + { + "epoch": 0.1354723707664884, + "grad_norm": 2.6926214334991236, + "learning_rate": 1.9422403715865708e-05, + "loss": 0.4549, + "step": 1710 + }, + { + "epoch": 0.1355515943751238, + "grad_norm": 2.8664509011884736, + "learning_rate": 1.9421543945589904e-05, + "loss": 0.3646, + "step": 1711 + }, + { + "epoch": 0.13563081798375917, + "grad_norm": 2.5495186171854427, + "learning_rate": 1.9420683554950765e-05, + "loss": 0.368, + "step": 1712 + }, + { + "epoch": 0.13571004159239453, + "grad_norm": 3.2178015112401224, + "learning_rate": 1.9419822544004942e-05, + "loss": 0.5219, + "step": 1713 + }, + { + "epoch": 0.1357892652010299, + "grad_norm": 3.280884238267083, + "learning_rate": 1.941896091280913e-05, + "loss": 0.5009, + "step": 1714 + }, + { + "epoch": 0.1358684888096653, + "grad_norm": 2.826399007686927, + "learning_rate": 1.9418098661420064e-05, + "loss": 0.413, + "step": 1715 + }, + { + "epoch": 0.13594771241830064, + "grad_norm": 2.3069129821670855, + "learning_rate": 1.9417235789894517e-05, + "loss": 0.576, + "step": 1716 + }, + { + "epoch": 0.13602693602693602, + "grad_norm": 2.559751687671898, + "learning_rate": 1.9416372298289306e-05, + "loss": 0.4126, + "step": 1717 + }, + { + "epoch": 0.1361061596355714, + "grad_norm": 2.4776879979271964, + "learning_rate": 1.941550818666129e-05, + "loss": 0.3482, + "step": 1718 + }, + { + "epoch": 0.1361853832442068, + "grad_norm": 2.8479681770747467, + "learning_rate": 1.941464345506736e-05, + "loss": 0.4583, + "step": 1719 + }, + { + "epoch": 0.13626460685284214, + "grad_norm": 2.1690098167702314, + "learning_rate": 1.9413778103564462e-05, + "loss": 0.5257, + "step": 1720 + }, + { + "epoch": 0.13634383046147752, + "grad_norm": 2.455941090465394, + "learning_rate": 1.9412912132209573e-05, + "loss": 0.4485, + "step": 1721 + }, + { + "epoch": 0.1364230540701129, + "grad_norm": 2.5830636468674864, + "learning_rate": 1.941204554105971e-05, + "loss": 0.3915, + "step": 1722 + }, + { + "epoch": 0.13650227767874826, + "grad_norm": 2.376180009234982, + "learning_rate": 1.941117833017194e-05, + "loss": 0.3951, + "step": 1723 + }, + { + "epoch": 0.13658150128738364, + "grad_norm": 2.6244711090192974, + "learning_rate": 1.9410310499603356e-05, + "loss": 0.4578, + "step": 1724 + }, + { + "epoch": 0.13666072489601902, + "grad_norm": 2.261573384408937, + "learning_rate": 1.9409442049411104e-05, + "loss": 0.306, + "step": 1725 + }, + { + "epoch": 0.13673994850465437, + "grad_norm": 2.8011175232833274, + "learning_rate": 1.9408572979652373e-05, + "loss": 0.4511, + "step": 1726 + }, + { + "epoch": 0.13681917211328976, + "grad_norm": 2.748409319210236, + "learning_rate": 1.940770329038438e-05, + "loss": 0.3832, + "step": 1727 + }, + { + "epoch": 0.13689839572192514, + "grad_norm": 2.6968365789827042, + "learning_rate": 1.9406832981664392e-05, + "loss": 0.3206, + "step": 1728 + }, + { + "epoch": 0.13697761933056052, + "grad_norm": 2.420111617525425, + "learning_rate": 1.9405962053549717e-05, + "loss": 0.4374, + "step": 1729 + }, + { + "epoch": 0.13705684293919587, + "grad_norm": 2.6371164388381176, + "learning_rate": 1.9405090506097698e-05, + "loss": 0.4315, + "step": 1730 + }, + { + "epoch": 0.13713606654783125, + "grad_norm": 2.547212877630584, + "learning_rate": 1.9404218339365724e-05, + "loss": 0.3484, + "step": 1731 + }, + { + "epoch": 0.13721529015646663, + "grad_norm": 3.5284779174582153, + "learning_rate": 1.940334555341122e-05, + "loss": 0.4274, + "step": 1732 + }, + { + "epoch": 0.137294513765102, + "grad_norm": 2.5627729261977255, + "learning_rate": 1.940247214829166e-05, + "loss": 0.3342, + "step": 1733 + }, + { + "epoch": 0.13737373737373737, + "grad_norm": 2.909823770122146, + "learning_rate": 1.9401598124064552e-05, + "loss": 0.4361, + "step": 1734 + }, + { + "epoch": 0.13745296098237275, + "grad_norm": 2.3163164996527628, + "learning_rate": 1.9400723480787446e-05, + "loss": 0.3918, + "step": 1735 + }, + { + "epoch": 0.13753218459100813, + "grad_norm": 2.3744330735184627, + "learning_rate": 1.9399848218517927e-05, + "loss": 0.3415, + "step": 1736 + }, + { + "epoch": 0.1376114081996435, + "grad_norm": 2.0240339542048384, + "learning_rate": 1.9398972337313634e-05, + "loss": 0.3365, + "step": 1737 + }, + { + "epoch": 0.13769063180827887, + "grad_norm": 2.1905630441425576, + "learning_rate": 1.939809583723224e-05, + "loss": 0.2921, + "step": 1738 + }, + { + "epoch": 0.13776985541691425, + "grad_norm": 2.8982878046600637, + "learning_rate": 1.9397218718331455e-05, + "loss": 0.5758, + "step": 1739 + }, + { + "epoch": 0.1378490790255496, + "grad_norm": 2.325300339897712, + "learning_rate": 1.939634098066903e-05, + "loss": 0.4574, + "step": 1740 + }, + { + "epoch": 0.13792830263418498, + "grad_norm": 2.4353157449939604, + "learning_rate": 1.9395462624302768e-05, + "loss": 0.4399, + "step": 1741 + }, + { + "epoch": 0.13800752624282037, + "grad_norm": 2.3425346700048, + "learning_rate": 1.93945836492905e-05, + "loss": 0.404, + "step": 1742 + }, + { + "epoch": 0.13808674985145572, + "grad_norm": 2.8871416616390677, + "learning_rate": 1.93937040556901e-05, + "loss": 0.549, + "step": 1743 + }, + { + "epoch": 0.1381659734600911, + "grad_norm": 4.0136586054759755, + "learning_rate": 1.939282384355949e-05, + "loss": 0.512, + "step": 1744 + }, + { + "epoch": 0.13824519706872648, + "grad_norm": 2.5005383446804115, + "learning_rate": 1.9391943012956623e-05, + "loss": 0.4017, + "step": 1745 + }, + { + "epoch": 0.13832442067736186, + "grad_norm": 2.3596636785761893, + "learning_rate": 1.93910615639395e-05, + "loss": 0.362, + "step": 1746 + }, + { + "epoch": 0.13840364428599722, + "grad_norm": 2.79467014377652, + "learning_rate": 1.9390179496566162e-05, + "loss": 0.3851, + "step": 1747 + }, + { + "epoch": 0.1384828678946326, + "grad_norm": 3.2382132850629, + "learning_rate": 1.938929681089469e-05, + "loss": 0.4707, + "step": 1748 + }, + { + "epoch": 0.13856209150326798, + "grad_norm": 3.409663158696584, + "learning_rate": 1.9388413506983196e-05, + "loss": 0.488, + "step": 1749 + }, + { + "epoch": 0.13864131511190333, + "grad_norm": 2.9866191563452777, + "learning_rate": 1.938752958488985e-05, + "loss": 0.4674, + "step": 1750 + }, + { + "epoch": 0.13872053872053872, + "grad_norm": 2.4722404261927147, + "learning_rate": 1.9386645044672848e-05, + "loss": 0.329, + "step": 1751 + }, + { + "epoch": 0.1387997623291741, + "grad_norm": 2.552647734846144, + "learning_rate": 1.9385759886390433e-05, + "loss": 0.3518, + "step": 1752 + }, + { + "epoch": 0.13887898593780948, + "grad_norm": 2.9725759388961737, + "learning_rate": 1.9384874110100897e-05, + "loss": 0.459, + "step": 1753 + }, + { + "epoch": 0.13895820954644483, + "grad_norm": 2.806089119703009, + "learning_rate": 1.9383987715862554e-05, + "loss": 0.3819, + "step": 1754 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 2.6964792416535515, + "learning_rate": 1.9383100703733774e-05, + "loss": 0.5298, + "step": 1755 + }, + { + "epoch": 0.1391166567637156, + "grad_norm": 3.028829052005943, + "learning_rate": 1.9382213073772962e-05, + "loss": 0.4913, + "step": 1756 + }, + { + "epoch": 0.13919588037235095, + "grad_norm": 3.1683897779788777, + "learning_rate": 1.938132482603856e-05, + "loss": 0.5255, + "step": 1757 + }, + { + "epoch": 0.13927510398098633, + "grad_norm": 2.850752841125001, + "learning_rate": 1.9380435960589065e-05, + "loss": 0.4175, + "step": 1758 + }, + { + "epoch": 0.1393543275896217, + "grad_norm": 2.4336769845841766, + "learning_rate": 1.937954647748299e-05, + "loss": 0.4143, + "step": 1759 + }, + { + "epoch": 0.1394335511982571, + "grad_norm": 2.8209896320010595, + "learning_rate": 1.9378656376778914e-05, + "loss": 0.4958, + "step": 1760 + }, + { + "epoch": 0.13951277480689245, + "grad_norm": 3.3084398717923187, + "learning_rate": 1.9377765658535445e-05, + "loss": 0.408, + "step": 1761 + }, + { + "epoch": 0.13959199841552783, + "grad_norm": 2.675740052142721, + "learning_rate": 1.937687432281123e-05, + "loss": 0.2555, + "step": 1762 + }, + { + "epoch": 0.1396712220241632, + "grad_norm": 2.05647363861636, + "learning_rate": 1.9375982369664958e-05, + "loss": 0.3897, + "step": 1763 + }, + { + "epoch": 0.13975044563279856, + "grad_norm": 2.74097400063134, + "learning_rate": 1.937508979915536e-05, + "loss": 0.4509, + "step": 1764 + }, + { + "epoch": 0.13982966924143395, + "grad_norm": 2.859127455478302, + "learning_rate": 1.9374196611341212e-05, + "loss": 0.6438, + "step": 1765 + }, + { + "epoch": 0.13990889285006933, + "grad_norm": 2.7073035930658094, + "learning_rate": 1.937330280628132e-05, + "loss": 0.5711, + "step": 1766 + }, + { + "epoch": 0.13998811645870468, + "grad_norm": 2.6455247709703817, + "learning_rate": 1.937240838403454e-05, + "loss": 0.4085, + "step": 1767 + }, + { + "epoch": 0.14006734006734006, + "grad_norm": 2.457411513078964, + "learning_rate": 1.9371513344659764e-05, + "loss": 0.3975, + "step": 1768 + }, + { + "epoch": 0.14014656367597544, + "grad_norm": 2.245731219545626, + "learning_rate": 1.937061768821593e-05, + "loss": 0.358, + "step": 1769 + }, + { + "epoch": 0.14022578728461083, + "grad_norm": 2.6889545638155368, + "learning_rate": 1.936972141476201e-05, + "loss": 0.305, + "step": 1770 + }, + { + "epoch": 0.14030501089324618, + "grad_norm": 2.783009301793553, + "learning_rate": 1.936882452435702e-05, + "loss": 0.3615, + "step": 1771 + }, + { + "epoch": 0.14038423450188156, + "grad_norm": 2.614667162035841, + "learning_rate": 1.936792701706001e-05, + "loss": 0.3377, + "step": 1772 + }, + { + "epoch": 0.14046345811051694, + "grad_norm": 2.4262050626102862, + "learning_rate": 1.9367028892930088e-05, + "loss": 0.318, + "step": 1773 + }, + { + "epoch": 0.1405426817191523, + "grad_norm": 3.3041205906260798, + "learning_rate": 1.9366130152026378e-05, + "loss": 0.4841, + "step": 1774 + }, + { + "epoch": 0.14062190532778768, + "grad_norm": 2.2797815061366866, + "learning_rate": 1.936523079440807e-05, + "loss": 0.332, + "step": 1775 + }, + { + "epoch": 0.14070112893642306, + "grad_norm": 2.7699321107417787, + "learning_rate": 1.936433082013437e-05, + "loss": 0.4473, + "step": 1776 + }, + { + "epoch": 0.14078035254505844, + "grad_norm": 2.1374182325870565, + "learning_rate": 1.936343022926455e-05, + "loss": 0.3788, + "step": 1777 + }, + { + "epoch": 0.1408595761536938, + "grad_norm": 3.0339973699361042, + "learning_rate": 1.93625290218579e-05, + "loss": 0.5335, + "step": 1778 + }, + { + "epoch": 0.14093879976232918, + "grad_norm": 2.6758153115011516, + "learning_rate": 1.9361627197973767e-05, + "loss": 0.3508, + "step": 1779 + }, + { + "epoch": 0.14101802337096456, + "grad_norm": 2.385172483226919, + "learning_rate": 1.9360724757671525e-05, + "loss": 0.3692, + "step": 1780 + }, + { + "epoch": 0.1410972469795999, + "grad_norm": 2.2394871546720396, + "learning_rate": 1.93598217010106e-05, + "loss": 0.4537, + "step": 1781 + }, + { + "epoch": 0.1411764705882353, + "grad_norm": 2.434345583261253, + "learning_rate": 1.9358918028050453e-05, + "loss": 0.4931, + "step": 1782 + }, + { + "epoch": 0.14125569419687067, + "grad_norm": 2.501755616044473, + "learning_rate": 1.9358013738850586e-05, + "loss": 0.3767, + "step": 1783 + }, + { + "epoch": 0.14133491780550603, + "grad_norm": 2.3754157703780248, + "learning_rate": 1.935710883347054e-05, + "loss": 0.4095, + "step": 1784 + }, + { + "epoch": 0.1414141414141414, + "grad_norm": 2.6069791431240463, + "learning_rate": 1.9356203311969903e-05, + "loss": 0.4818, + "step": 1785 + }, + { + "epoch": 0.1414933650227768, + "grad_norm": 2.2688673897499085, + "learning_rate": 1.9355297174408298e-05, + "loss": 0.3397, + "step": 1786 + }, + { + "epoch": 0.14157258863141217, + "grad_norm": 2.1285235170660766, + "learning_rate": 1.9354390420845387e-05, + "loss": 0.3791, + "step": 1787 + }, + { + "epoch": 0.14165181224004753, + "grad_norm": 3.20407914228043, + "learning_rate": 1.9353483051340876e-05, + "loss": 0.4441, + "step": 1788 + }, + { + "epoch": 0.1417310358486829, + "grad_norm": 2.7316191709489, + "learning_rate": 1.9352575065954515e-05, + "loss": 0.5762, + "step": 1789 + }, + { + "epoch": 0.1418102594573183, + "grad_norm": 2.670139626233861, + "learning_rate": 1.9351666464746087e-05, + "loss": 0.4172, + "step": 1790 + }, + { + "epoch": 0.14188948306595364, + "grad_norm": 2.5409369456840296, + "learning_rate": 1.935075724777542e-05, + "loss": 0.4056, + "step": 1791 + }, + { + "epoch": 0.14196870667458902, + "grad_norm": 2.876611771770737, + "learning_rate": 1.9349847415102378e-05, + "loss": 0.4431, + "step": 1792 + }, + { + "epoch": 0.1420479302832244, + "grad_norm": 2.363433386652227, + "learning_rate": 1.9348936966786874e-05, + "loss": 0.3403, + "step": 1793 + }, + { + "epoch": 0.1421271538918598, + "grad_norm": 3.1314688480524193, + "learning_rate": 1.9348025902888858e-05, + "loss": 0.4836, + "step": 1794 + }, + { + "epoch": 0.14220637750049514, + "grad_norm": 2.902301365139742, + "learning_rate": 1.9347114223468316e-05, + "loss": 0.383, + "step": 1795 + }, + { + "epoch": 0.14228560110913052, + "grad_norm": 3.229547997557393, + "learning_rate": 1.9346201928585273e-05, + "loss": 0.5752, + "step": 1796 + }, + { + "epoch": 0.1423648247177659, + "grad_norm": 2.631268096111741, + "learning_rate": 1.9345289018299807e-05, + "loss": 0.3044, + "step": 1797 + }, + { + "epoch": 0.14244404832640126, + "grad_norm": 2.367723781863915, + "learning_rate": 1.9344375492672024e-05, + "loss": 0.3397, + "step": 1798 + }, + { + "epoch": 0.14252327193503664, + "grad_norm": 2.570068539571693, + "learning_rate": 1.934346135176208e-05, + "loss": 0.2447, + "step": 1799 + }, + { + "epoch": 0.14260249554367202, + "grad_norm": 2.6413623735013214, + "learning_rate": 1.9342546595630162e-05, + "loss": 0.4542, + "step": 1800 + }, + { + "epoch": 0.1426817191523074, + "grad_norm": 2.6938621139400984, + "learning_rate": 1.9341631224336503e-05, + "loss": 0.4423, + "step": 1801 + }, + { + "epoch": 0.14276094276094276, + "grad_norm": 2.508217181521343, + "learning_rate": 1.934071523794138e-05, + "loss": 0.4365, + "step": 1802 + }, + { + "epoch": 0.14284016636957814, + "grad_norm": 2.891330407700625, + "learning_rate": 1.9339798636505102e-05, + "loss": 0.3714, + "step": 1803 + }, + { + "epoch": 0.14291938997821352, + "grad_norm": 2.2090716715124077, + "learning_rate": 1.9338881420088023e-05, + "loss": 0.4381, + "step": 1804 + }, + { + "epoch": 0.14299861358684887, + "grad_norm": 2.7368042797276817, + "learning_rate": 1.933796358875054e-05, + "loss": 0.491, + "step": 1805 + }, + { + "epoch": 0.14307783719548425, + "grad_norm": 2.7439895557239486, + "learning_rate": 1.9337045142553085e-05, + "loss": 0.4179, + "step": 1806 + }, + { + "epoch": 0.14315706080411963, + "grad_norm": 2.7852492841905265, + "learning_rate": 1.9336126081556134e-05, + "loss": 0.4837, + "step": 1807 + }, + { + "epoch": 0.143236284412755, + "grad_norm": 2.515799825706865, + "learning_rate": 1.9335206405820208e-05, + "loss": 0.3615, + "step": 1808 + }, + { + "epoch": 0.14331550802139037, + "grad_norm": 2.5820336848485357, + "learning_rate": 1.933428611540585e-05, + "loss": 0.4055, + "step": 1809 + }, + { + "epoch": 0.14339473163002575, + "grad_norm": 2.981222838263862, + "learning_rate": 1.9333365210373668e-05, + "loss": 0.4564, + "step": 1810 + }, + { + "epoch": 0.14347395523866113, + "grad_norm": 2.2107166964307927, + "learning_rate": 1.93324436907843e-05, + "loss": 0.3019, + "step": 1811 + }, + { + "epoch": 0.1435531788472965, + "grad_norm": 2.5502536517646, + "learning_rate": 1.9331521556698415e-05, + "loss": 0.4435, + "step": 1812 + }, + { + "epoch": 0.14363240245593187, + "grad_norm": 2.5045531843154345, + "learning_rate": 1.9330598808176736e-05, + "loss": 0.4225, + "step": 1813 + }, + { + "epoch": 0.14371162606456725, + "grad_norm": 2.7058029849479355, + "learning_rate": 1.9329675445280024e-05, + "loss": 0.3797, + "step": 1814 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 2.21236360613346, + "learning_rate": 1.9328751468069075e-05, + "loss": 0.272, + "step": 1815 + }, + { + "epoch": 0.14387007328183798, + "grad_norm": 2.3096842202082035, + "learning_rate": 1.932782687660473e-05, + "loss": 0.4776, + "step": 1816 + }, + { + "epoch": 0.14394929689047337, + "grad_norm": 2.6397581305612223, + "learning_rate": 1.9326901670947868e-05, + "loss": 0.4297, + "step": 1817 + }, + { + "epoch": 0.14402852049910875, + "grad_norm": 2.5240663320757832, + "learning_rate": 1.9325975851159406e-05, + "loss": 0.3381, + "step": 1818 + }, + { + "epoch": 0.1441077441077441, + "grad_norm": 2.6027349216933007, + "learning_rate": 1.932504941730031e-05, + "loss": 0.3218, + "step": 1819 + }, + { + "epoch": 0.14418696771637948, + "grad_norm": 2.91259323593562, + "learning_rate": 1.932412236943158e-05, + "loss": 0.4759, + "step": 1820 + }, + { + "epoch": 0.14426619132501486, + "grad_norm": 2.4509157236949175, + "learning_rate": 1.9323194707614253e-05, + "loss": 0.3445, + "step": 1821 + }, + { + "epoch": 0.14434541493365022, + "grad_norm": 2.221462385128824, + "learning_rate": 1.932226643190942e-05, + "loss": 0.4278, + "step": 1822 + }, + { + "epoch": 0.1444246385422856, + "grad_norm": 2.8548104438241957, + "learning_rate": 1.9321337542378193e-05, + "loss": 0.5667, + "step": 1823 + }, + { + "epoch": 0.14450386215092098, + "grad_norm": 2.7113131051064085, + "learning_rate": 1.9320408039081745e-05, + "loss": 0.3562, + "step": 1824 + }, + { + "epoch": 0.14458308575955633, + "grad_norm": 2.613420516914703, + "learning_rate": 1.9319477922081273e-05, + "loss": 0.3635, + "step": 1825 + }, + { + "epoch": 0.14466230936819172, + "grad_norm": 2.602287139400813, + "learning_rate": 1.9318547191438018e-05, + "loss": 0.3518, + "step": 1826 + }, + { + "epoch": 0.1447415329768271, + "grad_norm": 2.490658825983515, + "learning_rate": 1.9317615847213274e-05, + "loss": 0.3429, + "step": 1827 + }, + { + "epoch": 0.14482075658546248, + "grad_norm": 2.915452927581254, + "learning_rate": 1.931668388946836e-05, + "loss": 0.3886, + "step": 1828 + }, + { + "epoch": 0.14489998019409783, + "grad_norm": 2.526407207891424, + "learning_rate": 1.9315751318264636e-05, + "loss": 0.5159, + "step": 1829 + }, + { + "epoch": 0.14497920380273321, + "grad_norm": 2.955168164747815, + "learning_rate": 1.9314818133663516e-05, + "loss": 0.4846, + "step": 1830 + }, + { + "epoch": 0.1450584274113686, + "grad_norm": 3.5121858068678686, + "learning_rate": 1.9313884335726443e-05, + "loss": 0.4437, + "step": 1831 + }, + { + "epoch": 0.14513765102000395, + "grad_norm": 2.4542056913989954, + "learning_rate": 1.93129499245149e-05, + "loss": 0.4038, + "step": 1832 + }, + { + "epoch": 0.14521687462863933, + "grad_norm": 2.614970627669613, + "learning_rate": 1.9312014900090416e-05, + "loss": 0.4505, + "step": 1833 + }, + { + "epoch": 0.1452960982372747, + "grad_norm": 2.94459223807154, + "learning_rate": 1.931107926251456e-05, + "loss": 0.3997, + "step": 1834 + }, + { + "epoch": 0.1453753218459101, + "grad_norm": 2.0741903242743276, + "learning_rate": 1.931014301184893e-05, + "loss": 0.3576, + "step": 1835 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 3.0413031471553604, + "learning_rate": 1.9309206148155188e-05, + "loss": 0.4369, + "step": 1836 + }, + { + "epoch": 0.14553376906318083, + "grad_norm": 3.022673240121045, + "learning_rate": 1.930826867149501e-05, + "loss": 0.3604, + "step": 1837 + }, + { + "epoch": 0.1456129926718162, + "grad_norm": 2.6291402627308327, + "learning_rate": 1.9307330581930127e-05, + "loss": 0.4243, + "step": 1838 + }, + { + "epoch": 0.14569221628045156, + "grad_norm": 2.5038010753974613, + "learning_rate": 1.930639187952231e-05, + "loss": 0.394, + "step": 1839 + }, + { + "epoch": 0.14577143988908695, + "grad_norm": 2.4377826475629245, + "learning_rate": 1.930545256433337e-05, + "loss": 0.3946, + "step": 1840 + }, + { + "epoch": 0.14585066349772233, + "grad_norm": 2.5884114901224793, + "learning_rate": 1.930451263642515e-05, + "loss": 0.4143, + "step": 1841 + }, + { + "epoch": 0.14592988710635768, + "grad_norm": 2.4214111746774387, + "learning_rate": 1.9303572095859545e-05, + "loss": 0.4102, + "step": 1842 + }, + { + "epoch": 0.14600911071499306, + "grad_norm": 2.521537916787601, + "learning_rate": 1.9302630942698487e-05, + "loss": 0.3341, + "step": 1843 + }, + { + "epoch": 0.14608833432362844, + "grad_norm": 2.8937175398340416, + "learning_rate": 1.9301689177003938e-05, + "loss": 0.4283, + "step": 1844 + }, + { + "epoch": 0.14616755793226383, + "grad_norm": 2.110164510539122, + "learning_rate": 1.9300746798837913e-05, + "loss": 0.3956, + "step": 1845 + }, + { + "epoch": 0.14624678154089918, + "grad_norm": 3.300560078020215, + "learning_rate": 1.9299803808262466e-05, + "loss": 0.4582, + "step": 1846 + }, + { + "epoch": 0.14632600514953456, + "grad_norm": 2.5135784194558655, + "learning_rate": 1.9298860205339685e-05, + "loss": 0.338, + "step": 1847 + }, + { + "epoch": 0.14640522875816994, + "grad_norm": 2.5355473042736016, + "learning_rate": 1.9297915990131704e-05, + "loss": 0.3368, + "step": 1848 + }, + { + "epoch": 0.1464844523668053, + "grad_norm": 2.753276065349817, + "learning_rate": 1.9296971162700696e-05, + "loss": 0.435, + "step": 1849 + }, + { + "epoch": 0.14656367597544068, + "grad_norm": 2.3609144108329203, + "learning_rate": 1.9296025723108867e-05, + "loss": 0.3793, + "step": 1850 + }, + { + "epoch": 0.14664289958407606, + "grad_norm": 2.7500007886832107, + "learning_rate": 1.9295079671418474e-05, + "loss": 0.4281, + "step": 1851 + }, + { + "epoch": 0.14672212319271144, + "grad_norm": 2.2373448308827384, + "learning_rate": 1.929413300769181e-05, + "loss": 0.3577, + "step": 1852 + }, + { + "epoch": 0.1468013468013468, + "grad_norm": 2.607880903530943, + "learning_rate": 1.9293185731991212e-05, + "loss": 0.4729, + "step": 1853 + }, + { + "epoch": 0.14688057040998218, + "grad_norm": 2.409573353371404, + "learning_rate": 1.9292237844379043e-05, + "loss": 0.4331, + "step": 1854 + }, + { + "epoch": 0.14695979401861756, + "grad_norm": 2.5813942984074627, + "learning_rate": 1.929128934491773e-05, + "loss": 0.3301, + "step": 1855 + }, + { + "epoch": 0.1470390176272529, + "grad_norm": 2.3618287719994586, + "learning_rate": 1.929034023366972e-05, + "loss": 0.3427, + "step": 1856 + }, + { + "epoch": 0.1471182412358883, + "grad_norm": 2.345557554489996, + "learning_rate": 1.92893905106975e-05, + "loss": 0.3543, + "step": 1857 + }, + { + "epoch": 0.14719746484452367, + "grad_norm": 2.541322116913921, + "learning_rate": 1.9288440176063617e-05, + "loss": 0.3835, + "step": 1858 + }, + { + "epoch": 0.14727668845315905, + "grad_norm": 2.9599252365726505, + "learning_rate": 1.9287489229830645e-05, + "loss": 0.5104, + "step": 1859 + }, + { + "epoch": 0.1473559120617944, + "grad_norm": 2.824437379384795, + "learning_rate": 1.9286537672061192e-05, + "loss": 0.426, + "step": 1860 + }, + { + "epoch": 0.1474351356704298, + "grad_norm": 2.3642509807595546, + "learning_rate": 1.9285585502817917e-05, + "loss": 0.3357, + "step": 1861 + }, + { + "epoch": 0.14751435927906517, + "grad_norm": 2.9052100940915575, + "learning_rate": 1.9284632722163515e-05, + "loss": 0.3626, + "step": 1862 + }, + { + "epoch": 0.14759358288770053, + "grad_norm": 2.6413430336799437, + "learning_rate": 1.9283679330160726e-05, + "loss": 0.4311, + "step": 1863 + }, + { + "epoch": 0.1476728064963359, + "grad_norm": 2.69947293412902, + "learning_rate": 1.9282725326872324e-05, + "loss": 0.2961, + "step": 1864 + }, + { + "epoch": 0.1477520301049713, + "grad_norm": 2.3077475718603653, + "learning_rate": 1.9281770712361123e-05, + "loss": 0.3365, + "step": 1865 + }, + { + "epoch": 0.14783125371360664, + "grad_norm": 2.405100634374392, + "learning_rate": 1.928081548668998e-05, + "loss": 0.4454, + "step": 1866 + }, + { + "epoch": 0.14791047732224202, + "grad_norm": 2.319488157281669, + "learning_rate": 1.9279859649921797e-05, + "loss": 0.4218, + "step": 1867 + }, + { + "epoch": 0.1479897009308774, + "grad_norm": 2.179623154226663, + "learning_rate": 1.9278903202119508e-05, + "loss": 0.419, + "step": 1868 + }, + { + "epoch": 0.1480689245395128, + "grad_norm": 2.416402890079171, + "learning_rate": 1.9277946143346086e-05, + "loss": 0.4061, + "step": 1869 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 2.532753755277741, + "learning_rate": 1.9276988473664557e-05, + "loss": 0.4336, + "step": 1870 + }, + { + "epoch": 0.14822737175678352, + "grad_norm": 2.5448983829791927, + "learning_rate": 1.9276030193137974e-05, + "loss": 0.4713, + "step": 1871 + }, + { + "epoch": 0.1483065953654189, + "grad_norm": 2.344155104525831, + "learning_rate": 1.927507130182944e-05, + "loss": 0.2774, + "step": 1872 + }, + { + "epoch": 0.14838581897405426, + "grad_norm": 2.5045687788943547, + "learning_rate": 1.9274111799802084e-05, + "loss": 0.358, + "step": 1873 + }, + { + "epoch": 0.14846504258268964, + "grad_norm": 2.8864732873718055, + "learning_rate": 1.9273151687119093e-05, + "loss": 0.413, + "step": 1874 + }, + { + "epoch": 0.14854426619132502, + "grad_norm": 3.242439974526763, + "learning_rate": 1.927219096384368e-05, + "loss": 0.3413, + "step": 1875 + }, + { + "epoch": 0.1486234897999604, + "grad_norm": 2.9901869445980767, + "learning_rate": 1.9271229630039107e-05, + "loss": 0.4011, + "step": 1876 + }, + { + "epoch": 0.14870271340859575, + "grad_norm": 2.218276497437702, + "learning_rate": 1.9270267685768676e-05, + "loss": 0.3244, + "step": 1877 + }, + { + "epoch": 0.14878193701723114, + "grad_norm": 2.810244733400113, + "learning_rate": 1.9269305131095722e-05, + "loss": 0.3259, + "step": 1878 + }, + { + "epoch": 0.14886116062586652, + "grad_norm": 2.971104829346572, + "learning_rate": 1.9268341966083627e-05, + "loss": 0.4164, + "step": 1879 + }, + { + "epoch": 0.14894038423450187, + "grad_norm": 3.1301648101400805, + "learning_rate": 1.9267378190795812e-05, + "loss": 0.4199, + "step": 1880 + }, + { + "epoch": 0.14901960784313725, + "grad_norm": 2.6280063689633972, + "learning_rate": 1.9266413805295732e-05, + "loss": 0.3553, + "step": 1881 + }, + { + "epoch": 0.14909883145177263, + "grad_norm": 2.3529598745416735, + "learning_rate": 1.9265448809646893e-05, + "loss": 0.2887, + "step": 1882 + }, + { + "epoch": 0.149178055060408, + "grad_norm": 2.9183526875864683, + "learning_rate": 1.9264483203912826e-05, + "loss": 0.3801, + "step": 1883 + }, + { + "epoch": 0.14925727866904337, + "grad_norm": 2.7442967551169826, + "learning_rate": 1.9263516988157123e-05, + "loss": 0.4737, + "step": 1884 + }, + { + "epoch": 0.14933650227767875, + "grad_norm": 2.920579505008127, + "learning_rate": 1.92625501624434e-05, + "loss": 0.4385, + "step": 1885 + }, + { + "epoch": 0.14941572588631413, + "grad_norm": 2.0719634192726017, + "learning_rate": 1.9261582726835316e-05, + "loss": 0.3508, + "step": 1886 + }, + { + "epoch": 0.1494949494949495, + "grad_norm": 2.368460922799676, + "learning_rate": 1.926061468139657e-05, + "loss": 0.3515, + "step": 1887 + }, + { + "epoch": 0.14957417310358487, + "grad_norm": 2.51784280748681, + "learning_rate": 1.9259646026190913e-05, + "loss": 0.3316, + "step": 1888 + }, + { + "epoch": 0.14965339671222025, + "grad_norm": 2.3549242722517256, + "learning_rate": 1.9258676761282117e-05, + "loss": 0.2845, + "step": 1889 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 2.118485941256803, + "learning_rate": 1.9257706886734e-05, + "loss": 0.3105, + "step": 1890 + }, + { + "epoch": 0.14981184392949098, + "grad_norm": 2.412451448432033, + "learning_rate": 1.9256736402610437e-05, + "loss": 0.3228, + "step": 1891 + }, + { + "epoch": 0.14989106753812637, + "grad_norm": 2.6183425802412166, + "learning_rate": 1.9255765308975322e-05, + "loss": 0.3694, + "step": 1892 + }, + { + "epoch": 0.14997029114676175, + "grad_norm": 2.849067470828875, + "learning_rate": 1.9254793605892596e-05, + "loss": 0.4644, + "step": 1893 + }, + { + "epoch": 0.1500495147553971, + "grad_norm": 2.886134886523467, + "learning_rate": 1.9253821293426242e-05, + "loss": 0.3898, + "step": 1894 + }, + { + "epoch": 0.15012873836403248, + "grad_norm": 2.2099469061875645, + "learning_rate": 1.9252848371640284e-05, + "loss": 0.4072, + "step": 1895 + }, + { + "epoch": 0.15020796197266786, + "grad_norm": 2.4480325323851693, + "learning_rate": 1.925187484059878e-05, + "loss": 0.3571, + "step": 1896 + }, + { + "epoch": 0.15028718558130322, + "grad_norm": 2.8235513613292342, + "learning_rate": 1.9250900700365837e-05, + "loss": 0.4745, + "step": 1897 + }, + { + "epoch": 0.1503664091899386, + "grad_norm": 2.207864618771977, + "learning_rate": 1.9249925951005593e-05, + "loss": 0.2997, + "step": 1898 + }, + { + "epoch": 0.15044563279857398, + "grad_norm": 2.3071061681709044, + "learning_rate": 1.9248950592582235e-05, + "loss": 0.402, + "step": 1899 + }, + { + "epoch": 0.15052485640720936, + "grad_norm": 2.2341515895652466, + "learning_rate": 1.9247974625159983e-05, + "loss": 0.31, + "step": 1900 + }, + { + "epoch": 0.15060408001584472, + "grad_norm": 2.3209978738997252, + "learning_rate": 1.92469980488031e-05, + "loss": 0.4352, + "step": 1901 + }, + { + "epoch": 0.1506833036244801, + "grad_norm": 2.8130193032203645, + "learning_rate": 1.924602086357589e-05, + "loss": 0.4978, + "step": 1902 + }, + { + "epoch": 0.15076252723311548, + "grad_norm": 2.577839814331642, + "learning_rate": 1.9245043069542696e-05, + "loss": 0.3681, + "step": 1903 + }, + { + "epoch": 0.15084175084175083, + "grad_norm": 2.427918404756589, + "learning_rate": 1.92440646667679e-05, + "loss": 0.3145, + "step": 1904 + }, + { + "epoch": 0.1509209744503862, + "grad_norm": 2.555686083023233, + "learning_rate": 1.9243085655315924e-05, + "loss": 0.5014, + "step": 1905 + }, + { + "epoch": 0.1510001980590216, + "grad_norm": 2.9635544974587296, + "learning_rate": 1.924210603525123e-05, + "loss": 0.4371, + "step": 1906 + }, + { + "epoch": 0.15107942166765695, + "grad_norm": 2.596129947065641, + "learning_rate": 1.924112580663833e-05, + "loss": 0.3645, + "step": 1907 + }, + { + "epoch": 0.15115864527629233, + "grad_norm": 2.372571955362532, + "learning_rate": 1.9240144969541754e-05, + "loss": 0.4112, + "step": 1908 + }, + { + "epoch": 0.1512378688849277, + "grad_norm": 2.5385848730304374, + "learning_rate": 1.9239163524026097e-05, + "loss": 0.3696, + "step": 1909 + }, + { + "epoch": 0.1513170924935631, + "grad_norm": 2.4157411808313247, + "learning_rate": 1.9238181470155978e-05, + "loss": 0.3655, + "step": 1910 + }, + { + "epoch": 0.15139631610219845, + "grad_norm": 2.6058752101688807, + "learning_rate": 1.923719880799606e-05, + "loss": 0.4488, + "step": 1911 + }, + { + "epoch": 0.15147553971083383, + "grad_norm": 2.795193109191069, + "learning_rate": 1.9236215537611044e-05, + "loss": 0.4071, + "step": 1912 + }, + { + "epoch": 0.1515547633194692, + "grad_norm": 2.683267018921931, + "learning_rate": 1.923523165906568e-05, + "loss": 0.4357, + "step": 1913 + }, + { + "epoch": 0.15163398692810456, + "grad_norm": 2.752040545520521, + "learning_rate": 1.923424717242475e-05, + "loss": 0.4069, + "step": 1914 + }, + { + "epoch": 0.15171321053673995, + "grad_norm": 2.6445715990298897, + "learning_rate": 1.923326207775307e-05, + "loss": 0.4138, + "step": 1915 + }, + { + "epoch": 0.15179243414537533, + "grad_norm": 2.2271638580208126, + "learning_rate": 1.9232276375115517e-05, + "loss": 0.45, + "step": 1916 + }, + { + "epoch": 0.1518716577540107, + "grad_norm": 2.5305914712532647, + "learning_rate": 1.9231290064576985e-05, + "loss": 0.353, + "step": 1917 + }, + { + "epoch": 0.15195088136264606, + "grad_norm": 2.158368376520064, + "learning_rate": 1.923030314620242e-05, + "loss": 0.365, + "step": 1918 + }, + { + "epoch": 0.15203010497128144, + "grad_norm": 3.1284334951262402, + "learning_rate": 1.9229315620056805e-05, + "loss": 0.4513, + "step": 1919 + }, + { + "epoch": 0.15210932857991682, + "grad_norm": 2.8823217180333307, + "learning_rate": 1.9228327486205166e-05, + "loss": 0.4524, + "step": 1920 + }, + { + "epoch": 0.15218855218855218, + "grad_norm": 2.5224470056536603, + "learning_rate": 1.9227338744712565e-05, + "loss": 0.4435, + "step": 1921 + }, + { + "epoch": 0.15226777579718756, + "grad_norm": 2.415218772986, + "learning_rate": 1.9226349395644106e-05, + "loss": 0.4549, + "step": 1922 + }, + { + "epoch": 0.15234699940582294, + "grad_norm": 2.9917163657098205, + "learning_rate": 1.9225359439064934e-05, + "loss": 0.416, + "step": 1923 + }, + { + "epoch": 0.1524262230144583, + "grad_norm": 2.596585523806836, + "learning_rate": 1.9224368875040235e-05, + "loss": 0.4176, + "step": 1924 + }, + { + "epoch": 0.15250544662309368, + "grad_norm": 2.4960749946423584, + "learning_rate": 1.922337770363523e-05, + "loss": 0.3505, + "step": 1925 + }, + { + "epoch": 0.15258467023172906, + "grad_norm": 2.4648872949395764, + "learning_rate": 1.922238592491518e-05, + "loss": 0.3567, + "step": 1926 + }, + { + "epoch": 0.15266389384036444, + "grad_norm": 2.291997432617008, + "learning_rate": 1.9221393538945397e-05, + "loss": 0.4393, + "step": 1927 + }, + { + "epoch": 0.1527431174489998, + "grad_norm": 2.581662550513519, + "learning_rate": 1.9220400545791216e-05, + "loss": 0.365, + "step": 1928 + }, + { + "epoch": 0.15282234105763517, + "grad_norm": 2.7329104459559397, + "learning_rate": 1.9219406945518028e-05, + "loss": 0.44, + "step": 1929 + }, + { + "epoch": 0.15290156466627056, + "grad_norm": 2.3173582707939318, + "learning_rate": 1.921841273819125e-05, + "loss": 0.3395, + "step": 1930 + }, + { + "epoch": 0.1529807882749059, + "grad_norm": 3.836270283575409, + "learning_rate": 1.9217417923876352e-05, + "loss": 0.595, + "step": 1931 + }, + { + "epoch": 0.1530600118835413, + "grad_norm": 2.7094393505181924, + "learning_rate": 1.9216422502638836e-05, + "loss": 0.3966, + "step": 1932 + }, + { + "epoch": 0.15313923549217667, + "grad_norm": 2.1733271354677064, + "learning_rate": 1.9215426474544242e-05, + "loss": 0.2855, + "step": 1933 + }, + { + "epoch": 0.15321845910081205, + "grad_norm": 2.454972338653038, + "learning_rate": 1.9214429839658156e-05, + "loss": 0.2843, + "step": 1934 + }, + { + "epoch": 0.1532976827094474, + "grad_norm": 2.289194714120497, + "learning_rate": 1.9213432598046205e-05, + "loss": 0.3554, + "step": 1935 + }, + { + "epoch": 0.1533769063180828, + "grad_norm": 2.462188477398111, + "learning_rate": 1.9212434749774048e-05, + "loss": 0.3417, + "step": 1936 + }, + { + "epoch": 0.15345612992671817, + "grad_norm": 2.48518064749135, + "learning_rate": 1.921143629490739e-05, + "loss": 0.3711, + "step": 1937 + }, + { + "epoch": 0.15353535353535352, + "grad_norm": 2.3725140889180025, + "learning_rate": 1.9210437233511974e-05, + "loss": 0.3047, + "step": 1938 + }, + { + "epoch": 0.1536145771439889, + "grad_norm": 3.2169359464900036, + "learning_rate": 1.9209437565653587e-05, + "loss": 0.3353, + "step": 1939 + }, + { + "epoch": 0.1536938007526243, + "grad_norm": 2.59904687906021, + "learning_rate": 1.9208437291398045e-05, + "loss": 0.4435, + "step": 1940 + }, + { + "epoch": 0.15377302436125967, + "grad_norm": 2.607748724392275, + "learning_rate": 1.920743641081122e-05, + "loss": 0.4006, + "step": 1941 + }, + { + "epoch": 0.15385224796989502, + "grad_norm": 2.043277301946978, + "learning_rate": 1.920643492395901e-05, + "loss": 0.3249, + "step": 1942 + }, + { + "epoch": 0.1539314715785304, + "grad_norm": 2.9051574925978847, + "learning_rate": 1.9205432830907353e-05, + "loss": 0.42, + "step": 1943 + }, + { + "epoch": 0.15401069518716579, + "grad_norm": 2.549680541977789, + "learning_rate": 1.9204430131722243e-05, + "loss": 0.3844, + "step": 1944 + }, + { + "epoch": 0.15408991879580114, + "grad_norm": 2.2514664299702836, + "learning_rate": 1.9203426826469695e-05, + "loss": 0.3962, + "step": 1945 + }, + { + "epoch": 0.15416914240443652, + "grad_norm": 2.3550168976942243, + "learning_rate": 1.9202422915215777e-05, + "loss": 0.3624, + "step": 1946 + }, + { + "epoch": 0.1542483660130719, + "grad_norm": 2.750621721649528, + "learning_rate": 1.920141839802659e-05, + "loss": 0.3769, + "step": 1947 + }, + { + "epoch": 0.15432758962170726, + "grad_norm": 2.7096270626917836, + "learning_rate": 1.9200413274968276e-05, + "loss": 0.3509, + "step": 1948 + }, + { + "epoch": 0.15440681323034264, + "grad_norm": 2.5011842807184403, + "learning_rate": 1.9199407546107014e-05, + "loss": 0.4091, + "step": 1949 + }, + { + "epoch": 0.15448603683897802, + "grad_norm": 2.400068797552773, + "learning_rate": 1.919840121150903e-05, + "loss": 0.3399, + "step": 1950 + }, + { + "epoch": 0.1545652604476134, + "grad_norm": 3.365692975849877, + "learning_rate": 1.9197394271240587e-05, + "loss": 0.5116, + "step": 1951 + }, + { + "epoch": 0.15464448405624875, + "grad_norm": 2.348411823614479, + "learning_rate": 1.919638672536799e-05, + "loss": 0.4461, + "step": 1952 + }, + { + "epoch": 0.15472370766488414, + "grad_norm": 2.326574033758665, + "learning_rate": 1.9195378573957574e-05, + "loss": 0.3582, + "step": 1953 + }, + { + "epoch": 0.15480293127351952, + "grad_norm": 3.197937525427283, + "learning_rate": 1.9194369817075725e-05, + "loss": 0.5215, + "step": 1954 + }, + { + "epoch": 0.15488215488215487, + "grad_norm": 2.364082698442043, + "learning_rate": 1.9193360454788864e-05, + "loss": 0.4119, + "step": 1955 + }, + { + "epoch": 0.15496137849079025, + "grad_norm": 2.261356086273808, + "learning_rate": 1.919235048716345e-05, + "loss": 0.3367, + "step": 1956 + }, + { + "epoch": 0.15504060209942563, + "grad_norm": 1.9733795165007209, + "learning_rate": 1.919133991426599e-05, + "loss": 0.339, + "step": 1957 + }, + { + "epoch": 0.15511982570806102, + "grad_norm": 2.5969446728972927, + "learning_rate": 1.919032873616302e-05, + "loss": 0.5067, + "step": 1958 + }, + { + "epoch": 0.15519904931669637, + "grad_norm": 2.185435591931016, + "learning_rate": 1.918931695292113e-05, + "loss": 0.4309, + "step": 1959 + }, + { + "epoch": 0.15527827292533175, + "grad_norm": 2.188221493170469, + "learning_rate": 1.918830456460693e-05, + "loss": 0.42, + "step": 1960 + }, + { + "epoch": 0.15535749653396713, + "grad_norm": 2.550535303965114, + "learning_rate": 1.9187291571287088e-05, + "loss": 0.3554, + "step": 1961 + }, + { + "epoch": 0.15543672014260249, + "grad_norm": 2.2301417324382613, + "learning_rate": 1.91862779730283e-05, + "loss": 0.3438, + "step": 1962 + }, + { + "epoch": 0.15551594375123787, + "grad_norm": 2.740303270086453, + "learning_rate": 1.918526376989731e-05, + "loss": 0.4272, + "step": 1963 + }, + { + "epoch": 0.15559516735987325, + "grad_norm": 2.5375538918440874, + "learning_rate": 1.9184248961960895e-05, + "loss": 0.3971, + "step": 1964 + }, + { + "epoch": 0.1556743909685086, + "grad_norm": 2.658960699154366, + "learning_rate": 1.918323354928588e-05, + "loss": 0.4898, + "step": 1965 + }, + { + "epoch": 0.15575361457714398, + "grad_norm": 2.0416244203848697, + "learning_rate": 1.918221753193912e-05, + "loss": 0.3569, + "step": 1966 + }, + { + "epoch": 0.15583283818577937, + "grad_norm": 2.9353735087713404, + "learning_rate": 1.9181200909987524e-05, + "loss": 0.4745, + "step": 1967 + }, + { + "epoch": 0.15591206179441475, + "grad_norm": 2.3663236729472055, + "learning_rate": 1.918018368349802e-05, + "loss": 0.3946, + "step": 1968 + }, + { + "epoch": 0.1559912854030501, + "grad_norm": 2.5952464620638684, + "learning_rate": 1.9179165852537596e-05, + "loss": 0.4383, + "step": 1969 + }, + { + "epoch": 0.15607050901168548, + "grad_norm": 2.65250779277471, + "learning_rate": 1.9178147417173265e-05, + "loss": 0.5501, + "step": 1970 + }, + { + "epoch": 0.15614973262032086, + "grad_norm": 2.5965675177727428, + "learning_rate": 1.917712837747209e-05, + "loss": 0.4203, + "step": 1971 + }, + { + "epoch": 0.15622895622895622, + "grad_norm": 2.725024577997951, + "learning_rate": 1.917610873350117e-05, + "loss": 0.3939, + "step": 1972 + }, + { + "epoch": 0.1563081798375916, + "grad_norm": 2.7905166985781857, + "learning_rate": 1.917508848532764e-05, + "loss": 0.3775, + "step": 1973 + }, + { + "epoch": 0.15638740344622698, + "grad_norm": 2.408246525866466, + "learning_rate": 1.9174067633018682e-05, + "loss": 0.3487, + "step": 1974 + }, + { + "epoch": 0.15646662705486236, + "grad_norm": 2.288284118588807, + "learning_rate": 1.9173046176641515e-05, + "loss": 0.3322, + "step": 1975 + }, + { + "epoch": 0.15654585066349772, + "grad_norm": 2.5430380247700914, + "learning_rate": 1.917202411626339e-05, + "loss": 0.3408, + "step": 1976 + }, + { + "epoch": 0.1566250742721331, + "grad_norm": 2.4220785156936127, + "learning_rate": 1.9171001451951616e-05, + "loss": 0.4192, + "step": 1977 + }, + { + "epoch": 0.15670429788076848, + "grad_norm": 4.208517632278412, + "learning_rate": 1.916997818377352e-05, + "loss": 0.4149, + "step": 1978 + }, + { + "epoch": 0.15678352148940383, + "grad_norm": 2.970158632130674, + "learning_rate": 1.9168954311796487e-05, + "loss": 0.383, + "step": 1979 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 2.0977795879296144, + "learning_rate": 1.9167929836087932e-05, + "loss": 0.3751, + "step": 1980 + }, + { + "epoch": 0.1569419687066746, + "grad_norm": 2.490154249518521, + "learning_rate": 1.9166904756715307e-05, + "loss": 0.3586, + "step": 1981 + }, + { + "epoch": 0.15702119231530995, + "grad_norm": 2.5337492480961212, + "learning_rate": 1.9165879073746112e-05, + "loss": 0.5593, + "step": 1982 + }, + { + "epoch": 0.15710041592394533, + "grad_norm": 2.1043513379897876, + "learning_rate": 1.9164852787247887e-05, + "loss": 0.4316, + "step": 1983 + }, + { + "epoch": 0.1571796395325807, + "grad_norm": 2.395381249523691, + "learning_rate": 1.91638258972882e-05, + "loss": 0.3593, + "step": 1984 + }, + { + "epoch": 0.1572588631412161, + "grad_norm": 2.4304424087456695, + "learning_rate": 1.916279840393467e-05, + "loss": 0.4216, + "step": 1985 + }, + { + "epoch": 0.15733808674985145, + "grad_norm": 3.044013056940546, + "learning_rate": 1.916177030725496e-05, + "loss": 0.507, + "step": 1986 + }, + { + "epoch": 0.15741731035848683, + "grad_norm": 2.5012033696965146, + "learning_rate": 1.9160741607316755e-05, + "loss": 0.3416, + "step": 1987 + }, + { + "epoch": 0.1574965339671222, + "grad_norm": 2.49684165356504, + "learning_rate": 1.9159712304187795e-05, + "loss": 0.3868, + "step": 1988 + }, + { + "epoch": 0.15757575757575756, + "grad_norm": 2.4806195817284142, + "learning_rate": 1.9158682397935852e-05, + "loss": 0.3231, + "step": 1989 + }, + { + "epoch": 0.15765498118439294, + "grad_norm": 3.0325071001164683, + "learning_rate": 1.9157651888628744e-05, + "loss": 0.4461, + "step": 1990 + }, + { + "epoch": 0.15773420479302833, + "grad_norm": 2.320859475975259, + "learning_rate": 1.915662077633432e-05, + "loss": 0.411, + "step": 1991 + }, + { + "epoch": 0.1578134284016637, + "grad_norm": 2.298453393391117, + "learning_rate": 1.915558906112048e-05, + "loss": 0.3277, + "step": 1992 + }, + { + "epoch": 0.15789265201029906, + "grad_norm": 1.9772050916982755, + "learning_rate": 1.915455674305515e-05, + "loss": 0.2862, + "step": 1993 + }, + { + "epoch": 0.15797187561893444, + "grad_norm": 2.513249010052859, + "learning_rate": 1.9153523822206312e-05, + "loss": 0.3768, + "step": 1994 + }, + { + "epoch": 0.15805109922756982, + "grad_norm": 2.4778682765515008, + "learning_rate": 1.9152490298641973e-05, + "loss": 0.4127, + "step": 1995 + }, + { + "epoch": 0.15813032283620518, + "grad_norm": 2.4736841771935825, + "learning_rate": 1.9151456172430186e-05, + "loss": 0.3561, + "step": 1996 + }, + { + "epoch": 0.15820954644484056, + "grad_norm": 2.3262542182228967, + "learning_rate": 1.9150421443639045e-05, + "loss": 0.3611, + "step": 1997 + }, + { + "epoch": 0.15828877005347594, + "grad_norm": 2.66421357961457, + "learning_rate": 1.9149386112336682e-05, + "loss": 0.4409, + "step": 1998 + }, + { + "epoch": 0.15836799366211132, + "grad_norm": 2.293721974298357, + "learning_rate": 1.9148350178591264e-05, + "loss": 0.3153, + "step": 1999 + }, + { + "epoch": 0.15844721727074668, + "grad_norm": 2.4187314984910717, + "learning_rate": 1.914731364247101e-05, + "loss": 0.4574, + "step": 2000 + }, + { + "epoch": 0.15852644087938206, + "grad_norm": 2.492077352643004, + "learning_rate": 1.914627650404416e-05, + "loss": 0.3937, + "step": 2001 + }, + { + "epoch": 0.15860566448801744, + "grad_norm": 3.8429708832477933, + "learning_rate": 1.9145238763379016e-05, + "loss": 0.4076, + "step": 2002 + }, + { + "epoch": 0.1586848880966528, + "grad_norm": 2.37446947715965, + "learning_rate": 1.9144200420543905e-05, + "loss": 0.3978, + "step": 2003 + }, + { + "epoch": 0.15876411170528817, + "grad_norm": 2.2525181852610587, + "learning_rate": 1.9143161475607194e-05, + "loss": 0.3748, + "step": 2004 + }, + { + "epoch": 0.15884333531392356, + "grad_norm": 3.2333314540265965, + "learning_rate": 1.9142121928637292e-05, + "loss": 0.4077, + "step": 2005 + }, + { + "epoch": 0.1589225589225589, + "grad_norm": 2.2886793279480915, + "learning_rate": 1.914108177970265e-05, + "loss": 0.3586, + "step": 2006 + }, + { + "epoch": 0.1590017825311943, + "grad_norm": 2.9240298932930706, + "learning_rate": 1.914004102887176e-05, + "loss": 0.4101, + "step": 2007 + }, + { + "epoch": 0.15908100613982967, + "grad_norm": 2.645957628733759, + "learning_rate": 1.9138999676213146e-05, + "loss": 0.4604, + "step": 2008 + }, + { + "epoch": 0.15916022974846505, + "grad_norm": 2.8304492123260023, + "learning_rate": 1.9137957721795376e-05, + "loss": 0.4334, + "step": 2009 + }, + { + "epoch": 0.1592394533571004, + "grad_norm": 2.682277593664758, + "learning_rate": 1.913691516568706e-05, + "loss": 0.4744, + "step": 2010 + }, + { + "epoch": 0.1593186769657358, + "grad_norm": 2.8318036336564925, + "learning_rate": 1.9135872007956846e-05, + "loss": 0.3819, + "step": 2011 + }, + { + "epoch": 0.15939790057437117, + "grad_norm": 2.5515605354784734, + "learning_rate": 1.9134828248673415e-05, + "loss": 0.3293, + "step": 2012 + }, + { + "epoch": 0.15947712418300652, + "grad_norm": 2.5218977385903383, + "learning_rate": 1.9133783887905502e-05, + "loss": 0.4383, + "step": 2013 + }, + { + "epoch": 0.1595563477916419, + "grad_norm": 2.42101144537026, + "learning_rate": 1.913273892572187e-05, + "loss": 0.3175, + "step": 2014 + }, + { + "epoch": 0.1596355714002773, + "grad_norm": 2.8670695029270354, + "learning_rate": 1.9131693362191318e-05, + "loss": 0.3706, + "step": 2015 + }, + { + "epoch": 0.15971479500891267, + "grad_norm": 2.965626169572073, + "learning_rate": 1.91306471973827e-05, + "loss": 0.4582, + "step": 2016 + }, + { + "epoch": 0.15979401861754802, + "grad_norm": 2.5223138223433463, + "learning_rate": 1.91296004313649e-05, + "loss": 0.4081, + "step": 2017 + }, + { + "epoch": 0.1598732422261834, + "grad_norm": 2.648496364884265, + "learning_rate": 1.9128553064206835e-05, + "loss": 0.4031, + "step": 2018 + }, + { + "epoch": 0.15995246583481879, + "grad_norm": 2.71057088173164, + "learning_rate": 1.9127505095977483e-05, + "loss": 0.5089, + "step": 2019 + }, + { + "epoch": 0.16003168944345414, + "grad_norm": 2.8047621605232114, + "learning_rate": 1.9126456526745833e-05, + "loss": 0.4594, + "step": 2020 + }, + { + "epoch": 0.16011091305208952, + "grad_norm": 2.858064942449386, + "learning_rate": 1.9125407356580932e-05, + "loss": 0.3752, + "step": 2021 + }, + { + "epoch": 0.1601901366607249, + "grad_norm": 2.84549067250781, + "learning_rate": 1.9124357585551872e-05, + "loss": 0.4845, + "step": 2022 + }, + { + "epoch": 0.16026936026936026, + "grad_norm": 2.498779682999885, + "learning_rate": 1.9123307213727764e-05, + "loss": 0.3665, + "step": 2023 + }, + { + "epoch": 0.16034858387799564, + "grad_norm": 2.47476938925415, + "learning_rate": 1.9122256241177776e-05, + "loss": 0.4031, + "step": 2024 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 2.642160064942329, + "learning_rate": 1.9121204667971107e-05, + "loss": 0.4218, + "step": 2025 + }, + { + "epoch": 0.1605070310952664, + "grad_norm": 2.615368504755733, + "learning_rate": 1.9120152494177e-05, + "loss": 0.3612, + "step": 2026 + }, + { + "epoch": 0.16058625470390175, + "grad_norm": 2.5313924368608807, + "learning_rate": 1.9119099719864735e-05, + "loss": 0.4081, + "step": 2027 + }, + { + "epoch": 0.16066547831253714, + "grad_norm": 3.8175552796999437, + "learning_rate": 1.911804634510363e-05, + "loss": 0.3725, + "step": 2028 + }, + { + "epoch": 0.16074470192117252, + "grad_norm": 2.9951060809210617, + "learning_rate": 1.911699236996305e-05, + "loss": 0.4088, + "step": 2029 + }, + { + "epoch": 0.16082392552980787, + "grad_norm": 2.348003594052534, + "learning_rate": 1.911593779451239e-05, + "loss": 0.3109, + "step": 2030 + }, + { + "epoch": 0.16090314913844325, + "grad_norm": 2.757184007426351, + "learning_rate": 1.911488261882109e-05, + "loss": 0.4856, + "step": 2031 + }, + { + "epoch": 0.16098237274707863, + "grad_norm": 2.303383573113819, + "learning_rate": 1.911382684295862e-05, + "loss": 0.3683, + "step": 2032 + }, + { + "epoch": 0.16106159635571402, + "grad_norm": 2.671306628313457, + "learning_rate": 1.911277046699451e-05, + "loss": 0.4193, + "step": 2033 + }, + { + "epoch": 0.16114081996434937, + "grad_norm": 2.458050905598702, + "learning_rate": 1.9111713490998316e-05, + "loss": 0.2779, + "step": 2034 + }, + { + "epoch": 0.16122004357298475, + "grad_norm": 2.148801434175658, + "learning_rate": 1.911065591503963e-05, + "loss": 0.4058, + "step": 2035 + }, + { + "epoch": 0.16129926718162013, + "grad_norm": 2.5667405398210543, + "learning_rate": 1.9109597739188088e-05, + "loss": 0.4142, + "step": 2036 + }, + { + "epoch": 0.16137849079025549, + "grad_norm": 2.914776632242434, + "learning_rate": 1.9108538963513366e-05, + "loss": 0.4438, + "step": 2037 + }, + { + "epoch": 0.16145771439889087, + "grad_norm": 2.8905125445208593, + "learning_rate": 1.9107479588085182e-05, + "loss": 0.3398, + "step": 2038 + }, + { + "epoch": 0.16153693800752625, + "grad_norm": 2.006768054386868, + "learning_rate": 1.910641961297329e-05, + "loss": 0.4016, + "step": 2039 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 2.552066249231819, + "learning_rate": 1.9105359038247484e-05, + "loss": 0.3846, + "step": 2040 + }, + { + "epoch": 0.16169538522479698, + "grad_norm": 2.3096835855174214, + "learning_rate": 1.9104297863977595e-05, + "loss": 0.4205, + "step": 2041 + }, + { + "epoch": 0.16177460883343237, + "grad_norm": 2.596706776733995, + "learning_rate": 1.9103236090233507e-05, + "loss": 0.4578, + "step": 2042 + }, + { + "epoch": 0.16185383244206775, + "grad_norm": 2.3372339557634927, + "learning_rate": 1.9102173717085114e-05, + "loss": 0.3605, + "step": 2043 + }, + { + "epoch": 0.1619330560507031, + "grad_norm": 2.5420338526568846, + "learning_rate": 1.9101110744602384e-05, + "loss": 0.4239, + "step": 2044 + }, + { + "epoch": 0.16201227965933848, + "grad_norm": 2.537370175925376, + "learning_rate": 1.9100047172855306e-05, + "loss": 0.4747, + "step": 2045 + }, + { + "epoch": 0.16209150326797386, + "grad_norm": 2.295719027597248, + "learning_rate": 1.9098983001913903e-05, + "loss": 0.3365, + "step": 2046 + }, + { + "epoch": 0.16217072687660922, + "grad_norm": 2.4511334477056392, + "learning_rate": 1.909791823184825e-05, + "loss": 0.3897, + "step": 2047 + }, + { + "epoch": 0.1622499504852446, + "grad_norm": 2.2627039506298825, + "learning_rate": 1.909685286272846e-05, + "loss": 0.4409, + "step": 2048 + }, + { + "epoch": 0.16232917409387998, + "grad_norm": 2.4580596605401714, + "learning_rate": 1.9095786894624685e-05, + "loss": 0.3955, + "step": 2049 + }, + { + "epoch": 0.16240839770251536, + "grad_norm": 2.6876106053561037, + "learning_rate": 1.9094720327607102e-05, + "loss": 0.4521, + "step": 2050 + }, + { + "epoch": 0.16248762131115072, + "grad_norm": 2.688260383973597, + "learning_rate": 1.909365316174595e-05, + "loss": 0.4015, + "step": 2051 + }, + { + "epoch": 0.1625668449197861, + "grad_norm": 1.8927562871984127, + "learning_rate": 1.9092585397111492e-05, + "loss": 0.2599, + "step": 2052 + }, + { + "epoch": 0.16264606852842148, + "grad_norm": 2.7526013589756793, + "learning_rate": 1.9091517033774038e-05, + "loss": 0.4724, + "step": 2053 + }, + { + "epoch": 0.16272529213705683, + "grad_norm": 2.1804564163925293, + "learning_rate": 1.9090448071803932e-05, + "loss": 0.3649, + "step": 2054 + }, + { + "epoch": 0.1628045157456922, + "grad_norm": 2.397642986835983, + "learning_rate": 1.908937851127156e-05, + "loss": 0.3842, + "step": 2055 + }, + { + "epoch": 0.1628837393543276, + "grad_norm": 2.413831701072522, + "learning_rate": 1.908830835224735e-05, + "loss": 0.3511, + "step": 2056 + }, + { + "epoch": 0.16296296296296298, + "grad_norm": 2.162875520520643, + "learning_rate": 1.9087237594801762e-05, + "loss": 0.3863, + "step": 2057 + }, + { + "epoch": 0.16304218657159833, + "grad_norm": 2.4008368626074748, + "learning_rate": 1.9086166239005305e-05, + "loss": 0.5084, + "step": 2058 + }, + { + "epoch": 0.1631214101802337, + "grad_norm": 2.5468590631097077, + "learning_rate": 1.908509428492852e-05, + "loss": 0.3354, + "step": 2059 + }, + { + "epoch": 0.1632006337888691, + "grad_norm": 2.543397628910795, + "learning_rate": 1.9084021732641994e-05, + "loss": 0.3894, + "step": 2060 + }, + { + "epoch": 0.16327985739750445, + "grad_norm": 2.4492414147101824, + "learning_rate": 1.9082948582216344e-05, + "loss": 0.4175, + "step": 2061 + }, + { + "epoch": 0.16335908100613983, + "grad_norm": 3.218981075025703, + "learning_rate": 1.9081874833722234e-05, + "loss": 0.459, + "step": 2062 + }, + { + "epoch": 0.1634383046147752, + "grad_norm": 2.87191996109704, + "learning_rate": 1.908080048723037e-05, + "loss": 0.4823, + "step": 2063 + }, + { + "epoch": 0.16351752822341056, + "grad_norm": 2.2306528898608136, + "learning_rate": 1.9079725542811484e-05, + "loss": 0.3937, + "step": 2064 + }, + { + "epoch": 0.16359675183204594, + "grad_norm": 2.041495451703393, + "learning_rate": 1.907865000053636e-05, + "loss": 0.3763, + "step": 2065 + }, + { + "epoch": 0.16367597544068133, + "grad_norm": 1.8736622485455503, + "learning_rate": 1.9077573860475815e-05, + "loss": 0.3738, + "step": 2066 + }, + { + "epoch": 0.1637551990493167, + "grad_norm": 2.6715963215775957, + "learning_rate": 1.9076497122700713e-05, + "loss": 0.3496, + "step": 2067 + }, + { + "epoch": 0.16383442265795206, + "grad_norm": 1.979250690921824, + "learning_rate": 1.9075419787281948e-05, + "loss": 0.4179, + "step": 2068 + }, + { + "epoch": 0.16391364626658744, + "grad_norm": 2.422544721645588, + "learning_rate": 1.9074341854290458e-05, + "loss": 0.3265, + "step": 2069 + }, + { + "epoch": 0.16399286987522282, + "grad_norm": 1.8309182594501825, + "learning_rate": 1.907326332379722e-05, + "loss": 0.3067, + "step": 2070 + }, + { + "epoch": 0.16407209348385818, + "grad_norm": 1.9939881887508093, + "learning_rate": 1.9072184195873248e-05, + "loss": 0.2872, + "step": 2071 + }, + { + "epoch": 0.16415131709249356, + "grad_norm": 2.091185918907336, + "learning_rate": 1.9071104470589603e-05, + "loss": 0.3555, + "step": 2072 + }, + { + "epoch": 0.16423054070112894, + "grad_norm": 3.1267310089949523, + "learning_rate": 1.9070024148017375e-05, + "loss": 0.4318, + "step": 2073 + }, + { + "epoch": 0.16430976430976432, + "grad_norm": 2.505969102780564, + "learning_rate": 1.9068943228227695e-05, + "loss": 0.3401, + "step": 2074 + }, + { + "epoch": 0.16438898791839968, + "grad_norm": 2.937497177671296, + "learning_rate": 1.9067861711291744e-05, + "loss": 0.3575, + "step": 2075 + }, + { + "epoch": 0.16446821152703506, + "grad_norm": 2.564068401025119, + "learning_rate": 1.906677959728073e-05, + "loss": 0.3903, + "step": 2076 + }, + { + "epoch": 0.16454743513567044, + "grad_norm": 2.4717480204506606, + "learning_rate": 1.9065696886265906e-05, + "loss": 0.4541, + "step": 2077 + }, + { + "epoch": 0.1646266587443058, + "grad_norm": 2.265608078191137, + "learning_rate": 1.9064613578318564e-05, + "loss": 0.2936, + "step": 2078 + }, + { + "epoch": 0.16470588235294117, + "grad_norm": 2.5264154209087755, + "learning_rate": 1.9063529673510036e-05, + "loss": 0.2668, + "step": 2079 + }, + { + "epoch": 0.16478510596157656, + "grad_norm": 2.6262399213341205, + "learning_rate": 1.9062445171911688e-05, + "loss": 0.4439, + "step": 2080 + }, + { + "epoch": 0.1648643295702119, + "grad_norm": 2.685429770487024, + "learning_rate": 1.9061360073594933e-05, + "loss": 0.3894, + "step": 2081 + }, + { + "epoch": 0.1649435531788473, + "grad_norm": 3.0024177468917532, + "learning_rate": 1.9060274378631215e-05, + "loss": 0.4441, + "step": 2082 + }, + { + "epoch": 0.16502277678748267, + "grad_norm": 2.7851514300737694, + "learning_rate": 1.9059188087092025e-05, + "loss": 0.43, + "step": 2083 + }, + { + "epoch": 0.16510200039611805, + "grad_norm": 2.5017197289095403, + "learning_rate": 1.905810119904889e-05, + "loss": 0.2752, + "step": 2084 + }, + { + "epoch": 0.1651812240047534, + "grad_norm": 2.413077150062562, + "learning_rate": 1.9057013714573375e-05, + "loss": 0.4302, + "step": 2085 + }, + { + "epoch": 0.1652604476133888, + "grad_norm": 2.4092520752561426, + "learning_rate": 1.9055925633737088e-05, + "loss": 0.3911, + "step": 2086 + }, + { + "epoch": 0.16533967122202417, + "grad_norm": 2.4886958012007856, + "learning_rate": 1.905483695661167e-05, + "loss": 0.4181, + "step": 2087 + }, + { + "epoch": 0.16541889483065952, + "grad_norm": 2.147771132208899, + "learning_rate": 1.905374768326881e-05, + "loss": 0.3424, + "step": 2088 + }, + { + "epoch": 0.1654981184392949, + "grad_norm": 2.736285722084298, + "learning_rate": 1.9052657813780226e-05, + "loss": 0.4367, + "step": 2089 + }, + { + "epoch": 0.1655773420479303, + "grad_norm": 2.086534234254006, + "learning_rate": 1.9051567348217686e-05, + "loss": 0.3098, + "step": 2090 + }, + { + "epoch": 0.16565656565656567, + "grad_norm": 2.3589462520488502, + "learning_rate": 1.905047628665299e-05, + "loss": 0.2758, + "step": 2091 + }, + { + "epoch": 0.16573578926520102, + "grad_norm": 2.384522005928775, + "learning_rate": 1.9049384629157974e-05, + "loss": 0.376, + "step": 2092 + }, + { + "epoch": 0.1658150128738364, + "grad_norm": 2.3333173360236246, + "learning_rate": 1.9048292375804527e-05, + "loss": 0.5036, + "step": 2093 + }, + { + "epoch": 0.16589423648247179, + "grad_norm": 2.761465650581943, + "learning_rate": 1.9047199526664565e-05, + "loss": 0.4701, + "step": 2094 + }, + { + "epoch": 0.16597346009110714, + "grad_norm": 2.390273717896687, + "learning_rate": 1.9046106081810047e-05, + "loss": 0.4178, + "step": 2095 + }, + { + "epoch": 0.16605268369974252, + "grad_norm": 2.485548426032065, + "learning_rate": 1.9045012041312966e-05, + "loss": 0.4411, + "step": 2096 + }, + { + "epoch": 0.1661319073083779, + "grad_norm": 1.9644997048672619, + "learning_rate": 1.904391740524537e-05, + "loss": 0.3021, + "step": 2097 + }, + { + "epoch": 0.16621113091701328, + "grad_norm": 2.3746144407645713, + "learning_rate": 1.9042822173679325e-05, + "loss": 0.3881, + "step": 2098 + }, + { + "epoch": 0.16629035452564864, + "grad_norm": 2.2568019717306687, + "learning_rate": 1.9041726346686952e-05, + "loss": 0.3511, + "step": 2099 + }, + { + "epoch": 0.16636957813428402, + "grad_norm": 2.457961894509813, + "learning_rate": 1.9040629924340406e-05, + "loss": 0.4055, + "step": 2100 + }, + { + "epoch": 0.1664488017429194, + "grad_norm": 2.3332681598428158, + "learning_rate": 1.903953290671188e-05, + "loss": 0.4422, + "step": 2101 + }, + { + "epoch": 0.16652802535155475, + "grad_norm": 2.0983274627979642, + "learning_rate": 1.903843529387361e-05, + "loss": 0.4128, + "step": 2102 + }, + { + "epoch": 0.16660724896019014, + "grad_norm": 2.2835278490024002, + "learning_rate": 1.903733708589786e-05, + "loss": 0.4479, + "step": 2103 + }, + { + "epoch": 0.16668647256882552, + "grad_norm": 2.746919399303464, + "learning_rate": 1.9036238282856952e-05, + "loss": 0.5907, + "step": 2104 + }, + { + "epoch": 0.16676569617746087, + "grad_norm": 2.3865614186491126, + "learning_rate": 1.903513888482323e-05, + "loss": 0.4483, + "step": 2105 + }, + { + "epoch": 0.16684491978609625, + "grad_norm": 1.7891039212143733, + "learning_rate": 1.903403889186909e-05, + "loss": 0.3021, + "step": 2106 + }, + { + "epoch": 0.16692414339473163, + "grad_norm": 2.4818601510672025, + "learning_rate": 1.903293830406696e-05, + "loss": 0.4789, + "step": 2107 + }, + { + "epoch": 0.16700336700336701, + "grad_norm": 2.2380740442567095, + "learning_rate": 1.9031837121489303e-05, + "loss": 0.4511, + "step": 2108 + }, + { + "epoch": 0.16708259061200237, + "grad_norm": 2.1029352008475595, + "learning_rate": 1.903073534420863e-05, + "loss": 0.3873, + "step": 2109 + }, + { + "epoch": 0.16716181422063775, + "grad_norm": 2.4541343693784317, + "learning_rate": 1.9029632972297488e-05, + "loss": 0.3209, + "step": 2110 + }, + { + "epoch": 0.16724103782927313, + "grad_norm": 2.499412314769988, + "learning_rate": 1.9028530005828462e-05, + "loss": 0.3712, + "step": 2111 + }, + { + "epoch": 0.16732026143790849, + "grad_norm": 2.3803087447164053, + "learning_rate": 1.9027426444874177e-05, + "loss": 0.4894, + "step": 2112 + }, + { + "epoch": 0.16739948504654387, + "grad_norm": 2.139522428014468, + "learning_rate": 1.90263222895073e-05, + "loss": 0.3657, + "step": 2113 + }, + { + "epoch": 0.16747870865517925, + "grad_norm": 2.5857360157692844, + "learning_rate": 1.902521753980053e-05, + "loss": 0.4387, + "step": 2114 + }, + { + "epoch": 0.16755793226381463, + "grad_norm": 2.5254515697665045, + "learning_rate": 1.9024112195826614e-05, + "loss": 0.4116, + "step": 2115 + }, + { + "epoch": 0.16763715587244998, + "grad_norm": 2.330962944128933, + "learning_rate": 1.902300625765833e-05, + "loss": 0.353, + "step": 2116 + }, + { + "epoch": 0.16771637948108536, + "grad_norm": 2.429531634297885, + "learning_rate": 1.9021899725368498e-05, + "loss": 0.3004, + "step": 2117 + }, + { + "epoch": 0.16779560308972075, + "grad_norm": 3.105557690144621, + "learning_rate": 1.902079259902998e-05, + "loss": 0.3659, + "step": 2118 + }, + { + "epoch": 0.1678748266983561, + "grad_norm": 2.0809041765481817, + "learning_rate": 1.901968487871568e-05, + "loss": 0.3142, + "step": 2119 + }, + { + "epoch": 0.16795405030699148, + "grad_norm": 2.1673918217544332, + "learning_rate": 1.9018576564498527e-05, + "loss": 0.3441, + "step": 2120 + }, + { + "epoch": 0.16803327391562686, + "grad_norm": 2.8895498973788722, + "learning_rate": 1.9017467656451498e-05, + "loss": 0.4644, + "step": 2121 + }, + { + "epoch": 0.16811249752426222, + "grad_norm": 2.653218764317275, + "learning_rate": 1.9016358154647618e-05, + "loss": 0.4446, + "step": 2122 + }, + { + "epoch": 0.1681917211328976, + "grad_norm": 2.276938814563543, + "learning_rate": 1.9015248059159937e-05, + "loss": 0.3865, + "step": 2123 + }, + { + "epoch": 0.16827094474153298, + "grad_norm": 2.7172014187309372, + "learning_rate": 1.901413737006155e-05, + "loss": 0.4451, + "step": 2124 + }, + { + "epoch": 0.16835016835016836, + "grad_norm": 2.6093729567236896, + "learning_rate": 1.901302608742559e-05, + "loss": 0.4226, + "step": 2125 + }, + { + "epoch": 0.16842939195880371, + "grad_norm": 2.595846609277253, + "learning_rate": 1.9011914211325225e-05, + "loss": 0.4025, + "step": 2126 + }, + { + "epoch": 0.1685086155674391, + "grad_norm": 1.9634697524463316, + "learning_rate": 1.9010801741833678e-05, + "loss": 0.3354, + "step": 2127 + }, + { + "epoch": 0.16858783917607448, + "grad_norm": 3.349044899521629, + "learning_rate": 1.900968867902419e-05, + "loss": 0.403, + "step": 2128 + }, + { + "epoch": 0.16866706278470983, + "grad_norm": 2.4891659613813073, + "learning_rate": 1.900857502297006e-05, + "loss": 0.3356, + "step": 2129 + }, + { + "epoch": 0.1687462863933452, + "grad_norm": 1.8493124973294865, + "learning_rate": 1.9007460773744605e-05, + "loss": 0.2588, + "step": 2130 + }, + { + "epoch": 0.1688255100019806, + "grad_norm": 2.1532833458909346, + "learning_rate": 1.90063459314212e-05, + "loss": 0.2615, + "step": 2131 + }, + { + "epoch": 0.16890473361061598, + "grad_norm": 2.533837726728809, + "learning_rate": 1.9005230496073256e-05, + "loss": 0.2674, + "step": 2132 + }, + { + "epoch": 0.16898395721925133, + "grad_norm": 2.9587320464331577, + "learning_rate": 1.900411446777421e-05, + "loss": 0.4194, + "step": 2133 + }, + { + "epoch": 0.1690631808278867, + "grad_norm": 2.2784747424848435, + "learning_rate": 1.900299784659755e-05, + "loss": 0.3482, + "step": 2134 + }, + { + "epoch": 0.1691424044365221, + "grad_norm": 2.768012413760472, + "learning_rate": 1.9001880632616806e-05, + "loss": 0.4818, + "step": 2135 + }, + { + "epoch": 0.16922162804515745, + "grad_norm": 3.7781618071929057, + "learning_rate": 1.9000762825905535e-05, + "loss": 0.4172, + "step": 2136 + }, + { + "epoch": 0.16930085165379283, + "grad_norm": 2.496340060068725, + "learning_rate": 1.899964442653734e-05, + "loss": 0.3051, + "step": 2137 + }, + { + "epoch": 0.1693800752624282, + "grad_norm": 2.528331472675459, + "learning_rate": 1.8998525434585862e-05, + "loss": 0.4017, + "step": 2138 + }, + { + "epoch": 0.1694592988710636, + "grad_norm": 2.2919283342131784, + "learning_rate": 1.8997405850124786e-05, + "loss": 0.3101, + "step": 2139 + }, + { + "epoch": 0.16953852247969894, + "grad_norm": 2.434449683924331, + "learning_rate": 1.8996285673227826e-05, + "loss": 0.3701, + "step": 2140 + }, + { + "epoch": 0.16961774608833433, + "grad_norm": 2.5664662080142318, + "learning_rate": 1.899516490396874e-05, + "loss": 0.4562, + "step": 2141 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 2.373680078928652, + "learning_rate": 1.8994043542421328e-05, + "loss": 0.3602, + "step": 2142 + }, + { + "epoch": 0.16977619330560506, + "grad_norm": 2.9418582009739844, + "learning_rate": 1.8992921588659424e-05, + "loss": 0.3693, + "step": 2143 + }, + { + "epoch": 0.16985541691424044, + "grad_norm": 1.9707006698811431, + "learning_rate": 1.8991799042756906e-05, + "loss": 0.2413, + "step": 2144 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 2.4198743180837017, + "learning_rate": 1.8990675904787688e-05, + "loss": 0.3169, + "step": 2145 + }, + { + "epoch": 0.17001386413151118, + "grad_norm": 2.4385245086665845, + "learning_rate": 1.898955217482572e-05, + "loss": 0.405, + "step": 2146 + }, + { + "epoch": 0.17009308774014656, + "grad_norm": 2.448066099258772, + "learning_rate": 1.8988427852944997e-05, + "loss": 0.3468, + "step": 2147 + }, + { + "epoch": 0.17017231134878194, + "grad_norm": 2.706294096472709, + "learning_rate": 1.898730293921955e-05, + "loss": 0.4089, + "step": 2148 + }, + { + "epoch": 0.17025153495741732, + "grad_norm": 2.8815788548623886, + "learning_rate": 1.8986177433723446e-05, + "loss": 0.418, + "step": 2149 + }, + { + "epoch": 0.17033075856605268, + "grad_norm": 2.371487072909641, + "learning_rate": 1.89850513365308e-05, + "loss": 0.401, + "step": 2150 + }, + { + "epoch": 0.17040998217468806, + "grad_norm": 3.104191787096029, + "learning_rate": 1.8983924647715756e-05, + "loss": 0.4464, + "step": 2151 + }, + { + "epoch": 0.17048920578332344, + "grad_norm": 2.440025377327329, + "learning_rate": 1.89827973673525e-05, + "loss": 0.2731, + "step": 2152 + }, + { + "epoch": 0.1705684293919588, + "grad_norm": 2.8971508373914756, + "learning_rate": 1.8981669495515264e-05, + "loss": 0.4354, + "step": 2153 + }, + { + "epoch": 0.17064765300059417, + "grad_norm": 3.247004717659025, + "learning_rate": 1.8980541032278302e-05, + "loss": 0.5439, + "step": 2154 + }, + { + "epoch": 0.17072687660922956, + "grad_norm": 2.374871292005657, + "learning_rate": 1.8979411977715928e-05, + "loss": 0.4213, + "step": 2155 + }, + { + "epoch": 0.17080610021786494, + "grad_norm": 2.2545037196460367, + "learning_rate": 1.8978282331902483e-05, + "loss": 0.2908, + "step": 2156 + }, + { + "epoch": 0.1708853238265003, + "grad_norm": 2.619256686868994, + "learning_rate": 1.8977152094912346e-05, + "loss": 0.441, + "step": 2157 + }, + { + "epoch": 0.17096454743513567, + "grad_norm": 2.192957677833369, + "learning_rate": 1.897602126681994e-05, + "loss": 0.3166, + "step": 2158 + }, + { + "epoch": 0.17104377104377105, + "grad_norm": 2.430374293376807, + "learning_rate": 1.897488984769972e-05, + "loss": 0.3759, + "step": 2159 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 2.7285783174520675, + "learning_rate": 1.8973757837626193e-05, + "loss": 0.3116, + "step": 2160 + }, + { + "epoch": 0.1712022182610418, + "grad_norm": 2.1713937899303315, + "learning_rate": 1.8972625236673887e-05, + "loss": 0.2753, + "step": 2161 + }, + { + "epoch": 0.17128144186967717, + "grad_norm": 2.4081495295913737, + "learning_rate": 1.8971492044917386e-05, + "loss": 0.3759, + "step": 2162 + }, + { + "epoch": 0.17136066547831252, + "grad_norm": 2.7599536453866804, + "learning_rate": 1.8970358262431297e-05, + "loss": 0.5082, + "step": 2163 + }, + { + "epoch": 0.1714398890869479, + "grad_norm": 2.556122205339184, + "learning_rate": 1.8969223889290283e-05, + "loss": 0.4345, + "step": 2164 + }, + { + "epoch": 0.1715191126955833, + "grad_norm": 2.895613537402155, + "learning_rate": 1.8968088925569032e-05, + "loss": 0.4889, + "step": 2165 + }, + { + "epoch": 0.17159833630421867, + "grad_norm": 2.2890530778345846, + "learning_rate": 1.896695337134228e-05, + "loss": 0.335, + "step": 2166 + }, + { + "epoch": 0.17167755991285402, + "grad_norm": 2.5929684615372124, + "learning_rate": 1.8965817226684794e-05, + "loss": 0.3215, + "step": 2167 + }, + { + "epoch": 0.1717567835214894, + "grad_norm": 2.4875316299779775, + "learning_rate": 1.896468049167138e-05, + "loss": 0.3408, + "step": 2168 + }, + { + "epoch": 0.17183600713012478, + "grad_norm": 2.1899809524907665, + "learning_rate": 1.896354316637689e-05, + "loss": 0.2778, + "step": 2169 + }, + { + "epoch": 0.17191523073876014, + "grad_norm": 2.3699029896496926, + "learning_rate": 1.8962405250876218e-05, + "loss": 0.3449, + "step": 2170 + }, + { + "epoch": 0.17199445434739552, + "grad_norm": 2.358263386433618, + "learning_rate": 1.896126674524428e-05, + "loss": 0.2964, + "step": 2171 + }, + { + "epoch": 0.1720736779560309, + "grad_norm": 2.7779522918396675, + "learning_rate": 1.896012764955605e-05, + "loss": 0.3479, + "step": 2172 + }, + { + "epoch": 0.17215290156466628, + "grad_norm": 2.8200480690484158, + "learning_rate": 1.8958987963886526e-05, + "loss": 0.322, + "step": 2173 + }, + { + "epoch": 0.17223212517330164, + "grad_norm": 2.2987813864920645, + "learning_rate": 1.8957847688310752e-05, + "loss": 0.3801, + "step": 2174 + }, + { + "epoch": 0.17231134878193702, + "grad_norm": 2.9524822890598577, + "learning_rate": 1.8956706822903812e-05, + "loss": 0.294, + "step": 2175 + }, + { + "epoch": 0.1723905723905724, + "grad_norm": 2.417615406569931, + "learning_rate": 1.8955565367740824e-05, + "loss": 0.3939, + "step": 2176 + }, + { + "epoch": 0.17246979599920775, + "grad_norm": 2.2878973867876553, + "learning_rate": 1.8954423322896944e-05, + "loss": 0.3393, + "step": 2177 + }, + { + "epoch": 0.17254901960784313, + "grad_norm": 2.7467332554696555, + "learning_rate": 1.895328068844738e-05, + "loss": 0.4135, + "step": 2178 + }, + { + "epoch": 0.17262824321647852, + "grad_norm": 2.410082829360762, + "learning_rate": 1.8952137464467358e-05, + "loss": 0.3861, + "step": 2179 + }, + { + "epoch": 0.1727074668251139, + "grad_norm": 2.3148305943350236, + "learning_rate": 1.895099365103216e-05, + "loss": 0.3363, + "step": 2180 + }, + { + "epoch": 0.17278669043374925, + "grad_norm": 2.4451622488844698, + "learning_rate": 1.89498492482171e-05, + "loss": 0.3457, + "step": 2181 + }, + { + "epoch": 0.17286591404238463, + "grad_norm": 2.9123090342896596, + "learning_rate": 1.8948704256097533e-05, + "loss": 0.4391, + "step": 2182 + }, + { + "epoch": 0.17294513765102001, + "grad_norm": 2.5961962897146953, + "learning_rate": 1.8947558674748844e-05, + "loss": 0.3523, + "step": 2183 + }, + { + "epoch": 0.17302436125965537, + "grad_norm": 2.3554909079110096, + "learning_rate": 1.8946412504246474e-05, + "loss": 0.3494, + "step": 2184 + }, + { + "epoch": 0.17310358486829075, + "grad_norm": 2.540965403058671, + "learning_rate": 1.8945265744665886e-05, + "loss": 0.3714, + "step": 2185 + }, + { + "epoch": 0.17318280847692613, + "grad_norm": 2.429718294333805, + "learning_rate": 1.8944118396082594e-05, + "loss": 0.4051, + "step": 2186 + }, + { + "epoch": 0.17326203208556148, + "grad_norm": 2.3319168615465395, + "learning_rate": 1.8942970458572138e-05, + "loss": 0.3159, + "step": 2187 + }, + { + "epoch": 0.17334125569419687, + "grad_norm": 2.180703276575142, + "learning_rate": 1.894182193221011e-05, + "loss": 0.3446, + "step": 2188 + }, + { + "epoch": 0.17342047930283225, + "grad_norm": 3.544600079813891, + "learning_rate": 1.894067281707213e-05, + "loss": 0.4013, + "step": 2189 + }, + { + "epoch": 0.17349970291146763, + "grad_norm": 2.077860658659168, + "learning_rate": 1.893952311323387e-05, + "loss": 0.2531, + "step": 2190 + }, + { + "epoch": 0.17357892652010298, + "grad_norm": 2.7539813496483565, + "learning_rate": 1.8938372820771024e-05, + "loss": 0.4322, + "step": 2191 + }, + { + "epoch": 0.17365815012873836, + "grad_norm": 2.3017730244002834, + "learning_rate": 1.8937221939759334e-05, + "loss": 0.2896, + "step": 2192 + }, + { + "epoch": 0.17373737373737375, + "grad_norm": 2.2789548140396674, + "learning_rate": 1.8936070470274587e-05, + "loss": 0.3167, + "step": 2193 + }, + { + "epoch": 0.1738165973460091, + "grad_norm": 2.4857092572182737, + "learning_rate": 1.8934918412392596e-05, + "loss": 0.3634, + "step": 2194 + }, + { + "epoch": 0.17389582095464448, + "grad_norm": 3.0808422718461514, + "learning_rate": 1.893376576618922e-05, + "loss": 0.4165, + "step": 2195 + }, + { + "epoch": 0.17397504456327986, + "grad_norm": 2.106262606885841, + "learning_rate": 1.8932612531740354e-05, + "loss": 0.3718, + "step": 2196 + }, + { + "epoch": 0.17405426817191524, + "grad_norm": 2.5529701921922117, + "learning_rate": 1.893145870912193e-05, + "loss": 0.369, + "step": 2197 + }, + { + "epoch": 0.1741334917805506, + "grad_norm": 2.442736798790252, + "learning_rate": 1.8930304298409933e-05, + "loss": 0.3621, + "step": 2198 + }, + { + "epoch": 0.17421271538918598, + "grad_norm": 2.5063383000144634, + "learning_rate": 1.8929149299680364e-05, + "loss": 0.2891, + "step": 2199 + }, + { + "epoch": 0.17429193899782136, + "grad_norm": 1.7475169515077713, + "learning_rate": 1.8927993713009275e-05, + "loss": 0.2974, + "step": 2200 + }, + { + "epoch": 0.17437116260645671, + "grad_norm": 2.4189101364032757, + "learning_rate": 1.892683753847276e-05, + "loss": 0.3523, + "step": 2201 + }, + { + "epoch": 0.1744503862150921, + "grad_norm": 2.303026791129677, + "learning_rate": 1.892568077614695e-05, + "loss": 0.3423, + "step": 2202 + }, + { + "epoch": 0.17452960982372748, + "grad_norm": 2.7473646135768695, + "learning_rate": 1.892452342610801e-05, + "loss": 0.4762, + "step": 2203 + }, + { + "epoch": 0.17460883343236283, + "grad_norm": 2.7201715222550864, + "learning_rate": 1.892336548843214e-05, + "loss": 0.3626, + "step": 2204 + }, + { + "epoch": 0.1746880570409982, + "grad_norm": 2.281882051748451, + "learning_rate": 1.892220696319559e-05, + "loss": 0.2948, + "step": 2205 + }, + { + "epoch": 0.1747672806496336, + "grad_norm": 2.3124394676615516, + "learning_rate": 1.8921047850474645e-05, + "loss": 0.4231, + "step": 2206 + }, + { + "epoch": 0.17484650425826898, + "grad_norm": 2.1894654960596003, + "learning_rate": 1.891988815034562e-05, + "loss": 0.2889, + "step": 2207 + }, + { + "epoch": 0.17492572786690433, + "grad_norm": 2.7231170288819846, + "learning_rate": 1.891872786288488e-05, + "loss": 0.5903, + "step": 2208 + }, + { + "epoch": 0.1750049514755397, + "grad_norm": 2.4238463547996543, + "learning_rate": 1.8917566988168826e-05, + "loss": 0.3282, + "step": 2209 + }, + { + "epoch": 0.1750841750841751, + "grad_norm": 1.9409698911102269, + "learning_rate": 1.8916405526273894e-05, + "loss": 0.355, + "step": 2210 + }, + { + "epoch": 0.17516339869281045, + "grad_norm": 2.6014900331580413, + "learning_rate": 1.8915243477276563e-05, + "loss": 0.4701, + "step": 2211 + }, + { + "epoch": 0.17524262230144583, + "grad_norm": 2.2879987364227308, + "learning_rate": 1.8914080841253348e-05, + "loss": 0.3437, + "step": 2212 + }, + { + "epoch": 0.1753218459100812, + "grad_norm": 2.7801715899583885, + "learning_rate": 1.8912917618280796e-05, + "loss": 0.5025, + "step": 2213 + }, + { + "epoch": 0.1754010695187166, + "grad_norm": 2.694736890213809, + "learning_rate": 1.8911753808435508e-05, + "loss": 0.4684, + "step": 2214 + }, + { + "epoch": 0.17548029312735194, + "grad_norm": 2.616118825405062, + "learning_rate": 1.891058941179411e-05, + "loss": 0.2805, + "step": 2215 + }, + { + "epoch": 0.17555951673598733, + "grad_norm": 2.2962292147394767, + "learning_rate": 1.8909424428433278e-05, + "loss": 0.3663, + "step": 2216 + }, + { + "epoch": 0.1756387403446227, + "grad_norm": 2.300167652656391, + "learning_rate": 1.8908258858429716e-05, + "loss": 0.3352, + "step": 2217 + }, + { + "epoch": 0.17571796395325806, + "grad_norm": 2.238963644423776, + "learning_rate": 1.890709270186017e-05, + "loss": 0.3778, + "step": 2218 + }, + { + "epoch": 0.17579718756189344, + "grad_norm": 2.0216601390263413, + "learning_rate": 1.890592595880143e-05, + "loss": 0.3113, + "step": 2219 + }, + { + "epoch": 0.17587641117052882, + "grad_norm": 2.3600720490721416, + "learning_rate": 1.890475862933032e-05, + "loss": 0.5143, + "step": 2220 + }, + { + "epoch": 0.17595563477916418, + "grad_norm": 2.481295917569171, + "learning_rate": 1.8903590713523698e-05, + "loss": 0.3345, + "step": 2221 + }, + { + "epoch": 0.17603485838779956, + "grad_norm": 1.9133074466708255, + "learning_rate": 1.8902422211458466e-05, + "loss": 0.2761, + "step": 2222 + }, + { + "epoch": 0.17611408199643494, + "grad_norm": 2.360993772606194, + "learning_rate": 1.890125312321157e-05, + "loss": 0.3967, + "step": 2223 + }, + { + "epoch": 0.17619330560507032, + "grad_norm": 2.209250537372684, + "learning_rate": 1.8900083448859986e-05, + "loss": 0.3227, + "step": 2224 + }, + { + "epoch": 0.17627252921370568, + "grad_norm": 2.216775376139844, + "learning_rate": 1.8898913188480733e-05, + "loss": 0.2768, + "step": 2225 + }, + { + "epoch": 0.17635175282234106, + "grad_norm": 2.985214414290217, + "learning_rate": 1.8897742342150863e-05, + "loss": 0.5473, + "step": 2226 + }, + { + "epoch": 0.17643097643097644, + "grad_norm": 2.54720611375589, + "learning_rate": 1.8896570909947477e-05, + "loss": 0.4999, + "step": 2227 + }, + { + "epoch": 0.1765102000396118, + "grad_norm": 2.5616891684490266, + "learning_rate": 1.88953988919477e-05, + "loss": 0.3606, + "step": 2228 + }, + { + "epoch": 0.17658942364824717, + "grad_norm": 2.7624486173525127, + "learning_rate": 1.8894226288228707e-05, + "loss": 0.3502, + "step": 2229 + }, + { + "epoch": 0.17666864725688255, + "grad_norm": 2.899884035817668, + "learning_rate": 1.8893053098867714e-05, + "loss": 0.3123, + "step": 2230 + }, + { + "epoch": 0.17674787086551794, + "grad_norm": 2.413198494708979, + "learning_rate": 1.889187932394196e-05, + "loss": 0.4752, + "step": 2231 + }, + { + "epoch": 0.1768270944741533, + "grad_norm": 1.8774862286167893, + "learning_rate": 1.889070496352874e-05, + "loss": 0.3285, + "step": 2232 + }, + { + "epoch": 0.17690631808278867, + "grad_norm": 2.8019297147960502, + "learning_rate": 1.888953001770538e-05, + "loss": 0.4447, + "step": 2233 + }, + { + "epoch": 0.17698554169142405, + "grad_norm": 3.089390450629726, + "learning_rate": 1.8888354486549238e-05, + "loss": 0.4364, + "step": 2234 + }, + { + "epoch": 0.1770647653000594, + "grad_norm": 2.7763340171829545, + "learning_rate": 1.888717837013772e-05, + "loss": 0.3412, + "step": 2235 + }, + { + "epoch": 0.1771439889086948, + "grad_norm": 2.350382712116652, + "learning_rate": 1.8886001668548273e-05, + "loss": 0.3596, + "step": 2236 + }, + { + "epoch": 0.17722321251733017, + "grad_norm": 2.0589733456646813, + "learning_rate": 1.8884824381858368e-05, + "loss": 0.348, + "step": 2237 + }, + { + "epoch": 0.17730243612596555, + "grad_norm": 2.713390016532201, + "learning_rate": 1.888364651014553e-05, + "loss": 0.3167, + "step": 2238 + }, + { + "epoch": 0.1773816597346009, + "grad_norm": 2.580833062371776, + "learning_rate": 1.888246805348732e-05, + "loss": 0.3798, + "step": 2239 + }, + { + "epoch": 0.1774608833432363, + "grad_norm": 2.097769257939288, + "learning_rate": 1.8881289011961323e-05, + "loss": 0.383, + "step": 2240 + }, + { + "epoch": 0.17754010695187167, + "grad_norm": 2.4511582730125356, + "learning_rate": 1.8880109385645184e-05, + "loss": 0.4192, + "step": 2241 + }, + { + "epoch": 0.17761933056050702, + "grad_norm": 2.152868131719102, + "learning_rate": 1.8878929174616566e-05, + "loss": 0.2988, + "step": 2242 + }, + { + "epoch": 0.1776985541691424, + "grad_norm": 3.348306824672599, + "learning_rate": 1.887774837895318e-05, + "loss": 0.3431, + "step": 2243 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 2.073670096785361, + "learning_rate": 1.887656699873279e-05, + "loss": 0.4473, + "step": 2244 + }, + { + "epoch": 0.17785700138641314, + "grad_norm": 2.1205626412960403, + "learning_rate": 1.887538503403317e-05, + "loss": 0.3981, + "step": 2245 + }, + { + "epoch": 0.17793622499504852, + "grad_norm": 2.1057075517848145, + "learning_rate": 1.8874202484932148e-05, + "loss": 0.541, + "step": 2246 + }, + { + "epoch": 0.1780154486036839, + "grad_norm": 2.2622129274146556, + "learning_rate": 1.8873019351507596e-05, + "loss": 0.4459, + "step": 2247 + }, + { + "epoch": 0.17809467221231928, + "grad_norm": 2.2590690386233345, + "learning_rate": 1.887183563383741e-05, + "loss": 0.4301, + "step": 2248 + }, + { + "epoch": 0.17817389582095464, + "grad_norm": 1.8527160438370396, + "learning_rate": 1.8870651331999542e-05, + "loss": 0.3188, + "step": 2249 + }, + { + "epoch": 0.17825311942959002, + "grad_norm": 2.1888037910866567, + "learning_rate": 1.886946644607196e-05, + "loss": 0.3387, + "step": 2250 + }, + { + "epoch": 0.1783323430382254, + "grad_norm": 2.203207946909501, + "learning_rate": 1.8868280976132697e-05, + "loss": 0.3214, + "step": 2251 + }, + { + "epoch": 0.17841156664686075, + "grad_norm": 2.162461292130499, + "learning_rate": 1.8867094922259798e-05, + "loss": 0.2963, + "step": 2252 + }, + { + "epoch": 0.17849079025549613, + "grad_norm": 2.2357105821769334, + "learning_rate": 1.8865908284531368e-05, + "loss": 0.3015, + "step": 2253 + }, + { + "epoch": 0.17857001386413152, + "grad_norm": 2.098565615184502, + "learning_rate": 1.8864721063025536e-05, + "loss": 0.3427, + "step": 2254 + }, + { + "epoch": 0.1786492374727669, + "grad_norm": 2.544108450267634, + "learning_rate": 1.8863533257820475e-05, + "loss": 0.2758, + "step": 2255 + }, + { + "epoch": 0.17872846108140225, + "grad_norm": 2.563509390950677, + "learning_rate": 1.8862344868994395e-05, + "loss": 0.4012, + "step": 2256 + }, + { + "epoch": 0.17880768469003763, + "grad_norm": 2.056589261982362, + "learning_rate": 1.8861155896625553e-05, + "loss": 0.3323, + "step": 2257 + }, + { + "epoch": 0.17888690829867301, + "grad_norm": 2.4927181003160865, + "learning_rate": 1.885996634079223e-05, + "loss": 0.3375, + "step": 2258 + }, + { + "epoch": 0.17896613190730837, + "grad_norm": 2.1622531747641003, + "learning_rate": 1.8858776201572758e-05, + "loss": 0.3627, + "step": 2259 + }, + { + "epoch": 0.17904535551594375, + "grad_norm": 2.1178658779004795, + "learning_rate": 1.8857585479045493e-05, + "loss": 0.2167, + "step": 2260 + }, + { + "epoch": 0.17912457912457913, + "grad_norm": 2.3911010311555936, + "learning_rate": 1.8856394173288848e-05, + "loss": 0.4988, + "step": 2261 + }, + { + "epoch": 0.17920380273321448, + "grad_norm": 3.090916014336323, + "learning_rate": 1.8855202284381264e-05, + "loss": 0.4275, + "step": 2262 + }, + { + "epoch": 0.17928302634184987, + "grad_norm": 2.521138369068937, + "learning_rate": 1.8854009812401213e-05, + "loss": 0.3247, + "step": 2263 + }, + { + "epoch": 0.17936224995048525, + "grad_norm": 2.5078213967329748, + "learning_rate": 1.885281675742722e-05, + "loss": 0.4314, + "step": 2264 + }, + { + "epoch": 0.17944147355912063, + "grad_norm": 2.0203068400035735, + "learning_rate": 1.885162311953784e-05, + "loss": 0.2707, + "step": 2265 + }, + { + "epoch": 0.17952069716775598, + "grad_norm": 2.3394484789226575, + "learning_rate": 1.885042889881167e-05, + "loss": 0.3076, + "step": 2266 + }, + { + "epoch": 0.17959992077639136, + "grad_norm": 2.976526413132003, + "learning_rate": 1.8849234095327343e-05, + "loss": 0.5041, + "step": 2267 + }, + { + "epoch": 0.17967914438502675, + "grad_norm": 3.192980426468377, + "learning_rate": 1.884803870916353e-05, + "loss": 0.356, + "step": 2268 + }, + { + "epoch": 0.1797583679936621, + "grad_norm": 2.266720195603325, + "learning_rate": 1.884684274039894e-05, + "loss": 0.3867, + "step": 2269 + }, + { + "epoch": 0.17983759160229748, + "grad_norm": 2.630003140698048, + "learning_rate": 1.8845646189112327e-05, + "loss": 0.4206, + "step": 2270 + }, + { + "epoch": 0.17991681521093286, + "grad_norm": 2.19533126041278, + "learning_rate": 1.8844449055382473e-05, + "loss": 0.3231, + "step": 2271 + }, + { + "epoch": 0.17999603881956824, + "grad_norm": 2.291971238372196, + "learning_rate": 1.8843251339288207e-05, + "loss": 0.2977, + "step": 2272 + }, + { + "epoch": 0.1800752624282036, + "grad_norm": 2.139047899374045, + "learning_rate": 1.884205304090839e-05, + "loss": 0.3601, + "step": 2273 + }, + { + "epoch": 0.18015448603683898, + "grad_norm": 2.744523817004491, + "learning_rate": 1.8840854160321926e-05, + "loss": 0.2631, + "step": 2274 + }, + { + "epoch": 0.18023370964547436, + "grad_norm": 2.7548359030785297, + "learning_rate": 1.8839654697607756e-05, + "loss": 0.3208, + "step": 2275 + }, + { + "epoch": 0.18031293325410971, + "grad_norm": 2.4774999927677137, + "learning_rate": 1.8838454652844857e-05, + "loss": 0.3377, + "step": 2276 + }, + { + "epoch": 0.1803921568627451, + "grad_norm": 1.9707352356292724, + "learning_rate": 1.8837254026112245e-05, + "loss": 0.2485, + "step": 2277 + }, + { + "epoch": 0.18047138047138048, + "grad_norm": 1.9098460905264407, + "learning_rate": 1.883605281748898e-05, + "loss": 0.2836, + "step": 2278 + }, + { + "epoch": 0.18055060408001586, + "grad_norm": 1.8641038253620796, + "learning_rate": 1.8834851027054152e-05, + "loss": 0.2612, + "step": 2279 + }, + { + "epoch": 0.1806298276886512, + "grad_norm": 2.8081035775012375, + "learning_rate": 1.8833648654886898e-05, + "loss": 0.4917, + "step": 2280 + }, + { + "epoch": 0.1807090512972866, + "grad_norm": 2.041431150392968, + "learning_rate": 1.883244570106638e-05, + "loss": 0.2784, + "step": 2281 + }, + { + "epoch": 0.18078827490592198, + "grad_norm": 1.9540642497672804, + "learning_rate": 1.8831242165671816e-05, + "loss": 0.3058, + "step": 2282 + }, + { + "epoch": 0.18086749851455733, + "grad_norm": 2.2666690188234, + "learning_rate": 1.8830038048782445e-05, + "loss": 0.3771, + "step": 2283 + }, + { + "epoch": 0.1809467221231927, + "grad_norm": 2.447525472996404, + "learning_rate": 1.8828833350477556e-05, + "loss": 0.3348, + "step": 2284 + }, + { + "epoch": 0.1810259457318281, + "grad_norm": 3.0817296113041257, + "learning_rate": 1.8827628070836477e-05, + "loss": 0.5346, + "step": 2285 + }, + { + "epoch": 0.18110516934046345, + "grad_norm": 1.7488797341407631, + "learning_rate": 1.8826422209938563e-05, + "loss": 0.2251, + "step": 2286 + }, + { + "epoch": 0.18118439294909883, + "grad_norm": 2.312363645373812, + "learning_rate": 1.8825215767863215e-05, + "loss": 0.3802, + "step": 2287 + }, + { + "epoch": 0.1812636165577342, + "grad_norm": 2.6113495062677794, + "learning_rate": 1.8824008744689873e-05, + "loss": 0.3817, + "step": 2288 + }, + { + "epoch": 0.1813428401663696, + "grad_norm": 2.7492437202208304, + "learning_rate": 1.8822801140498014e-05, + "loss": 0.3853, + "step": 2289 + }, + { + "epoch": 0.18142206377500494, + "grad_norm": 3.72839857079176, + "learning_rate": 1.8821592955367154e-05, + "loss": 0.5297, + "step": 2290 + }, + { + "epoch": 0.18150128738364033, + "grad_norm": 2.5601664171180807, + "learning_rate": 1.8820384189376845e-05, + "loss": 0.4437, + "step": 2291 + }, + { + "epoch": 0.1815805109922757, + "grad_norm": 2.0999948908668307, + "learning_rate": 1.8819174842606675e-05, + "loss": 0.3295, + "step": 2292 + }, + { + "epoch": 0.18165973460091106, + "grad_norm": 3.4630688727512533, + "learning_rate": 1.8817964915136277e-05, + "loss": 0.2814, + "step": 2293 + }, + { + "epoch": 0.18173895820954644, + "grad_norm": 2.3417772119653333, + "learning_rate": 1.881675440704532e-05, + "loss": 0.3478, + "step": 2294 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.3543112301846043, + "learning_rate": 1.881554331841351e-05, + "loss": 0.3024, + "step": 2295 + }, + { + "epoch": 0.1818974054268172, + "grad_norm": 2.48172645963912, + "learning_rate": 1.881433164932059e-05, + "loss": 0.4151, + "step": 2296 + }, + { + "epoch": 0.18197662903545256, + "grad_norm": 2.4838331653823995, + "learning_rate": 1.881311939984634e-05, + "loss": 0.3585, + "step": 2297 + }, + { + "epoch": 0.18205585264408794, + "grad_norm": 2.5104491727078537, + "learning_rate": 1.8811906570070583e-05, + "loss": 0.3031, + "step": 2298 + }, + { + "epoch": 0.18213507625272332, + "grad_norm": 2.582212854514169, + "learning_rate": 1.8810693160073184e-05, + "loss": 0.3555, + "step": 2299 + }, + { + "epoch": 0.18221429986135868, + "grad_norm": 2.260466920406567, + "learning_rate": 1.880947916993403e-05, + "loss": 0.3147, + "step": 2300 + }, + { + "epoch": 0.18229352346999406, + "grad_norm": 2.108711585961772, + "learning_rate": 1.8808264599733065e-05, + "loss": 0.2336, + "step": 2301 + }, + { + "epoch": 0.18237274707862944, + "grad_norm": 1.8129311270852229, + "learning_rate": 1.8807049449550254e-05, + "loss": 0.1859, + "step": 2302 + }, + { + "epoch": 0.1824519706872648, + "grad_norm": 2.8514702441521216, + "learning_rate": 1.8805833719465617e-05, + "loss": 0.4239, + "step": 2303 + }, + { + "epoch": 0.18253119429590017, + "grad_norm": 2.6258942978508775, + "learning_rate": 1.88046174095592e-05, + "loss": 0.4009, + "step": 2304 + }, + { + "epoch": 0.18261041790453555, + "grad_norm": 2.376692009004102, + "learning_rate": 1.880340051991109e-05, + "loss": 0.2973, + "step": 2305 + }, + { + "epoch": 0.18268964151317094, + "grad_norm": 2.81701758688857, + "learning_rate": 1.8802183050601417e-05, + "loss": 0.3888, + "step": 2306 + }, + { + "epoch": 0.1827688651218063, + "grad_norm": 2.6988120734982197, + "learning_rate": 1.8800965001710342e-05, + "loss": 0.4857, + "step": 2307 + }, + { + "epoch": 0.18284808873044167, + "grad_norm": 2.434844528570647, + "learning_rate": 1.879974637331807e-05, + "loss": 0.402, + "step": 2308 + }, + { + "epoch": 0.18292731233907705, + "grad_norm": 3.2364920599774787, + "learning_rate": 1.879852716550484e-05, + "loss": 0.439, + "step": 2309 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 2.2624829721422395, + "learning_rate": 1.8797307378350935e-05, + "loss": 0.396, + "step": 2310 + }, + { + "epoch": 0.1830857595563478, + "grad_norm": 2.418464608583417, + "learning_rate": 1.8796087011936665e-05, + "loss": 0.4137, + "step": 2311 + }, + { + "epoch": 0.18316498316498317, + "grad_norm": 2.4378665971974747, + "learning_rate": 1.8794866066342394e-05, + "loss": 0.3857, + "step": 2312 + }, + { + "epoch": 0.18324420677361855, + "grad_norm": 2.1559454965374893, + "learning_rate": 1.879364454164851e-05, + "loss": 0.2897, + "step": 2313 + }, + { + "epoch": 0.1833234303822539, + "grad_norm": 2.7326341608881095, + "learning_rate": 1.879242243793544e-05, + "loss": 0.3999, + "step": 2314 + }, + { + "epoch": 0.18340265399088929, + "grad_norm": 2.5845062026744614, + "learning_rate": 1.8791199755283664e-05, + "loss": 0.3615, + "step": 2315 + }, + { + "epoch": 0.18348187759952467, + "grad_norm": 2.383931557532952, + "learning_rate": 1.878997649377368e-05, + "loss": 0.3871, + "step": 2316 + }, + { + "epoch": 0.18356110120816002, + "grad_norm": 2.510233308100274, + "learning_rate": 1.8788752653486045e-05, + "loss": 0.4941, + "step": 2317 + }, + { + "epoch": 0.1836403248167954, + "grad_norm": 3.645298019919298, + "learning_rate": 1.878752823450133e-05, + "loss": 0.4638, + "step": 2318 + }, + { + "epoch": 0.18371954842543078, + "grad_norm": 2.5640338416982487, + "learning_rate": 1.878630323690017e-05, + "loss": 0.2658, + "step": 2319 + }, + { + "epoch": 0.18379877203406614, + "grad_norm": 2.277676082097039, + "learning_rate": 1.8785077660763217e-05, + "loss": 0.2998, + "step": 2320 + }, + { + "epoch": 0.18387799564270152, + "grad_norm": 2.793564639000829, + "learning_rate": 1.8783851506171166e-05, + "loss": 0.3348, + "step": 2321 + }, + { + "epoch": 0.1839572192513369, + "grad_norm": 2.4186368481788665, + "learning_rate": 1.8782624773204764e-05, + "loss": 0.3054, + "step": 2322 + }, + { + "epoch": 0.18403644285997228, + "grad_norm": 2.5380074555152135, + "learning_rate": 1.8781397461944777e-05, + "loss": 0.3612, + "step": 2323 + }, + { + "epoch": 0.18411566646860764, + "grad_norm": 2.2511183841820523, + "learning_rate": 1.8780169572472024e-05, + "loss": 0.3667, + "step": 2324 + }, + { + "epoch": 0.18419489007724302, + "grad_norm": 2.089634176256761, + "learning_rate": 1.8778941104867347e-05, + "loss": 0.2162, + "step": 2325 + }, + { + "epoch": 0.1842741136858784, + "grad_norm": 2.334529141607381, + "learning_rate": 1.8777712059211643e-05, + "loss": 0.4318, + "step": 2326 + }, + { + "epoch": 0.18435333729451375, + "grad_norm": 2.462494951280783, + "learning_rate": 1.8776482435585836e-05, + "loss": 0.3748, + "step": 2327 + }, + { + "epoch": 0.18443256090314913, + "grad_norm": 4.004588331062914, + "learning_rate": 1.877525223407089e-05, + "loss": 0.4338, + "step": 2328 + }, + { + "epoch": 0.18451178451178452, + "grad_norm": 2.3269944007926417, + "learning_rate": 1.877402145474781e-05, + "loss": 0.3882, + "step": 2329 + }, + { + "epoch": 0.1845910081204199, + "grad_norm": 2.759584402642482, + "learning_rate": 1.877279009769763e-05, + "loss": 0.5195, + "step": 2330 + }, + { + "epoch": 0.18467023172905525, + "grad_norm": 2.5557437988805485, + "learning_rate": 1.8771558163001438e-05, + "loss": 0.4687, + "step": 2331 + }, + { + "epoch": 0.18474945533769063, + "grad_norm": 2.0378673196831887, + "learning_rate": 1.8770325650740347e-05, + "loss": 0.3179, + "step": 2332 + }, + { + "epoch": 0.184828678946326, + "grad_norm": 2.386000362403704, + "learning_rate": 1.876909256099551e-05, + "loss": 0.3602, + "step": 2333 + }, + { + "epoch": 0.18490790255496137, + "grad_norm": 3.0234490585000353, + "learning_rate": 1.876785889384812e-05, + "loss": 0.4049, + "step": 2334 + }, + { + "epoch": 0.18498712616359675, + "grad_norm": 3.097328032456957, + "learning_rate": 1.8766624649379415e-05, + "loss": 0.4067, + "step": 2335 + }, + { + "epoch": 0.18506634977223213, + "grad_norm": 2.859826786833267, + "learning_rate": 1.8765389827670657e-05, + "loss": 0.4865, + "step": 2336 + }, + { + "epoch": 0.1851455733808675, + "grad_norm": 2.1015460161226436, + "learning_rate": 1.8764154428803155e-05, + "loss": 0.2837, + "step": 2337 + }, + { + "epoch": 0.18522479698950287, + "grad_norm": 2.2995081930660937, + "learning_rate": 1.8762918452858256e-05, + "loss": 0.2956, + "step": 2338 + }, + { + "epoch": 0.18530402059813825, + "grad_norm": 2.35434895152673, + "learning_rate": 1.876168189991734e-05, + "loss": 0.2762, + "step": 2339 + }, + { + "epoch": 0.18538324420677363, + "grad_norm": 2.451243285460605, + "learning_rate": 1.876044477006183e-05, + "loss": 0.2849, + "step": 2340 + }, + { + "epoch": 0.18546246781540898, + "grad_norm": 2.23536515794613, + "learning_rate": 1.8759207063373183e-05, + "loss": 0.3969, + "step": 2341 + }, + { + "epoch": 0.18554169142404436, + "grad_norm": 2.566599801976133, + "learning_rate": 1.87579687799329e-05, + "loss": 0.3268, + "step": 2342 + }, + { + "epoch": 0.18562091503267975, + "grad_norm": 2.4754747113106124, + "learning_rate": 1.875672991982251e-05, + "loss": 0.3663, + "step": 2343 + }, + { + "epoch": 0.1857001386413151, + "grad_norm": 2.74166198690715, + "learning_rate": 1.875549048312359e-05, + "loss": 0.3339, + "step": 2344 + }, + { + "epoch": 0.18577936224995048, + "grad_norm": 2.975816090283354, + "learning_rate": 1.8754250469917753e-05, + "loss": 0.5422, + "step": 2345 + }, + { + "epoch": 0.18585858585858586, + "grad_norm": 2.6011827108643235, + "learning_rate": 1.8753009880286647e-05, + "loss": 0.4457, + "step": 2346 + }, + { + "epoch": 0.18593780946722124, + "grad_norm": 2.5964538856244768, + "learning_rate": 1.8751768714311952e-05, + "loss": 0.3786, + "step": 2347 + }, + { + "epoch": 0.1860170330758566, + "grad_norm": 2.3410431498333955, + "learning_rate": 1.87505269720754e-05, + "loss": 0.4045, + "step": 2348 + }, + { + "epoch": 0.18609625668449198, + "grad_norm": 2.4402011232336496, + "learning_rate": 1.8749284653658754e-05, + "loss": 0.3779, + "step": 2349 + }, + { + "epoch": 0.18617548029312736, + "grad_norm": 2.366642652619114, + "learning_rate": 1.874804175914381e-05, + "loss": 0.3724, + "step": 2350 + }, + { + "epoch": 0.1862547039017627, + "grad_norm": 1.9239428344717986, + "learning_rate": 1.8746798288612405e-05, + "loss": 0.2864, + "step": 2351 + }, + { + "epoch": 0.1863339275103981, + "grad_norm": 2.362876582429057, + "learning_rate": 1.8745554242146428e-05, + "loss": 0.358, + "step": 2352 + }, + { + "epoch": 0.18641315111903348, + "grad_norm": 2.356237795697956, + "learning_rate": 1.874430961982778e-05, + "loss": 0.3782, + "step": 2353 + }, + { + "epoch": 0.18649237472766886, + "grad_norm": 2.1905368125781353, + "learning_rate": 1.874306442173842e-05, + "loss": 0.3575, + "step": 2354 + }, + { + "epoch": 0.1865715983363042, + "grad_norm": 2.191471676289847, + "learning_rate": 1.8741818647960337e-05, + "loss": 0.3142, + "step": 2355 + }, + { + "epoch": 0.1866508219449396, + "grad_norm": 2.525277479361757, + "learning_rate": 1.8740572298575558e-05, + "loss": 0.3111, + "step": 2356 + }, + { + "epoch": 0.18673004555357497, + "grad_norm": 2.262008817578584, + "learning_rate": 1.8739325373666152e-05, + "loss": 0.3561, + "step": 2357 + }, + { + "epoch": 0.18680926916221033, + "grad_norm": 2.1024282379196855, + "learning_rate": 1.8738077873314218e-05, + "loss": 0.3291, + "step": 2358 + }, + { + "epoch": 0.1868884927708457, + "grad_norm": 2.4822728521593334, + "learning_rate": 1.8736829797601903e-05, + "loss": 0.4646, + "step": 2359 + }, + { + "epoch": 0.1869677163794811, + "grad_norm": 3.07982275805283, + "learning_rate": 1.8735581146611387e-05, + "loss": 0.4493, + "step": 2360 + }, + { + "epoch": 0.18704693998811645, + "grad_norm": 2.775561776895125, + "learning_rate": 1.873433192042488e-05, + "loss": 0.3567, + "step": 2361 + }, + { + "epoch": 0.18712616359675183, + "grad_norm": 2.0416057745576763, + "learning_rate": 1.8733082119124646e-05, + "loss": 0.336, + "step": 2362 + }, + { + "epoch": 0.1872053872053872, + "grad_norm": 2.6729126196305213, + "learning_rate": 1.8731831742792974e-05, + "loss": 0.4414, + "step": 2363 + }, + { + "epoch": 0.1872846108140226, + "grad_norm": 2.3773759184808925, + "learning_rate": 1.87305807915122e-05, + "loss": 0.375, + "step": 2364 + }, + { + "epoch": 0.18736383442265794, + "grad_norm": 2.4057512840315294, + "learning_rate": 1.8729329265364685e-05, + "loss": 0.3645, + "step": 2365 + }, + { + "epoch": 0.18744305803129332, + "grad_norm": 2.7158335366717257, + "learning_rate": 1.8728077164432844e-05, + "loss": 0.4029, + "step": 2366 + }, + { + "epoch": 0.1875222816399287, + "grad_norm": 2.5252002446889295, + "learning_rate": 1.872682448879912e-05, + "loss": 0.3037, + "step": 2367 + }, + { + "epoch": 0.18760150524856406, + "grad_norm": 2.491869821849235, + "learning_rate": 1.8725571238545992e-05, + "loss": 0.3009, + "step": 2368 + }, + { + "epoch": 0.18768072885719944, + "grad_norm": 2.1272868834953473, + "learning_rate": 1.872431741375598e-05, + "loss": 0.343, + "step": 2369 + }, + { + "epoch": 0.18775995246583482, + "grad_norm": 1.8435376199308053, + "learning_rate": 1.872306301451165e-05, + "loss": 0.2152, + "step": 2370 + }, + { + "epoch": 0.1878391760744702, + "grad_norm": 2.0131326363249626, + "learning_rate": 1.872180804089559e-05, + "loss": 0.2593, + "step": 2371 + }, + { + "epoch": 0.18791839968310556, + "grad_norm": 2.3018050316226475, + "learning_rate": 1.8720552492990438e-05, + "loss": 0.3328, + "step": 2372 + }, + { + "epoch": 0.18799762329174094, + "grad_norm": 2.6609049398218088, + "learning_rate": 1.8719296370878866e-05, + "loss": 0.3521, + "step": 2373 + }, + { + "epoch": 0.18807684690037632, + "grad_norm": 2.6650809589334727, + "learning_rate": 1.871803967464358e-05, + "loss": 0.3447, + "step": 2374 + }, + { + "epoch": 0.18815607050901167, + "grad_norm": 2.451500628574491, + "learning_rate": 1.8716782404367333e-05, + "loss": 0.2894, + "step": 2375 + }, + { + "epoch": 0.18823529411764706, + "grad_norm": 1.9325107608589824, + "learning_rate": 1.8715524560132906e-05, + "loss": 0.3222, + "step": 2376 + }, + { + "epoch": 0.18831451772628244, + "grad_norm": 2.3753138122376685, + "learning_rate": 1.8714266142023124e-05, + "loss": 0.3854, + "step": 2377 + }, + { + "epoch": 0.18839374133491782, + "grad_norm": 2.2117819364013958, + "learning_rate": 1.8713007150120846e-05, + "loss": 0.32, + "step": 2378 + }, + { + "epoch": 0.18847296494355317, + "grad_norm": 2.4390039965109773, + "learning_rate": 1.871174758450897e-05, + "loss": 0.341, + "step": 2379 + }, + { + "epoch": 0.18855218855218855, + "grad_norm": 3.0573172772389596, + "learning_rate": 1.8710487445270436e-05, + "loss": 0.4679, + "step": 2380 + }, + { + "epoch": 0.18863141216082394, + "grad_norm": 2.3143663208533547, + "learning_rate": 1.8709226732488216e-05, + "loss": 0.3594, + "step": 2381 + }, + { + "epoch": 0.1887106357694593, + "grad_norm": 2.8772245854603917, + "learning_rate": 1.8707965446245317e-05, + "loss": 0.3446, + "step": 2382 + }, + { + "epoch": 0.18878985937809467, + "grad_norm": 4.06400445909464, + "learning_rate": 1.87067035866248e-05, + "loss": 0.2771, + "step": 2383 + }, + { + "epoch": 0.18886908298673005, + "grad_norm": 1.999736035035748, + "learning_rate": 1.8705441153709742e-05, + "loss": 0.307, + "step": 2384 + }, + { + "epoch": 0.1889483065953654, + "grad_norm": 2.3349303937758736, + "learning_rate": 1.8704178147583273e-05, + "loss": 0.3565, + "step": 2385 + }, + { + "epoch": 0.1890275302040008, + "grad_norm": 1.6776105169326794, + "learning_rate": 1.8702914568328555e-05, + "loss": 0.3258, + "step": 2386 + }, + { + "epoch": 0.18910675381263617, + "grad_norm": 2.101714509347008, + "learning_rate": 1.8701650416028788e-05, + "loss": 0.2515, + "step": 2387 + }, + { + "epoch": 0.18918597742127155, + "grad_norm": 2.29702783638012, + "learning_rate": 1.870038569076721e-05, + "loss": 0.3166, + "step": 2388 + }, + { + "epoch": 0.1892652010299069, + "grad_norm": 2.3366008944905476, + "learning_rate": 1.86991203926271e-05, + "loss": 0.2731, + "step": 2389 + }, + { + "epoch": 0.18934442463854229, + "grad_norm": 2.522877680230459, + "learning_rate": 1.8697854521691767e-05, + "loss": 0.3838, + "step": 2390 + }, + { + "epoch": 0.18942364824717767, + "grad_norm": 2.837417775984766, + "learning_rate": 1.8696588078044566e-05, + "loss": 0.4043, + "step": 2391 + }, + { + "epoch": 0.18950287185581302, + "grad_norm": 2.569208471978124, + "learning_rate": 1.8695321061768886e-05, + "loss": 0.4068, + "step": 2392 + }, + { + "epoch": 0.1895820954644484, + "grad_norm": 2.324114348310896, + "learning_rate": 1.8694053472948154e-05, + "loss": 0.3178, + "step": 2393 + }, + { + "epoch": 0.18966131907308378, + "grad_norm": 2.4716474656941796, + "learning_rate": 1.8692785311665835e-05, + "loss": 0.3222, + "step": 2394 + }, + { + "epoch": 0.18974054268171917, + "grad_norm": 2.2117536725780305, + "learning_rate": 1.8691516578005426e-05, + "loss": 0.3132, + "step": 2395 + }, + { + "epoch": 0.18981976629035452, + "grad_norm": 2.281402927838633, + "learning_rate": 1.8690247272050474e-05, + "loss": 0.2456, + "step": 2396 + }, + { + "epoch": 0.1898989898989899, + "grad_norm": 2.0716927024484857, + "learning_rate": 1.8688977393884555e-05, + "loss": 0.273, + "step": 2397 + }, + { + "epoch": 0.18997821350762528, + "grad_norm": 2.192567436763538, + "learning_rate": 1.868770694359128e-05, + "loss": 0.3838, + "step": 2398 + }, + { + "epoch": 0.19005743711626064, + "grad_norm": 2.3353910833870333, + "learning_rate": 1.868643592125431e-05, + "loss": 0.3555, + "step": 2399 + }, + { + "epoch": 0.19013666072489602, + "grad_norm": 2.9816592791128227, + "learning_rate": 1.8685164326957327e-05, + "loss": 0.5093, + "step": 2400 + }, + { + "epoch": 0.1902158843335314, + "grad_norm": 3.248589257719648, + "learning_rate": 1.8683892160784066e-05, + "loss": 0.3436, + "step": 2401 + }, + { + "epoch": 0.19029510794216675, + "grad_norm": 2.3672979239434735, + "learning_rate": 1.868261942281829e-05, + "loss": 0.4247, + "step": 2402 + }, + { + "epoch": 0.19037433155080213, + "grad_norm": 2.449880558955819, + "learning_rate": 1.86813461131438e-05, + "loss": 0.4516, + "step": 2403 + }, + { + "epoch": 0.19045355515943752, + "grad_norm": 1.9781986952862538, + "learning_rate": 1.8680072231844445e-05, + "loss": 0.3328, + "step": 2404 + }, + { + "epoch": 0.1905327787680729, + "grad_norm": 1.682442082315602, + "learning_rate": 1.8678797779004096e-05, + "loss": 0.2546, + "step": 2405 + }, + { + "epoch": 0.19061200237670825, + "grad_norm": 2.358540070133724, + "learning_rate": 1.8677522754706677e-05, + "loss": 0.4219, + "step": 2406 + }, + { + "epoch": 0.19069122598534363, + "grad_norm": 2.279122575981621, + "learning_rate": 1.8676247159036132e-05, + "loss": 0.3462, + "step": 2407 + }, + { + "epoch": 0.190770449593979, + "grad_norm": 2.46748142707905, + "learning_rate": 1.8674970992076465e-05, + "loss": 0.3002, + "step": 2408 + }, + { + "epoch": 0.19084967320261437, + "grad_norm": 2.7487199920077523, + "learning_rate": 1.8673694253911696e-05, + "loss": 0.5119, + "step": 2409 + }, + { + "epoch": 0.19092889681124975, + "grad_norm": 2.459701113653219, + "learning_rate": 1.8672416944625896e-05, + "loss": 0.4144, + "step": 2410 + }, + { + "epoch": 0.19100812041988513, + "grad_norm": 2.2355157013713747, + "learning_rate": 1.867113906430317e-05, + "loss": 0.3779, + "step": 2411 + }, + { + "epoch": 0.1910873440285205, + "grad_norm": 2.008764280295245, + "learning_rate": 1.8669860613027657e-05, + "loss": 0.2847, + "step": 2412 + }, + { + "epoch": 0.19116656763715587, + "grad_norm": 2.830324642302351, + "learning_rate": 1.8668581590883544e-05, + "loss": 0.5248, + "step": 2413 + }, + { + "epoch": 0.19124579124579125, + "grad_norm": 2.316528929771331, + "learning_rate": 1.8667301997955038e-05, + "loss": 0.4244, + "step": 2414 + }, + { + "epoch": 0.19132501485442663, + "grad_norm": 2.1455708077769295, + "learning_rate": 1.8666021834326404e-05, + "loss": 0.4063, + "step": 2415 + }, + { + "epoch": 0.19140423846306198, + "grad_norm": 2.578496252157606, + "learning_rate": 1.866474110008193e-05, + "loss": 0.4278, + "step": 2416 + }, + { + "epoch": 0.19148346207169736, + "grad_norm": 2.2288754110195566, + "learning_rate": 1.8663459795305946e-05, + "loss": 0.4112, + "step": 2417 + }, + { + "epoch": 0.19156268568033274, + "grad_norm": 2.277262606405243, + "learning_rate": 1.866217792008282e-05, + "loss": 0.3687, + "step": 2418 + }, + { + "epoch": 0.1916419092889681, + "grad_norm": 1.8630297645419311, + "learning_rate": 1.866089547449696e-05, + "loss": 0.2948, + "step": 2419 + }, + { + "epoch": 0.19172113289760348, + "grad_norm": 2.5857719677134807, + "learning_rate": 1.8659612458632802e-05, + "loss": 0.3685, + "step": 2420 + }, + { + "epoch": 0.19180035650623886, + "grad_norm": 2.1077816478372227, + "learning_rate": 1.8658328872574833e-05, + "loss": 0.3134, + "step": 2421 + }, + { + "epoch": 0.19187958011487424, + "grad_norm": 2.2558696868498163, + "learning_rate": 1.8657044716407573e-05, + "loss": 0.3903, + "step": 2422 + }, + { + "epoch": 0.1919588037235096, + "grad_norm": 2.5260330779322144, + "learning_rate": 1.865575999021557e-05, + "loss": 0.4757, + "step": 2423 + }, + { + "epoch": 0.19203802733214498, + "grad_norm": 2.585946758475045, + "learning_rate": 1.8654474694083416e-05, + "loss": 0.4314, + "step": 2424 + }, + { + "epoch": 0.19211725094078036, + "grad_norm": 2.5235702116602683, + "learning_rate": 1.8653188828095754e-05, + "loss": 0.3479, + "step": 2425 + }, + { + "epoch": 0.1921964745494157, + "grad_norm": 2.338783130966399, + "learning_rate": 1.865190239233724e-05, + "loss": 0.3642, + "step": 2426 + }, + { + "epoch": 0.1922756981580511, + "grad_norm": 2.2613101907663795, + "learning_rate": 1.8650615386892587e-05, + "loss": 0.3137, + "step": 2427 + }, + { + "epoch": 0.19235492176668648, + "grad_norm": 2.481183082822829, + "learning_rate": 1.8649327811846533e-05, + "loss": 0.4983, + "step": 2428 + }, + { + "epoch": 0.19243414537532186, + "grad_norm": 2.056116658196089, + "learning_rate": 1.8648039667283857e-05, + "loss": 0.395, + "step": 2429 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 2.187373035714514, + "learning_rate": 1.8646750953289384e-05, + "loss": 0.3235, + "step": 2430 + }, + { + "epoch": 0.1925925925925926, + "grad_norm": 1.953239087467633, + "learning_rate": 1.8645461669947966e-05, + "loss": 0.27, + "step": 2431 + }, + { + "epoch": 0.19267181620122797, + "grad_norm": 2.2855907480080697, + "learning_rate": 1.8644171817344497e-05, + "loss": 0.3153, + "step": 2432 + }, + { + "epoch": 0.19275103980986333, + "grad_norm": 2.5142407429696254, + "learning_rate": 1.8642881395563904e-05, + "loss": 0.2761, + "step": 2433 + }, + { + "epoch": 0.1928302634184987, + "grad_norm": 2.232408290132552, + "learning_rate": 1.864159040469116e-05, + "loss": 0.4019, + "step": 2434 + }, + { + "epoch": 0.1929094870271341, + "grad_norm": 2.6768788194814483, + "learning_rate": 1.864029884481127e-05, + "loss": 0.3774, + "step": 2435 + }, + { + "epoch": 0.19298871063576947, + "grad_norm": 2.4057338437133695, + "learning_rate": 1.8639006716009275e-05, + "loss": 0.4702, + "step": 2436 + }, + { + "epoch": 0.19306793424440483, + "grad_norm": 2.406634426736634, + "learning_rate": 1.8637714018370255e-05, + "loss": 0.3872, + "step": 2437 + }, + { + "epoch": 0.1931471578530402, + "grad_norm": 2.1804927207402356, + "learning_rate": 1.8636420751979328e-05, + "loss": 0.4127, + "step": 2438 + }, + { + "epoch": 0.1932263814616756, + "grad_norm": 2.2325083599641893, + "learning_rate": 1.863512691692165e-05, + "loss": 0.4142, + "step": 2439 + }, + { + "epoch": 0.19330560507031094, + "grad_norm": 2.360300792783522, + "learning_rate": 1.863383251328242e-05, + "loss": 0.3718, + "step": 2440 + }, + { + "epoch": 0.19338482867894632, + "grad_norm": 1.9676691602914926, + "learning_rate": 1.8632537541146856e-05, + "loss": 0.2545, + "step": 2441 + }, + { + "epoch": 0.1934640522875817, + "grad_norm": 2.544070344504953, + "learning_rate": 1.8631242000600235e-05, + "loss": 0.421, + "step": 2442 + }, + { + "epoch": 0.19354327589621706, + "grad_norm": 2.317720778716317, + "learning_rate": 1.8629945891727856e-05, + "loss": 0.4134, + "step": 2443 + }, + { + "epoch": 0.19362249950485244, + "grad_norm": 2.1669465368125818, + "learning_rate": 1.8628649214615066e-05, + "loss": 0.3168, + "step": 2444 + }, + { + "epoch": 0.19370172311348782, + "grad_norm": 2.0639083871105828, + "learning_rate": 1.8627351969347246e-05, + "loss": 0.3443, + "step": 2445 + }, + { + "epoch": 0.1937809467221232, + "grad_norm": 2.2376669330107153, + "learning_rate": 1.8626054156009807e-05, + "loss": 0.3529, + "step": 2446 + }, + { + "epoch": 0.19386017033075856, + "grad_norm": 2.550901319390686, + "learning_rate": 1.862475577468821e-05, + "loss": 0.3534, + "step": 2447 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 2.3583170168567658, + "learning_rate": 1.8623456825467948e-05, + "loss": 0.3823, + "step": 2448 + }, + { + "epoch": 0.19401861754802932, + "grad_norm": 2.2929430344925135, + "learning_rate": 1.8622157308434544e-05, + "loss": 0.4592, + "step": 2449 + }, + { + "epoch": 0.19409784115666467, + "grad_norm": 2.6646782989089504, + "learning_rate": 1.8620857223673567e-05, + "loss": 0.4302, + "step": 2450 + }, + { + "epoch": 0.19417706476530006, + "grad_norm": 1.81841435235266, + "learning_rate": 1.8619556571270624e-05, + "loss": 0.2961, + "step": 2451 + }, + { + "epoch": 0.19425628837393544, + "grad_norm": 2.1546680016047928, + "learning_rate": 1.8618255351311355e-05, + "loss": 0.3418, + "step": 2452 + }, + { + "epoch": 0.19433551198257082, + "grad_norm": 2.150348235408283, + "learning_rate": 1.8616953563881444e-05, + "loss": 0.352, + "step": 2453 + }, + { + "epoch": 0.19441473559120617, + "grad_norm": 2.0046820761363846, + "learning_rate": 1.8615651209066598e-05, + "loss": 0.3235, + "step": 2454 + }, + { + "epoch": 0.19449395919984155, + "grad_norm": 2.5348173653153623, + "learning_rate": 1.8614348286952577e-05, + "loss": 0.3452, + "step": 2455 + }, + { + "epoch": 0.19457318280847694, + "grad_norm": 2.3274243520308007, + "learning_rate": 1.8613044797625173e-05, + "loss": 0.3892, + "step": 2456 + }, + { + "epoch": 0.1946524064171123, + "grad_norm": 2.0989791578857155, + "learning_rate": 1.861174074117021e-05, + "loss": 0.3405, + "step": 2457 + }, + { + "epoch": 0.19473163002574767, + "grad_norm": 2.1919308956371184, + "learning_rate": 1.8610436117673557e-05, + "loss": 0.3096, + "step": 2458 + }, + { + "epoch": 0.19481085363438305, + "grad_norm": 2.2882205739796944, + "learning_rate": 1.8609130927221116e-05, + "loss": 0.405, + "step": 2459 + }, + { + "epoch": 0.1948900772430184, + "grad_norm": 2.096111072736974, + "learning_rate": 1.8607825169898827e-05, + "loss": 0.4088, + "step": 2460 + }, + { + "epoch": 0.1949693008516538, + "grad_norm": 2.5457410371902127, + "learning_rate": 1.8606518845792672e-05, + "loss": 0.5546, + "step": 2461 + }, + { + "epoch": 0.19504852446028917, + "grad_norm": 1.848592772388562, + "learning_rate": 1.860521195498866e-05, + "loss": 0.3491, + "step": 2462 + }, + { + "epoch": 0.19512774806892455, + "grad_norm": 2.0752605803301467, + "learning_rate": 1.8603904497572846e-05, + "loss": 0.3093, + "step": 2463 + }, + { + "epoch": 0.1952069716775599, + "grad_norm": 2.285386555839289, + "learning_rate": 1.8602596473631323e-05, + "loss": 0.4335, + "step": 2464 + }, + { + "epoch": 0.19528619528619529, + "grad_norm": 1.9504041251275086, + "learning_rate": 1.8601287883250215e-05, + "loss": 0.3306, + "step": 2465 + }, + { + "epoch": 0.19536541889483067, + "grad_norm": 2.814250993235811, + "learning_rate": 1.8599978726515685e-05, + "loss": 0.3126, + "step": 2466 + }, + { + "epoch": 0.19544464250346602, + "grad_norm": 2.699794693071824, + "learning_rate": 1.8598669003513934e-05, + "loss": 0.5214, + "step": 2467 + }, + { + "epoch": 0.1955238661121014, + "grad_norm": 2.138989129402344, + "learning_rate": 1.8597358714331207e-05, + "loss": 0.3157, + "step": 2468 + }, + { + "epoch": 0.19560308972073678, + "grad_norm": 2.269082248599281, + "learning_rate": 1.8596047859053776e-05, + "loss": 0.3847, + "step": 2469 + }, + { + "epoch": 0.19568231332937217, + "grad_norm": 2.1483245298195115, + "learning_rate": 1.8594736437767954e-05, + "loss": 0.4204, + "step": 2470 + }, + { + "epoch": 0.19576153693800752, + "grad_norm": 2.4622250635289364, + "learning_rate": 1.8593424450560094e-05, + "loss": 0.4287, + "step": 2471 + }, + { + "epoch": 0.1958407605466429, + "grad_norm": 2.2783626156378216, + "learning_rate": 1.8592111897516583e-05, + "loss": 0.414, + "step": 2472 + }, + { + "epoch": 0.19591998415527828, + "grad_norm": 2.813603039056177, + "learning_rate": 1.8590798778723843e-05, + "loss": 0.3991, + "step": 2473 + }, + { + "epoch": 0.19599920776391364, + "grad_norm": 2.266553035044402, + "learning_rate": 1.8589485094268344e-05, + "loss": 0.3105, + "step": 2474 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 1.8631401649992443, + "learning_rate": 1.858817084423658e-05, + "loss": 0.3321, + "step": 2475 + }, + { + "epoch": 0.1961576549811844, + "grad_norm": 2.6001028866504354, + "learning_rate": 1.8586856028715087e-05, + "loss": 0.4129, + "step": 2476 + }, + { + "epoch": 0.19623687858981978, + "grad_norm": 1.9225639528899896, + "learning_rate": 1.8585540647790445e-05, + "loss": 0.3477, + "step": 2477 + }, + { + "epoch": 0.19631610219845513, + "grad_norm": 1.9029896696961794, + "learning_rate": 1.858422470154926e-05, + "loss": 0.3582, + "step": 2478 + }, + { + "epoch": 0.19639532580709052, + "grad_norm": 2.371662228739464, + "learning_rate": 1.8582908190078184e-05, + "loss": 0.5111, + "step": 2479 + }, + { + "epoch": 0.1964745494157259, + "grad_norm": 2.3458971637684, + "learning_rate": 1.8581591113463903e-05, + "loss": 0.4875, + "step": 2480 + }, + { + "epoch": 0.19655377302436125, + "grad_norm": 2.220020587214232, + "learning_rate": 1.858027347179314e-05, + "loss": 0.358, + "step": 2481 + }, + { + "epoch": 0.19663299663299663, + "grad_norm": 2.016025618145945, + "learning_rate": 1.8578955265152652e-05, + "loss": 0.3057, + "step": 2482 + }, + { + "epoch": 0.196712220241632, + "grad_norm": 2.390973899894166, + "learning_rate": 1.857763649362924e-05, + "loss": 0.4794, + "step": 2483 + }, + { + "epoch": 0.19679144385026737, + "grad_norm": 2.0811221237668005, + "learning_rate": 1.857631715730974e-05, + "loss": 0.3887, + "step": 2484 + }, + { + "epoch": 0.19687066745890275, + "grad_norm": 2.3012484162313824, + "learning_rate": 1.857499725628102e-05, + "loss": 0.3006, + "step": 2485 + }, + { + "epoch": 0.19694989106753813, + "grad_norm": 2.4683935517071616, + "learning_rate": 1.8573676790629988e-05, + "loss": 0.2737, + "step": 2486 + }, + { + "epoch": 0.1970291146761735, + "grad_norm": 2.2284228307033946, + "learning_rate": 1.8572355760443597e-05, + "loss": 0.2367, + "step": 2487 + }, + { + "epoch": 0.19710833828480886, + "grad_norm": 2.216729456039309, + "learning_rate": 1.8571034165808826e-05, + "loss": 0.2884, + "step": 2488 + }, + { + "epoch": 0.19718756189344425, + "grad_norm": 2.2143382473000313, + "learning_rate": 1.85697120068127e-05, + "loss": 0.1766, + "step": 2489 + }, + { + "epoch": 0.19726678550207963, + "grad_norm": 2.732087908688094, + "learning_rate": 1.8568389283542263e-05, + "loss": 0.3801, + "step": 2490 + }, + { + "epoch": 0.19734600911071498, + "grad_norm": 3.081517991139325, + "learning_rate": 1.8567065996084628e-05, + "loss": 0.4109, + "step": 2491 + }, + { + "epoch": 0.19742523271935036, + "grad_norm": 2.181387457142246, + "learning_rate": 1.8565742144526917e-05, + "loss": 0.3455, + "step": 2492 + }, + { + "epoch": 0.19750445632798574, + "grad_norm": 2.1877078020664866, + "learning_rate": 1.85644177289563e-05, + "loss": 0.392, + "step": 2493 + }, + { + "epoch": 0.19758367993662113, + "grad_norm": 2.3123444763880387, + "learning_rate": 1.856309274945999e-05, + "loss": 0.3186, + "step": 2494 + }, + { + "epoch": 0.19766290354525648, + "grad_norm": 2.2855869416309016, + "learning_rate": 1.8561767206125223e-05, + "loss": 0.3503, + "step": 2495 + }, + { + "epoch": 0.19774212715389186, + "grad_norm": 2.6475461584697335, + "learning_rate": 1.856044109903928e-05, + "loss": 0.4552, + "step": 2496 + }, + { + "epoch": 0.19782135076252724, + "grad_norm": 2.315308822615627, + "learning_rate": 1.8559114428289482e-05, + "loss": 0.4657, + "step": 2497 + }, + { + "epoch": 0.1979005743711626, + "grad_norm": 2.082621432472806, + "learning_rate": 1.8557787193963184e-05, + "loss": 0.4042, + "step": 2498 + }, + { + "epoch": 0.19797979797979798, + "grad_norm": 2.2891809452758407, + "learning_rate": 1.8556459396147777e-05, + "loss": 0.4412, + "step": 2499 + }, + { + "epoch": 0.19805902158843336, + "grad_norm": 2.8126784423414466, + "learning_rate": 1.8555131034930686e-05, + "loss": 0.4436, + "step": 2500 + }, + { + "epoch": 0.1981382451970687, + "grad_norm": 1.920461562424893, + "learning_rate": 1.8553802110399385e-05, + "loss": 0.3126, + "step": 2501 + }, + { + "epoch": 0.1982174688057041, + "grad_norm": 2.3405925530387157, + "learning_rate": 1.8552472622641372e-05, + "loss": 0.3209, + "step": 2502 + }, + { + "epoch": 0.19829669241433948, + "grad_norm": 2.6909419205423575, + "learning_rate": 1.8551142571744188e-05, + "loss": 0.5276, + "step": 2503 + }, + { + "epoch": 0.19837591602297486, + "grad_norm": 1.953917517484155, + "learning_rate": 1.854981195779541e-05, + "loss": 0.2665, + "step": 2504 + }, + { + "epoch": 0.1984551396316102, + "grad_norm": 2.21961047385004, + "learning_rate": 1.8548480780882658e-05, + "loss": 0.3823, + "step": 2505 + }, + { + "epoch": 0.1985343632402456, + "grad_norm": 2.5403097204187475, + "learning_rate": 1.8547149041093574e-05, + "loss": 0.3974, + "step": 2506 + }, + { + "epoch": 0.19861358684888097, + "grad_norm": 2.1253184318112375, + "learning_rate": 1.8545816738515855e-05, + "loss": 0.3815, + "step": 2507 + }, + { + "epoch": 0.19869281045751633, + "grad_norm": 1.8509488428343006, + "learning_rate": 1.854448387323722e-05, + "loss": 0.3224, + "step": 2508 + }, + { + "epoch": 0.1987720340661517, + "grad_norm": 2.0945640602614763, + "learning_rate": 1.8543150445345443e-05, + "loss": 0.3317, + "step": 2509 + }, + { + "epoch": 0.1988512576747871, + "grad_norm": 1.9832166063174255, + "learning_rate": 1.854181645492831e-05, + "loss": 0.2794, + "step": 2510 + }, + { + "epoch": 0.19893048128342247, + "grad_norm": 2.489453753213978, + "learning_rate": 1.8540481902073664e-05, + "loss": 0.3726, + "step": 2511 + }, + { + "epoch": 0.19900970489205783, + "grad_norm": 2.3006469390899102, + "learning_rate": 1.8539146786869385e-05, + "loss": 0.3611, + "step": 2512 + }, + { + "epoch": 0.1990889285006932, + "grad_norm": 2.358837949652743, + "learning_rate": 1.8537811109403372e-05, + "loss": 0.2882, + "step": 2513 + }, + { + "epoch": 0.1991681521093286, + "grad_norm": 2.8427295733863596, + "learning_rate": 1.853647486976358e-05, + "loss": 0.4833, + "step": 2514 + }, + { + "epoch": 0.19924737571796394, + "grad_norm": 2.0815328712729495, + "learning_rate": 1.8535138068037995e-05, + "loss": 0.3174, + "step": 2515 + }, + { + "epoch": 0.19932659932659932, + "grad_norm": 2.491501696031602, + "learning_rate": 1.8533800704314633e-05, + "loss": 0.471, + "step": 2516 + }, + { + "epoch": 0.1994058229352347, + "grad_norm": 2.1810681624932373, + "learning_rate": 1.8532462778681558e-05, + "loss": 0.3122, + "step": 2517 + }, + { + "epoch": 0.1994850465438701, + "grad_norm": 2.0987487623904424, + "learning_rate": 1.8531124291226866e-05, + "loss": 0.3213, + "step": 2518 + }, + { + "epoch": 0.19956427015250544, + "grad_norm": 2.5940693606713463, + "learning_rate": 1.8529785242038688e-05, + "loss": 0.3382, + "step": 2519 + }, + { + "epoch": 0.19964349376114082, + "grad_norm": 1.9084487218273467, + "learning_rate": 1.8528445631205195e-05, + "loss": 0.3098, + "step": 2520 + }, + { + "epoch": 0.1997227173697762, + "grad_norm": 2.22697748874517, + "learning_rate": 1.852710545881459e-05, + "loss": 0.4005, + "step": 2521 + }, + { + "epoch": 0.19980194097841156, + "grad_norm": 1.94871077943574, + "learning_rate": 1.8525764724955123e-05, + "loss": 0.3308, + "step": 2522 + }, + { + "epoch": 0.19988116458704694, + "grad_norm": 2.1648646873883606, + "learning_rate": 1.8524423429715072e-05, + "loss": 0.2976, + "step": 2523 + }, + { + "epoch": 0.19996038819568232, + "grad_norm": 2.289324311627118, + "learning_rate": 1.8523081573182754e-05, + "loss": 0.3277, + "step": 2524 + }, + { + "epoch": 0.20003961180431767, + "grad_norm": 2.6359868580396957, + "learning_rate": 1.8521739155446527e-05, + "loss": 0.4441, + "step": 2525 + }, + { + "epoch": 0.20011883541295306, + "grad_norm": 2.0463999849180667, + "learning_rate": 1.852039617659478e-05, + "loss": 0.3373, + "step": 2526 + }, + { + "epoch": 0.20019805902158844, + "grad_norm": 2.233538096791076, + "learning_rate": 1.851905263671594e-05, + "loss": 0.3732, + "step": 2527 + }, + { + "epoch": 0.20027728263022382, + "grad_norm": 3.07992933976717, + "learning_rate": 1.8517708535898477e-05, + "loss": 0.3118, + "step": 2528 + }, + { + "epoch": 0.20035650623885917, + "grad_norm": 2.206639505911278, + "learning_rate": 1.851636387423089e-05, + "loss": 0.2737, + "step": 2529 + }, + { + "epoch": 0.20043572984749455, + "grad_norm": 2.088856315478698, + "learning_rate": 1.8515018651801723e-05, + "loss": 0.3001, + "step": 2530 + }, + { + "epoch": 0.20051495345612994, + "grad_norm": 2.173784435536571, + "learning_rate": 1.8513672868699547e-05, + "loss": 0.3735, + "step": 2531 + }, + { + "epoch": 0.2005941770647653, + "grad_norm": 2.0865718183931508, + "learning_rate": 1.851232652501298e-05, + "loss": 0.2852, + "step": 2532 + }, + { + "epoch": 0.20067340067340067, + "grad_norm": 2.4405662133864245, + "learning_rate": 1.851097962083067e-05, + "loss": 0.4349, + "step": 2533 + }, + { + "epoch": 0.20075262428203605, + "grad_norm": 1.8240503780746695, + "learning_rate": 1.85096321562413e-05, + "loss": 0.2592, + "step": 2534 + }, + { + "epoch": 0.20083184789067143, + "grad_norm": 2.0317250338114023, + "learning_rate": 1.8508284131333604e-05, + "loss": 0.3196, + "step": 2535 + }, + { + "epoch": 0.2009110714993068, + "grad_norm": 2.6578832137432653, + "learning_rate": 1.850693554619633e-05, + "loss": 0.4377, + "step": 2536 + }, + { + "epoch": 0.20099029510794217, + "grad_norm": 2.2538147570599394, + "learning_rate": 1.8505586400918288e-05, + "loss": 0.3298, + "step": 2537 + }, + { + "epoch": 0.20106951871657755, + "grad_norm": 2.7358544242777882, + "learning_rate": 1.8504236695588308e-05, + "loss": 0.3239, + "step": 2538 + }, + { + "epoch": 0.2011487423252129, + "grad_norm": 2.7866894854942323, + "learning_rate": 1.8502886430295262e-05, + "loss": 0.4308, + "step": 2539 + }, + { + "epoch": 0.20122796593384829, + "grad_norm": 2.403766386329718, + "learning_rate": 1.8501535605128054e-05, + "loss": 0.3067, + "step": 2540 + }, + { + "epoch": 0.20130718954248367, + "grad_norm": 2.172612609192193, + "learning_rate": 1.8500184220175636e-05, + "loss": 0.4173, + "step": 2541 + }, + { + "epoch": 0.20138641315111902, + "grad_norm": 2.183681647657208, + "learning_rate": 1.8498832275526988e-05, + "loss": 0.4232, + "step": 2542 + }, + { + "epoch": 0.2014656367597544, + "grad_norm": 2.1545552923233307, + "learning_rate": 1.8497479771271125e-05, + "loss": 0.3003, + "step": 2543 + }, + { + "epoch": 0.20154486036838978, + "grad_norm": 2.7155129320167153, + "learning_rate": 1.8496126707497112e-05, + "loss": 0.3735, + "step": 2544 + }, + { + "epoch": 0.20162408397702516, + "grad_norm": 2.446669786535323, + "learning_rate": 1.849477308429403e-05, + "loss": 0.3324, + "step": 2545 + }, + { + "epoch": 0.20170330758566052, + "grad_norm": 2.987567324241434, + "learning_rate": 1.8493418901751016e-05, + "loss": 0.4053, + "step": 2546 + }, + { + "epoch": 0.2017825311942959, + "grad_norm": 2.0451748233881775, + "learning_rate": 1.849206415995724e-05, + "loss": 0.3628, + "step": 2547 + }, + { + "epoch": 0.20186175480293128, + "grad_norm": 2.366937693151244, + "learning_rate": 1.8490708859001896e-05, + "loss": 0.3862, + "step": 2548 + }, + { + "epoch": 0.20194097841156664, + "grad_norm": 2.285537809785544, + "learning_rate": 1.8489352998974227e-05, + "loss": 0.4229, + "step": 2549 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 2.14664097818107, + "learning_rate": 1.8487996579963515e-05, + "loss": 0.3176, + "step": 2550 + }, + { + "epoch": 0.2020994256288374, + "grad_norm": 2.6709319714445683, + "learning_rate": 1.8486639602059066e-05, + "loss": 0.3936, + "step": 2551 + }, + { + "epoch": 0.20217864923747278, + "grad_norm": 2.2299067893671043, + "learning_rate": 1.8485282065350237e-05, + "loss": 0.3677, + "step": 2552 + }, + { + "epoch": 0.20225787284610813, + "grad_norm": 1.8376533793627137, + "learning_rate": 1.848392396992641e-05, + "loss": 0.2929, + "step": 2553 + }, + { + "epoch": 0.20233709645474351, + "grad_norm": 2.326285525705798, + "learning_rate": 1.8482565315877013e-05, + "loss": 0.4047, + "step": 2554 + }, + { + "epoch": 0.2024163200633789, + "grad_norm": 2.2071973868350843, + "learning_rate": 1.8481206103291506e-05, + "loss": 0.3289, + "step": 2555 + }, + { + "epoch": 0.20249554367201425, + "grad_norm": 2.253218749715048, + "learning_rate": 1.8479846332259388e-05, + "loss": 0.3552, + "step": 2556 + }, + { + "epoch": 0.20257476728064963, + "grad_norm": 2.4826969943083292, + "learning_rate": 1.847848600287019e-05, + "loss": 0.4362, + "step": 2557 + }, + { + "epoch": 0.202653990889285, + "grad_norm": 2.4615813938979074, + "learning_rate": 1.8477125115213484e-05, + "loss": 0.4523, + "step": 2558 + }, + { + "epoch": 0.20273321449792037, + "grad_norm": 2.589911331391582, + "learning_rate": 1.8475763669378878e-05, + "loss": 0.3923, + "step": 2559 + }, + { + "epoch": 0.20281243810655575, + "grad_norm": 2.1544316122631684, + "learning_rate": 1.8474401665456016e-05, + "loss": 0.3036, + "step": 2560 + }, + { + "epoch": 0.20289166171519113, + "grad_norm": 2.200846607013613, + "learning_rate": 1.8473039103534583e-05, + "loss": 0.4175, + "step": 2561 + }, + { + "epoch": 0.2029708853238265, + "grad_norm": 2.4356438861821474, + "learning_rate": 1.8471675983704295e-05, + "loss": 0.4491, + "step": 2562 + }, + { + "epoch": 0.20305010893246186, + "grad_norm": 2.29837279168617, + "learning_rate": 1.8470312306054903e-05, + "loss": 0.3452, + "step": 2563 + }, + { + "epoch": 0.20312933254109725, + "grad_norm": 2.0487267473067896, + "learning_rate": 1.8468948070676205e-05, + "loss": 0.3094, + "step": 2564 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 2.1381754994275597, + "learning_rate": 1.8467583277658026e-05, + "loss": 0.2855, + "step": 2565 + }, + { + "epoch": 0.20328777975836798, + "grad_norm": 1.8764463167573255, + "learning_rate": 1.8466217927090232e-05, + "loss": 0.2959, + "step": 2566 + }, + { + "epoch": 0.20336700336700336, + "grad_norm": 2.5526234060684847, + "learning_rate": 1.8464852019062726e-05, + "loss": 0.3627, + "step": 2567 + }, + { + "epoch": 0.20344622697563874, + "grad_norm": 2.1899386174730573, + "learning_rate": 1.846348555366544e-05, + "loss": 0.3836, + "step": 2568 + }, + { + "epoch": 0.20352545058427413, + "grad_norm": 2.5673985506546115, + "learning_rate": 1.8462118530988356e-05, + "loss": 0.4674, + "step": 2569 + }, + { + "epoch": 0.20360467419290948, + "grad_norm": 2.6110808937235697, + "learning_rate": 1.8460750951121487e-05, + "loss": 0.4123, + "step": 2570 + }, + { + "epoch": 0.20368389780154486, + "grad_norm": 1.9865053476855534, + "learning_rate": 1.8459382814154874e-05, + "loss": 0.3951, + "step": 2571 + }, + { + "epoch": 0.20376312141018024, + "grad_norm": 2.4539083543726097, + "learning_rate": 1.845801412017861e-05, + "loss": 0.3154, + "step": 2572 + }, + { + "epoch": 0.2038423450188156, + "grad_norm": 2.498386015589712, + "learning_rate": 1.845664486928281e-05, + "loss": 0.3749, + "step": 2573 + }, + { + "epoch": 0.20392156862745098, + "grad_norm": 2.1661368931050053, + "learning_rate": 1.8455275061557643e-05, + "loss": 0.2674, + "step": 2574 + }, + { + "epoch": 0.20400079223608636, + "grad_norm": 1.9942421387612144, + "learning_rate": 1.845390469709329e-05, + "loss": 0.3599, + "step": 2575 + }, + { + "epoch": 0.20408001584472174, + "grad_norm": 2.14985491413104, + "learning_rate": 1.8452533775979992e-05, + "loss": 0.3147, + "step": 2576 + }, + { + "epoch": 0.2041592394533571, + "grad_norm": 2.3317594207362267, + "learning_rate": 1.845116229830802e-05, + "loss": 0.3676, + "step": 2577 + }, + { + "epoch": 0.20423846306199248, + "grad_norm": 2.155006085273215, + "learning_rate": 1.8449790264167672e-05, + "loss": 0.2258, + "step": 2578 + }, + { + "epoch": 0.20431768667062786, + "grad_norm": 2.6071663515527987, + "learning_rate": 1.8448417673649292e-05, + "loss": 0.4777, + "step": 2579 + }, + { + "epoch": 0.2043969102792632, + "grad_norm": 2.239493857358544, + "learning_rate": 1.844704452684326e-05, + "loss": 0.3044, + "step": 2580 + }, + { + "epoch": 0.2044761338878986, + "grad_norm": 2.226009112781301, + "learning_rate": 1.844567082383999e-05, + "loss": 0.358, + "step": 2581 + }, + { + "epoch": 0.20455535749653397, + "grad_norm": 2.1352822392390842, + "learning_rate": 1.8444296564729935e-05, + "loss": 0.4358, + "step": 2582 + }, + { + "epoch": 0.20463458110516933, + "grad_norm": 2.165123326896601, + "learning_rate": 1.8442921749603586e-05, + "loss": 0.3361, + "step": 2583 + }, + { + "epoch": 0.2047138047138047, + "grad_norm": 2.2842604106043507, + "learning_rate": 1.8441546378551457e-05, + "loss": 0.3988, + "step": 2584 + }, + { + "epoch": 0.2047930283224401, + "grad_norm": 3.228491038171374, + "learning_rate": 1.8440170451664122e-05, + "loss": 0.3863, + "step": 2585 + }, + { + "epoch": 0.20487225193107547, + "grad_norm": 2.4373413126721912, + "learning_rate": 1.8438793969032175e-05, + "loss": 0.4361, + "step": 2586 + }, + { + "epoch": 0.20495147553971083, + "grad_norm": 2.2254259951936906, + "learning_rate": 1.8437416930746248e-05, + "loss": 0.3859, + "step": 2587 + }, + { + "epoch": 0.2050306991483462, + "grad_norm": 1.8439603395194666, + "learning_rate": 1.8436039336897015e-05, + "loss": 0.2175, + "step": 2588 + }, + { + "epoch": 0.2051099227569816, + "grad_norm": 2.5136251343204172, + "learning_rate": 1.8434661187575183e-05, + "loss": 0.4304, + "step": 2589 + }, + { + "epoch": 0.20518914636561694, + "grad_norm": 1.883767199381963, + "learning_rate": 1.8433282482871497e-05, + "loss": 0.3192, + "step": 2590 + }, + { + "epoch": 0.20526836997425232, + "grad_norm": 2.319317007307682, + "learning_rate": 1.8431903222876737e-05, + "loss": 0.3958, + "step": 2591 + }, + { + "epoch": 0.2053475935828877, + "grad_norm": 2.0192310345324946, + "learning_rate": 1.8430523407681723e-05, + "loss": 0.3304, + "step": 2592 + }, + { + "epoch": 0.2054268171915231, + "grad_norm": 2.4550122509585743, + "learning_rate": 1.8429143037377305e-05, + "loss": 0.4448, + "step": 2593 + }, + { + "epoch": 0.20550604080015844, + "grad_norm": 2.6209542854199355, + "learning_rate": 1.8427762112054378e-05, + "loss": 0.4104, + "step": 2594 + }, + { + "epoch": 0.20558526440879382, + "grad_norm": 1.7827263733795262, + "learning_rate": 1.842638063180387e-05, + "loss": 0.2895, + "step": 2595 + }, + { + "epoch": 0.2056644880174292, + "grad_norm": 2.1731669035069814, + "learning_rate": 1.8424998596716743e-05, + "loss": 0.4229, + "step": 2596 + }, + { + "epoch": 0.20574371162606456, + "grad_norm": 2.0356240598075344, + "learning_rate": 1.8423616006883994e-05, + "loss": 0.1909, + "step": 2597 + }, + { + "epoch": 0.20582293523469994, + "grad_norm": 2.4375123859882986, + "learning_rate": 1.8422232862396663e-05, + "loss": 0.3283, + "step": 2598 + }, + { + "epoch": 0.20590215884333532, + "grad_norm": 2.3089164850513773, + "learning_rate": 1.8420849163345824e-05, + "loss": 0.3994, + "step": 2599 + }, + { + "epoch": 0.20598138245197067, + "grad_norm": 2.8526619245294014, + "learning_rate": 1.8419464909822585e-05, + "loss": 0.3482, + "step": 2600 + }, + { + "epoch": 0.20606060606060606, + "grad_norm": 2.2822456832216225, + "learning_rate": 1.8418080101918095e-05, + "loss": 0.3764, + "step": 2601 + }, + { + "epoch": 0.20613982966924144, + "grad_norm": 1.7550603316833326, + "learning_rate": 1.8416694739723535e-05, + "loss": 0.3156, + "step": 2602 + }, + { + "epoch": 0.20621905327787682, + "grad_norm": 2.3386797685966805, + "learning_rate": 1.841530882333012e-05, + "loss": 0.4354, + "step": 2603 + }, + { + "epoch": 0.20629827688651217, + "grad_norm": 2.439938355488765, + "learning_rate": 1.8413922352829118e-05, + "loss": 0.3275, + "step": 2604 + }, + { + "epoch": 0.20637750049514755, + "grad_norm": 1.8912032583509386, + "learning_rate": 1.8412535328311813e-05, + "loss": 0.25, + "step": 2605 + }, + { + "epoch": 0.20645672410378293, + "grad_norm": 1.951746034926776, + "learning_rate": 1.8411147749869536e-05, + "loss": 0.3341, + "step": 2606 + }, + { + "epoch": 0.2065359477124183, + "grad_norm": 2.211967105953523, + "learning_rate": 1.840975961759365e-05, + "loss": 0.3588, + "step": 2607 + }, + { + "epoch": 0.20661517132105367, + "grad_norm": 2.129109282217052, + "learning_rate": 1.8408370931575556e-05, + "loss": 0.3472, + "step": 2608 + }, + { + "epoch": 0.20669439492968905, + "grad_norm": 1.9980283945785957, + "learning_rate": 1.84069816919067e-05, + "loss": 0.3169, + "step": 2609 + }, + { + "epoch": 0.20677361853832443, + "grad_norm": 2.366220442774412, + "learning_rate": 1.8405591898678546e-05, + "loss": 0.334, + "step": 2610 + }, + { + "epoch": 0.2068528421469598, + "grad_norm": 2.590989486280597, + "learning_rate": 1.8404201551982612e-05, + "loss": 0.4114, + "step": 2611 + }, + { + "epoch": 0.20693206575559517, + "grad_norm": 2.7065415039991283, + "learning_rate": 1.8402810651910444e-05, + "loss": 0.3114, + "step": 2612 + }, + { + "epoch": 0.20701128936423055, + "grad_norm": 2.420248316470613, + "learning_rate": 1.840141919855363e-05, + "loss": 0.4917, + "step": 2613 + }, + { + "epoch": 0.2070905129728659, + "grad_norm": 2.3910321768992016, + "learning_rate": 1.8400027192003782e-05, + "loss": 0.4136, + "step": 2614 + }, + { + "epoch": 0.20716973658150128, + "grad_norm": 2.317256882536081, + "learning_rate": 1.8398634632352562e-05, + "loss": 0.432, + "step": 2615 + }, + { + "epoch": 0.20724896019013667, + "grad_norm": 2.1674740524169103, + "learning_rate": 1.8397241519691667e-05, + "loss": 0.3308, + "step": 2616 + }, + { + "epoch": 0.20732818379877205, + "grad_norm": 2.0220465250143294, + "learning_rate": 1.839584785411282e-05, + "loss": 0.3448, + "step": 2617 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 2.0931024764127857, + "learning_rate": 1.839445363570779e-05, + "loss": 0.3316, + "step": 2618 + }, + { + "epoch": 0.20748663101604278, + "grad_norm": 2.1880844704372215, + "learning_rate": 1.8393058864568383e-05, + "loss": 0.3456, + "step": 2619 + }, + { + "epoch": 0.20756585462467816, + "grad_norm": 1.977257775751363, + "learning_rate": 1.839166354078643e-05, + "loss": 0.3164, + "step": 2620 + }, + { + "epoch": 0.20764507823331352, + "grad_norm": 2.05211555291372, + "learning_rate": 1.8390267664453815e-05, + "loss": 0.395, + "step": 2621 + }, + { + "epoch": 0.2077243018419489, + "grad_norm": 2.1628169614419046, + "learning_rate": 1.8388871235662442e-05, + "loss": 0.2661, + "step": 2622 + }, + { + "epoch": 0.20780352545058428, + "grad_norm": 2.1323652417115495, + "learning_rate": 1.8387474254504265e-05, + "loss": 0.3184, + "step": 2623 + }, + { + "epoch": 0.20788274905921963, + "grad_norm": 2.5508570251721467, + "learning_rate": 1.8386076721071265e-05, + "loss": 0.4939, + "step": 2624 + }, + { + "epoch": 0.20796197266785502, + "grad_norm": 2.8045404773786577, + "learning_rate": 1.8384678635455467e-05, + "loss": 0.3343, + "step": 2625 + }, + { + "epoch": 0.2080411962764904, + "grad_norm": 1.8651823508151928, + "learning_rate": 1.838327999774892e-05, + "loss": 0.2498, + "step": 2626 + }, + { + "epoch": 0.20812041988512578, + "grad_norm": 2.1930794777905933, + "learning_rate": 1.838188080804373e-05, + "loss": 0.3193, + "step": 2627 + }, + { + "epoch": 0.20819964349376113, + "grad_norm": 2.6666841957853724, + "learning_rate": 1.8380481066432014e-05, + "loss": 0.3227, + "step": 2628 + }, + { + "epoch": 0.20827886710239651, + "grad_norm": 2.0500333012469394, + "learning_rate": 1.8379080773005947e-05, + "loss": 0.3185, + "step": 2629 + }, + { + "epoch": 0.2083580907110319, + "grad_norm": 1.8323762996745598, + "learning_rate": 1.8377679927857727e-05, + "loss": 0.2804, + "step": 2630 + }, + { + "epoch": 0.20843731431966725, + "grad_norm": 1.5661517308983972, + "learning_rate": 1.8376278531079594e-05, + "loss": 0.2732, + "step": 2631 + }, + { + "epoch": 0.20851653792830263, + "grad_norm": 2.2268710652453936, + "learning_rate": 1.8374876582763828e-05, + "loss": 0.357, + "step": 2632 + }, + { + "epoch": 0.208595761536938, + "grad_norm": 2.297070724373449, + "learning_rate": 1.8373474083002732e-05, + "loss": 0.393, + "step": 2633 + }, + { + "epoch": 0.2086749851455734, + "grad_norm": 1.9198036488834096, + "learning_rate": 1.837207103188866e-05, + "loss": 0.2875, + "step": 2634 + }, + { + "epoch": 0.20875420875420875, + "grad_norm": 2.1961277032385236, + "learning_rate": 1.8370667429513992e-05, + "loss": 0.3546, + "step": 2635 + }, + { + "epoch": 0.20883343236284413, + "grad_norm": 2.4534474572110674, + "learning_rate": 1.8369263275971153e-05, + "loss": 0.4266, + "step": 2636 + }, + { + "epoch": 0.2089126559714795, + "grad_norm": 2.0447295786211988, + "learning_rate": 1.8367858571352603e-05, + "loss": 0.3419, + "step": 2637 + }, + { + "epoch": 0.20899187958011486, + "grad_norm": 2.430849448300121, + "learning_rate": 1.8366453315750822e-05, + "loss": 0.3886, + "step": 2638 + }, + { + "epoch": 0.20907110318875025, + "grad_norm": 2.199666297789227, + "learning_rate": 1.8365047509258346e-05, + "loss": 0.3537, + "step": 2639 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 1.8918731842530536, + "learning_rate": 1.8363641151967747e-05, + "loss": 0.3759, + "step": 2640 + }, + { + "epoch": 0.20922955040602098, + "grad_norm": 2.081939893913907, + "learning_rate": 1.836223424397162e-05, + "loss": 0.2967, + "step": 2641 + }, + { + "epoch": 0.20930877401465636, + "grad_norm": 2.4497323812173732, + "learning_rate": 1.8360826785362603e-05, + "loss": 0.2896, + "step": 2642 + }, + { + "epoch": 0.20938799762329174, + "grad_norm": 2.276983591209558, + "learning_rate": 1.835941877623337e-05, + "loss": 0.3864, + "step": 2643 + }, + { + "epoch": 0.20946722123192713, + "grad_norm": 2.1956607346184143, + "learning_rate": 1.835801021667664e-05, + "loss": 0.3027, + "step": 2644 + }, + { + "epoch": 0.20954644484056248, + "grad_norm": 2.3536200318001326, + "learning_rate": 1.8356601106785148e-05, + "loss": 0.3932, + "step": 2645 + }, + { + "epoch": 0.20962566844919786, + "grad_norm": 2.1621747899113823, + "learning_rate": 1.8355191446651687e-05, + "loss": 0.2965, + "step": 2646 + }, + { + "epoch": 0.20970489205783324, + "grad_norm": 2.1551953744640744, + "learning_rate": 1.8353781236369065e-05, + "loss": 0.3712, + "step": 2647 + }, + { + "epoch": 0.2097841156664686, + "grad_norm": 2.598357265924713, + "learning_rate": 1.8352370476030147e-05, + "loss": 0.3331, + "step": 2648 + }, + { + "epoch": 0.20986333927510398, + "grad_norm": 3.2250553483696787, + "learning_rate": 1.8350959165727826e-05, + "loss": 0.3816, + "step": 2649 + }, + { + "epoch": 0.20994256288373936, + "grad_norm": 2.1365979100227723, + "learning_rate": 1.8349547305555023e-05, + "loss": 0.2894, + "step": 2650 + }, + { + "epoch": 0.21002178649237474, + "grad_norm": 2.644424194721846, + "learning_rate": 1.8348134895604708e-05, + "loss": 0.3396, + "step": 2651 + }, + { + "epoch": 0.2101010101010101, + "grad_norm": 2.109756127932805, + "learning_rate": 1.8346721935969878e-05, + "loss": 0.3661, + "step": 2652 + }, + { + "epoch": 0.21018023370964548, + "grad_norm": 2.3028644071572724, + "learning_rate": 1.8345308426743568e-05, + "loss": 0.3585, + "step": 2653 + }, + { + "epoch": 0.21025945731828086, + "grad_norm": 2.522542063004414, + "learning_rate": 1.8343894368018854e-05, + "loss": 0.3958, + "step": 2654 + }, + { + "epoch": 0.2103386809269162, + "grad_norm": 2.124994261044452, + "learning_rate": 1.8342479759888844e-05, + "loss": 0.2394, + "step": 2655 + }, + { + "epoch": 0.2104179045355516, + "grad_norm": 2.430304254237179, + "learning_rate": 1.8341064602446686e-05, + "loss": 0.3612, + "step": 2656 + }, + { + "epoch": 0.21049712814418697, + "grad_norm": 1.8303348210674257, + "learning_rate": 1.8339648895785556e-05, + "loss": 0.2152, + "step": 2657 + }, + { + "epoch": 0.21057635175282233, + "grad_norm": 2.4233598407210133, + "learning_rate": 1.8338232639998672e-05, + "loss": 0.2722, + "step": 2658 + }, + { + "epoch": 0.2106555753614577, + "grad_norm": 2.1245333910050364, + "learning_rate": 1.8336815835179295e-05, + "loss": 0.2951, + "step": 2659 + }, + { + "epoch": 0.2107347989700931, + "grad_norm": 2.0940444104124336, + "learning_rate": 1.8335398481420705e-05, + "loss": 0.366, + "step": 2660 + }, + { + "epoch": 0.21081402257872847, + "grad_norm": 2.513197961247772, + "learning_rate": 1.8333980578816234e-05, + "loss": 0.5175, + "step": 2661 + }, + { + "epoch": 0.21089324618736383, + "grad_norm": 2.0638517247404704, + "learning_rate": 1.8332562127459242e-05, + "loss": 0.3637, + "step": 2662 + }, + { + "epoch": 0.2109724697959992, + "grad_norm": 2.6062995492531855, + "learning_rate": 1.833114312744313e-05, + "loss": 0.3722, + "step": 2663 + }, + { + "epoch": 0.2110516934046346, + "grad_norm": 2.2418245928242495, + "learning_rate": 1.8329723578861328e-05, + "loss": 0.2643, + "step": 2664 + }, + { + "epoch": 0.21113091701326994, + "grad_norm": 1.9513762458939297, + "learning_rate": 1.8328303481807306e-05, + "loss": 0.2953, + "step": 2665 + }, + { + "epoch": 0.21121014062190532, + "grad_norm": 2.041780750713817, + "learning_rate": 1.832688283637458e-05, + "loss": 0.3284, + "step": 2666 + }, + { + "epoch": 0.2112893642305407, + "grad_norm": 1.8743691510752067, + "learning_rate": 1.8325461642656676e-05, + "loss": 0.2557, + "step": 2667 + }, + { + "epoch": 0.2113685878391761, + "grad_norm": 2.545594139253551, + "learning_rate": 1.832403990074719e-05, + "loss": 0.3291, + "step": 2668 + }, + { + "epoch": 0.21144781144781144, + "grad_norm": 2.5102765646842475, + "learning_rate": 1.8322617610739726e-05, + "loss": 0.3089, + "step": 2669 + }, + { + "epoch": 0.21152703505644682, + "grad_norm": 2.153122294702497, + "learning_rate": 1.8321194772727938e-05, + "loss": 0.3374, + "step": 2670 + }, + { + "epoch": 0.2116062586650822, + "grad_norm": 2.0620687951065313, + "learning_rate": 1.8319771386805514e-05, + "loss": 0.3148, + "step": 2671 + }, + { + "epoch": 0.21168548227371756, + "grad_norm": 2.609068027853725, + "learning_rate": 1.8318347453066176e-05, + "loss": 0.281, + "step": 2672 + }, + { + "epoch": 0.21176470588235294, + "grad_norm": 2.327885318425126, + "learning_rate": 1.8316922971603685e-05, + "loss": 0.324, + "step": 2673 + }, + { + "epoch": 0.21184392949098832, + "grad_norm": 2.5567265620509247, + "learning_rate": 1.8315497942511836e-05, + "loss": 0.4571, + "step": 2674 + }, + { + "epoch": 0.2119231530996237, + "grad_norm": 1.9530602351507904, + "learning_rate": 1.8314072365884455e-05, + "loss": 0.3368, + "step": 2675 + }, + { + "epoch": 0.21200237670825905, + "grad_norm": 2.183172551614286, + "learning_rate": 1.831264624181542e-05, + "loss": 0.381, + "step": 2676 + }, + { + "epoch": 0.21208160031689444, + "grad_norm": 2.1165451898629657, + "learning_rate": 1.8311219570398618e-05, + "loss": 0.3704, + "step": 2677 + }, + { + "epoch": 0.21216082392552982, + "grad_norm": 2.173447764552139, + "learning_rate": 1.8309792351728006e-05, + "loss": 0.3065, + "step": 2678 + }, + { + "epoch": 0.21224004753416517, + "grad_norm": 2.509060173114008, + "learning_rate": 1.830836458589755e-05, + "loss": 0.3491, + "step": 2679 + }, + { + "epoch": 0.21231927114280055, + "grad_norm": 2.657889401903535, + "learning_rate": 1.8306936273001258e-05, + "loss": 0.337, + "step": 2680 + }, + { + "epoch": 0.21239849475143593, + "grad_norm": 2.3382739629251303, + "learning_rate": 1.830550741313319e-05, + "loss": 0.3242, + "step": 2681 + }, + { + "epoch": 0.2124777183600713, + "grad_norm": 1.9207616662604226, + "learning_rate": 1.830407800638742e-05, + "loss": 0.2679, + "step": 2682 + }, + { + "epoch": 0.21255694196870667, + "grad_norm": 2.7518944547251483, + "learning_rate": 1.830264805285807e-05, + "loss": 0.3123, + "step": 2683 + }, + { + "epoch": 0.21263616557734205, + "grad_norm": 2.188738456911267, + "learning_rate": 1.8301217552639294e-05, + "loss": 0.2878, + "step": 2684 + }, + { + "epoch": 0.21271538918597743, + "grad_norm": 2.341328680782817, + "learning_rate": 1.8299786505825286e-05, + "loss": 0.351, + "step": 2685 + }, + { + "epoch": 0.2127946127946128, + "grad_norm": 2.084815946338467, + "learning_rate": 1.8298354912510273e-05, + "loss": 0.2899, + "step": 2686 + }, + { + "epoch": 0.21287383640324817, + "grad_norm": 2.2783993367167557, + "learning_rate": 1.8296922772788522e-05, + "loss": 0.4847, + "step": 2687 + }, + { + "epoch": 0.21295306001188355, + "grad_norm": 2.2558782307655316, + "learning_rate": 1.8295490086754325e-05, + "loss": 0.3419, + "step": 2688 + }, + { + "epoch": 0.2130322836205189, + "grad_norm": 2.4362783331473783, + "learning_rate": 1.829405685450202e-05, + "loss": 0.3236, + "step": 2689 + }, + { + "epoch": 0.21311150722915428, + "grad_norm": 2.2056672430638447, + "learning_rate": 1.8292623076125983e-05, + "loss": 0.3833, + "step": 2690 + }, + { + "epoch": 0.21319073083778967, + "grad_norm": 1.9879736236867172, + "learning_rate": 1.8291188751720615e-05, + "loss": 0.2964, + "step": 2691 + }, + { + "epoch": 0.21326995444642505, + "grad_norm": 1.9903371697394152, + "learning_rate": 1.828975388138036e-05, + "loss": 0.3504, + "step": 2692 + }, + { + "epoch": 0.2133491780550604, + "grad_norm": 2.3339548547217586, + "learning_rate": 1.8288318465199705e-05, + "loss": 0.2651, + "step": 2693 + }, + { + "epoch": 0.21342840166369578, + "grad_norm": 2.827813814974156, + "learning_rate": 1.8286882503273157e-05, + "loss": 0.3639, + "step": 2694 + }, + { + "epoch": 0.21350762527233116, + "grad_norm": 4.897325653438817, + "learning_rate": 1.828544599569527e-05, + "loss": 0.3537, + "step": 2695 + }, + { + "epoch": 0.21358684888096652, + "grad_norm": 2.2427380222423396, + "learning_rate": 1.8284008942560634e-05, + "loss": 0.4186, + "step": 2696 + }, + { + "epoch": 0.2136660724896019, + "grad_norm": 2.002453051781248, + "learning_rate": 1.8282571343963865e-05, + "loss": 0.3463, + "step": 2697 + }, + { + "epoch": 0.21374529609823728, + "grad_norm": 2.185418342588789, + "learning_rate": 1.8281133199999628e-05, + "loss": 0.2677, + "step": 2698 + }, + { + "epoch": 0.21382451970687263, + "grad_norm": 2.9173466222075426, + "learning_rate": 1.8279694510762616e-05, + "loss": 0.4729, + "step": 2699 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 2.1494535842839606, + "learning_rate": 1.8278255276347563e-05, + "loss": 0.3377, + "step": 2700 + }, + { + "epoch": 0.2139829669241434, + "grad_norm": 1.9491211564441508, + "learning_rate": 1.8276815496849227e-05, + "loss": 0.2439, + "step": 2701 + }, + { + "epoch": 0.21406219053277878, + "grad_norm": 2.84181437827925, + "learning_rate": 1.827537517236242e-05, + "loss": 0.4375, + "step": 2702 + }, + { + "epoch": 0.21414141414141413, + "grad_norm": 2.2023445326109456, + "learning_rate": 1.8273934302981975e-05, + "loss": 0.338, + "step": 2703 + }, + { + "epoch": 0.21422063775004951, + "grad_norm": 1.9480787022280412, + "learning_rate": 1.8272492888802767e-05, + "loss": 0.2912, + "step": 2704 + }, + { + "epoch": 0.2142998613586849, + "grad_norm": 1.9597184234300726, + "learning_rate": 1.8271050929919707e-05, + "loss": 0.3007, + "step": 2705 + }, + { + "epoch": 0.21437908496732025, + "grad_norm": 2.765426265653849, + "learning_rate": 1.8269608426427743e-05, + "loss": 0.4354, + "step": 2706 + }, + { + "epoch": 0.21445830857595563, + "grad_norm": 2.4772794757151813, + "learning_rate": 1.8268165378421852e-05, + "loss": 0.3696, + "step": 2707 + }, + { + "epoch": 0.214537532184591, + "grad_norm": 1.9931273474661464, + "learning_rate": 1.826672178599706e-05, + "loss": 0.2954, + "step": 2708 + }, + { + "epoch": 0.2146167557932264, + "grad_norm": 2.5104670234474016, + "learning_rate": 1.826527764924841e-05, + "loss": 0.4035, + "step": 2709 + }, + { + "epoch": 0.21469597940186175, + "grad_norm": 2.451612315875279, + "learning_rate": 1.8263832968271e-05, + "loss": 0.3751, + "step": 2710 + }, + { + "epoch": 0.21477520301049713, + "grad_norm": 1.8990796504331235, + "learning_rate": 1.826238774315995e-05, + "loss": 0.2747, + "step": 2711 + }, + { + "epoch": 0.2148544266191325, + "grad_norm": 2.279804520537856, + "learning_rate": 1.8260941974010425e-05, + "loss": 0.2956, + "step": 2712 + }, + { + "epoch": 0.21493365022776786, + "grad_norm": 1.9915139382310376, + "learning_rate": 1.825949566091762e-05, + "loss": 0.2541, + "step": 2713 + }, + { + "epoch": 0.21501287383640325, + "grad_norm": 2.1412302963252112, + "learning_rate": 1.8258048803976763e-05, + "loss": 0.3052, + "step": 2714 + }, + { + "epoch": 0.21509209744503863, + "grad_norm": 2.6115177209962672, + "learning_rate": 1.8256601403283133e-05, + "loss": 0.3075, + "step": 2715 + }, + { + "epoch": 0.215171321053674, + "grad_norm": 2.740814523403143, + "learning_rate": 1.8255153458932028e-05, + "loss": 0.3103, + "step": 2716 + }, + { + "epoch": 0.21525054466230936, + "grad_norm": 2.4482295912055134, + "learning_rate": 1.825370497101879e-05, + "loss": 0.3356, + "step": 2717 + }, + { + "epoch": 0.21532976827094474, + "grad_norm": 2.866688289624525, + "learning_rate": 1.825225593963879e-05, + "loss": 0.372, + "step": 2718 + }, + { + "epoch": 0.21540899187958013, + "grad_norm": 1.9197172492573251, + "learning_rate": 1.8250806364887446e-05, + "loss": 0.3426, + "step": 2719 + }, + { + "epoch": 0.21548821548821548, + "grad_norm": 2.1916516118073965, + "learning_rate": 1.8249356246860205e-05, + "loss": 0.2859, + "step": 2720 + }, + { + "epoch": 0.21556743909685086, + "grad_norm": 2.427429680619192, + "learning_rate": 1.8247905585652545e-05, + "loss": 0.4012, + "step": 2721 + }, + { + "epoch": 0.21564666270548624, + "grad_norm": 1.9188622740558436, + "learning_rate": 1.824645438135999e-05, + "loss": 0.2689, + "step": 2722 + }, + { + "epoch": 0.2157258863141216, + "grad_norm": 2.161860602553763, + "learning_rate": 1.8245002634078095e-05, + "loss": 0.3892, + "step": 2723 + }, + { + "epoch": 0.21580510992275698, + "grad_norm": 2.607534296474651, + "learning_rate": 1.8243550343902447e-05, + "loss": 0.3281, + "step": 2724 + }, + { + "epoch": 0.21588433353139236, + "grad_norm": 2.068957315285945, + "learning_rate": 1.8242097510928672e-05, + "loss": 0.3697, + "step": 2725 + }, + { + "epoch": 0.21596355714002774, + "grad_norm": 1.8549841237336069, + "learning_rate": 1.824064413525244e-05, + "loss": 0.3334, + "step": 2726 + }, + { + "epoch": 0.2160427807486631, + "grad_norm": 2.1747686167069418, + "learning_rate": 1.823919021696944e-05, + "loss": 0.3331, + "step": 2727 + }, + { + "epoch": 0.21612200435729848, + "grad_norm": 2.4904249370413116, + "learning_rate": 1.8237735756175408e-05, + "loss": 0.4155, + "step": 2728 + }, + { + "epoch": 0.21620122796593386, + "grad_norm": 2.0184441427647966, + "learning_rate": 1.8236280752966115e-05, + "loss": 0.2879, + "step": 2729 + }, + { + "epoch": 0.2162804515745692, + "grad_norm": 2.9282617704471354, + "learning_rate": 1.8234825207437365e-05, + "loss": 0.3292, + "step": 2730 + }, + { + "epoch": 0.2163596751832046, + "grad_norm": 2.3388655214942036, + "learning_rate": 1.8233369119685e-05, + "loss": 0.322, + "step": 2731 + }, + { + "epoch": 0.21643889879183997, + "grad_norm": 2.4859934978397122, + "learning_rate": 1.8231912489804893e-05, + "loss": 0.3553, + "step": 2732 + }, + { + "epoch": 0.21651812240047535, + "grad_norm": 2.3926347070693432, + "learning_rate": 1.8230455317892957e-05, + "loss": 0.2291, + "step": 2733 + }, + { + "epoch": 0.2165973460091107, + "grad_norm": 2.0529055491989032, + "learning_rate": 1.822899760404514e-05, + "loss": 0.3131, + "step": 2734 + }, + { + "epoch": 0.2166765696177461, + "grad_norm": 2.1458705751821596, + "learning_rate": 1.822753934835743e-05, + "loss": 0.4049, + "step": 2735 + }, + { + "epoch": 0.21675579322638147, + "grad_norm": 2.6818148428180306, + "learning_rate": 1.822608055092584e-05, + "loss": 0.4103, + "step": 2736 + }, + { + "epoch": 0.21683501683501682, + "grad_norm": 2.432146998159694, + "learning_rate": 1.8224621211846426e-05, + "loss": 0.5275, + "step": 2737 + }, + { + "epoch": 0.2169142404436522, + "grad_norm": 2.3261395407093617, + "learning_rate": 1.8223161331215285e-05, + "loss": 0.3637, + "step": 2738 + }, + { + "epoch": 0.2169934640522876, + "grad_norm": 2.231923908541737, + "learning_rate": 1.822170090912853e-05, + "loss": 0.362, + "step": 2739 + }, + { + "epoch": 0.21707268766092294, + "grad_norm": 1.629742574444992, + "learning_rate": 1.8220239945682337e-05, + "loss": 0.193, + "step": 2740 + }, + { + "epoch": 0.21715191126955832, + "grad_norm": 1.7399483003186347, + "learning_rate": 1.8218778440972893e-05, + "loss": 0.4342, + "step": 2741 + }, + { + "epoch": 0.2172311348781937, + "grad_norm": 1.7398891607799012, + "learning_rate": 1.8217316395096438e-05, + "loss": 0.3162, + "step": 2742 + }, + { + "epoch": 0.21731035848682909, + "grad_norm": 1.906386816586062, + "learning_rate": 1.8215853808149237e-05, + "loss": 0.2695, + "step": 2743 + }, + { + "epoch": 0.21738958209546444, + "grad_norm": 2.133144989915411, + "learning_rate": 1.8214390680227588e-05, + "loss": 0.304, + "step": 2744 + }, + { + "epoch": 0.21746880570409982, + "grad_norm": 2.32089456726155, + "learning_rate": 1.8212927011427847e-05, + "loss": 0.4325, + "step": 2745 + }, + { + "epoch": 0.2175480293127352, + "grad_norm": 2.1732248436578185, + "learning_rate": 1.8211462801846375e-05, + "loss": 0.3641, + "step": 2746 + }, + { + "epoch": 0.21762725292137056, + "grad_norm": 1.943990570786058, + "learning_rate": 1.820999805157959e-05, + "loss": 0.3231, + "step": 2747 + }, + { + "epoch": 0.21770647653000594, + "grad_norm": 2.176463650257162, + "learning_rate": 1.8208532760723937e-05, + "loss": 0.3067, + "step": 2748 + }, + { + "epoch": 0.21778570013864132, + "grad_norm": 2.248850988933716, + "learning_rate": 1.82070669293759e-05, + "loss": 0.3617, + "step": 2749 + }, + { + "epoch": 0.2178649237472767, + "grad_norm": 2.3425363068354446, + "learning_rate": 1.8205600557631995e-05, + "loss": 0.3593, + "step": 2750 + }, + { + "epoch": 0.21794414735591205, + "grad_norm": 2.451017582759792, + "learning_rate": 1.8204133645588774e-05, + "loss": 0.3984, + "step": 2751 + }, + { + "epoch": 0.21802337096454744, + "grad_norm": 3.1209178731826217, + "learning_rate": 1.8202666193342834e-05, + "loss": 0.3782, + "step": 2752 + }, + { + "epoch": 0.21810259457318282, + "grad_norm": 2.5570878697802053, + "learning_rate": 1.8201198200990787e-05, + "loss": 0.3137, + "step": 2753 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 1.8973210705038372, + "learning_rate": 1.8199729668629303e-05, + "loss": 0.3512, + "step": 2754 + }, + { + "epoch": 0.21826104179045355, + "grad_norm": 2.0668755393102156, + "learning_rate": 1.8198260596355077e-05, + "loss": 0.2919, + "step": 2755 + }, + { + "epoch": 0.21834026539908893, + "grad_norm": 2.5307994720131815, + "learning_rate": 1.8196790984264835e-05, + "loss": 0.4297, + "step": 2756 + }, + { + "epoch": 0.21841948900772432, + "grad_norm": 2.786472183984388, + "learning_rate": 1.8195320832455347e-05, + "loss": 0.3708, + "step": 2757 + }, + { + "epoch": 0.21849871261635967, + "grad_norm": 2.114873282141943, + "learning_rate": 1.819385014102342e-05, + "loss": 0.3173, + "step": 2758 + }, + { + "epoch": 0.21857793622499505, + "grad_norm": 2.090792708046673, + "learning_rate": 1.8192378910065882e-05, + "loss": 0.3205, + "step": 2759 + }, + { + "epoch": 0.21865715983363043, + "grad_norm": 2.00289789808227, + "learning_rate": 1.8190907139679614e-05, + "loss": 0.3112, + "step": 2760 + }, + { + "epoch": 0.21873638344226579, + "grad_norm": 2.807704205118221, + "learning_rate": 1.8189434829961525e-05, + "loss": 0.3694, + "step": 2761 + }, + { + "epoch": 0.21881560705090117, + "grad_norm": 1.887187878775249, + "learning_rate": 1.8187961981008554e-05, + "loss": 0.2792, + "step": 2762 + }, + { + "epoch": 0.21889483065953655, + "grad_norm": 2.1504083305840505, + "learning_rate": 1.8186488592917686e-05, + "loss": 0.3135, + "step": 2763 + }, + { + "epoch": 0.2189740542681719, + "grad_norm": 2.740866856024987, + "learning_rate": 1.8185014665785936e-05, + "loss": 0.4019, + "step": 2764 + }, + { + "epoch": 0.21905327787680728, + "grad_norm": 2.211419492581595, + "learning_rate": 1.8183540199710354e-05, + "loss": 0.3296, + "step": 2765 + }, + { + "epoch": 0.21913250148544267, + "grad_norm": 2.293345951867348, + "learning_rate": 1.8182065194788024e-05, + "loss": 0.3739, + "step": 2766 + }, + { + "epoch": 0.21921172509407805, + "grad_norm": 2.777503593857627, + "learning_rate": 1.8180589651116073e-05, + "loss": 0.425, + "step": 2767 + }, + { + "epoch": 0.2192909487027134, + "grad_norm": 2.0841800283325607, + "learning_rate": 1.8179113568791656e-05, + "loss": 0.3845, + "step": 2768 + }, + { + "epoch": 0.21937017231134878, + "grad_norm": 2.5131529791407425, + "learning_rate": 1.8177636947911964e-05, + "loss": 0.5106, + "step": 2769 + }, + { + "epoch": 0.21944939591998416, + "grad_norm": 2.2825466722925296, + "learning_rate": 1.817615978857423e-05, + "loss": 0.3962, + "step": 2770 + }, + { + "epoch": 0.21952861952861952, + "grad_norm": 2.7829011487305313, + "learning_rate": 1.8174682090875713e-05, + "loss": 0.4594, + "step": 2771 + }, + { + "epoch": 0.2196078431372549, + "grad_norm": 2.0260529137262693, + "learning_rate": 1.8173203854913714e-05, + "loss": 0.3251, + "step": 2772 + }, + { + "epoch": 0.21968706674589028, + "grad_norm": 2.0936671791159642, + "learning_rate": 1.817172508078557e-05, + "loss": 0.3519, + "step": 2773 + }, + { + "epoch": 0.21976629035452566, + "grad_norm": 2.0715936918818487, + "learning_rate": 1.817024576858865e-05, + "loss": 0.3339, + "step": 2774 + }, + { + "epoch": 0.21984551396316102, + "grad_norm": 2.51740792084349, + "learning_rate": 1.8168765918420358e-05, + "loss": 0.3975, + "step": 2775 + }, + { + "epoch": 0.2199247375717964, + "grad_norm": 2.2851669092555076, + "learning_rate": 1.8167285530378134e-05, + "loss": 0.3576, + "step": 2776 + }, + { + "epoch": 0.22000396118043178, + "grad_norm": 1.9428991776563813, + "learning_rate": 1.8165804604559455e-05, + "loss": 0.3135, + "step": 2777 + }, + { + "epoch": 0.22008318478906713, + "grad_norm": 1.9706812462745862, + "learning_rate": 1.816432314106184e-05, + "loss": 0.2432, + "step": 2778 + }, + { + "epoch": 0.2201624083977025, + "grad_norm": 2.0641478005522074, + "learning_rate": 1.8162841139982827e-05, + "loss": 0.3255, + "step": 2779 + }, + { + "epoch": 0.2202416320063379, + "grad_norm": 2.0162538964558623, + "learning_rate": 1.816135860142e-05, + "loss": 0.3206, + "step": 2780 + }, + { + "epoch": 0.22032085561497325, + "grad_norm": 1.8413196277632444, + "learning_rate": 1.8159875525470984e-05, + "loss": 0.3019, + "step": 2781 + }, + { + "epoch": 0.22040007922360863, + "grad_norm": 1.9624042378974955, + "learning_rate": 1.815839191223342e-05, + "loss": 0.2958, + "step": 2782 + }, + { + "epoch": 0.220479302832244, + "grad_norm": 2.5618488688306167, + "learning_rate": 1.815690776180501e-05, + "loss": 0.4318, + "step": 2783 + }, + { + "epoch": 0.2205585264408794, + "grad_norm": 2.446665133912094, + "learning_rate": 1.815542307428347e-05, + "loss": 0.4155, + "step": 2784 + }, + { + "epoch": 0.22063775004951475, + "grad_norm": 2.424049778856075, + "learning_rate": 1.8153937849766567e-05, + "loss": 0.2669, + "step": 2785 + }, + { + "epoch": 0.22071697365815013, + "grad_norm": 2.1478646318804846, + "learning_rate": 1.8152452088352084e-05, + "loss": 0.372, + "step": 2786 + }, + { + "epoch": 0.2207961972667855, + "grad_norm": 2.3955112163845107, + "learning_rate": 1.8150965790137863e-05, + "loss": 0.3584, + "step": 2787 + }, + { + "epoch": 0.22087542087542086, + "grad_norm": 2.661875222645604, + "learning_rate": 1.814947895522176e-05, + "loss": 0.3316, + "step": 2788 + }, + { + "epoch": 0.22095464448405625, + "grad_norm": 2.325175855490434, + "learning_rate": 1.8147991583701685e-05, + "loss": 0.39, + "step": 2789 + }, + { + "epoch": 0.22103386809269163, + "grad_norm": 2.199277321313363, + "learning_rate": 1.8146503675675568e-05, + "loss": 0.356, + "step": 2790 + }, + { + "epoch": 0.221113091701327, + "grad_norm": 2.310229606721889, + "learning_rate": 1.814501523124138e-05, + "loss": 0.3727, + "step": 2791 + }, + { + "epoch": 0.22119231530996236, + "grad_norm": 2.417042694775412, + "learning_rate": 1.8143526250497134e-05, + "loss": 0.3515, + "step": 2792 + }, + { + "epoch": 0.22127153891859774, + "grad_norm": 2.3438622600553893, + "learning_rate": 1.8142036733540868e-05, + "loss": 0.3176, + "step": 2793 + }, + { + "epoch": 0.22135076252723312, + "grad_norm": 2.8023696645605662, + "learning_rate": 1.814054668047066e-05, + "loss": 0.4563, + "step": 2794 + }, + { + "epoch": 0.22142998613586848, + "grad_norm": 2.0967926876237972, + "learning_rate": 1.8139056091384623e-05, + "loss": 0.3873, + "step": 2795 + }, + { + "epoch": 0.22150920974450386, + "grad_norm": 1.9080582301906857, + "learning_rate": 1.8137564966380905e-05, + "loss": 0.2884, + "step": 2796 + }, + { + "epoch": 0.22158843335313924, + "grad_norm": 2.5659845759948277, + "learning_rate": 1.813607330555769e-05, + "loss": 0.4875, + "step": 2797 + }, + { + "epoch": 0.2216676569617746, + "grad_norm": 2.4479632119932115, + "learning_rate": 1.8134581109013193e-05, + "loss": 0.3686, + "step": 2798 + }, + { + "epoch": 0.22174688057040998, + "grad_norm": 2.1075284303966244, + "learning_rate": 1.8133088376845675e-05, + "loss": 0.3733, + "step": 2799 + }, + { + "epoch": 0.22182610417904536, + "grad_norm": 2.1122372908019966, + "learning_rate": 1.8131595109153416e-05, + "loss": 0.3249, + "step": 2800 + }, + { + "epoch": 0.22190532778768074, + "grad_norm": 2.4272495035187176, + "learning_rate": 1.813010130603475e-05, + "loss": 0.3467, + "step": 2801 + }, + { + "epoch": 0.2219845513963161, + "grad_norm": 2.4179265379896786, + "learning_rate": 1.812860696758803e-05, + "loss": 0.3454, + "step": 2802 + }, + { + "epoch": 0.22206377500495147, + "grad_norm": 2.1448270930339586, + "learning_rate": 1.8127112093911655e-05, + "loss": 0.33, + "step": 2803 + }, + { + "epoch": 0.22214299861358686, + "grad_norm": 2.104691498733774, + "learning_rate": 1.8125616685104055e-05, + "loss": 0.3561, + "step": 2804 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.885218233425823, + "learning_rate": 1.8124120741263692e-05, + "loss": 0.4748, + "step": 2805 + }, + { + "epoch": 0.2223014458308576, + "grad_norm": 2.6782122074087766, + "learning_rate": 1.812262426248907e-05, + "loss": 0.3291, + "step": 2806 + }, + { + "epoch": 0.22238066943949297, + "grad_norm": 1.9251078060252946, + "learning_rate": 1.8121127248878726e-05, + "loss": 0.2945, + "step": 2807 + }, + { + "epoch": 0.22245989304812835, + "grad_norm": 2.5882451306476177, + "learning_rate": 1.8119629700531228e-05, + "loss": 0.4597, + "step": 2808 + }, + { + "epoch": 0.2225391166567637, + "grad_norm": 1.913229999193114, + "learning_rate": 1.8118131617545183e-05, + "loss": 0.3435, + "step": 2809 + }, + { + "epoch": 0.2226183402653991, + "grad_norm": 3.2774521290088243, + "learning_rate": 1.8116633000019233e-05, + "loss": 0.3352, + "step": 2810 + }, + { + "epoch": 0.22269756387403447, + "grad_norm": 2.1927178790263553, + "learning_rate": 1.8115133848052052e-05, + "loss": 0.3588, + "step": 2811 + }, + { + "epoch": 0.22277678748266982, + "grad_norm": 1.8271179319107111, + "learning_rate": 1.8113634161742356e-05, + "loss": 0.2917, + "step": 2812 + }, + { + "epoch": 0.2228560110913052, + "grad_norm": 1.9939738517158243, + "learning_rate": 1.8112133941188892e-05, + "loss": 0.2463, + "step": 2813 + }, + { + "epoch": 0.2229352346999406, + "grad_norm": 2.115788530061031, + "learning_rate": 1.811063318649044e-05, + "loss": 0.3847, + "step": 2814 + }, + { + "epoch": 0.22301445830857597, + "grad_norm": 2.3259541606821053, + "learning_rate": 1.8109131897745823e-05, + "loss": 0.5999, + "step": 2815 + }, + { + "epoch": 0.22309368191721132, + "grad_norm": 1.9979988228489234, + "learning_rate": 1.8107630075053883e-05, + "loss": 0.2689, + "step": 2816 + }, + { + "epoch": 0.2231729055258467, + "grad_norm": 2.576057003769713, + "learning_rate": 1.810612771851352e-05, + "loss": 0.4017, + "step": 2817 + }, + { + "epoch": 0.22325212913448209, + "grad_norm": 2.099998067135014, + "learning_rate": 1.8104624828223644e-05, + "loss": 0.2889, + "step": 2818 + }, + { + "epoch": 0.22333135274311744, + "grad_norm": 2.064592884476095, + "learning_rate": 1.8103121404283222e-05, + "loss": 0.2691, + "step": 2819 + }, + { + "epoch": 0.22341057635175282, + "grad_norm": 2.699722248009324, + "learning_rate": 1.8101617446791248e-05, + "loss": 0.4082, + "step": 2820 + }, + { + "epoch": 0.2234897999603882, + "grad_norm": 2.3072058492940672, + "learning_rate": 1.8100112955846746e-05, + "loss": 0.3894, + "step": 2821 + }, + { + "epoch": 0.22356902356902356, + "grad_norm": 1.8478529110795325, + "learning_rate": 1.8098607931548782e-05, + "loss": 0.2974, + "step": 2822 + }, + { + "epoch": 0.22364824717765894, + "grad_norm": 2.1023917488988726, + "learning_rate": 1.8097102373996453e-05, + "loss": 0.2879, + "step": 2823 + }, + { + "epoch": 0.22372747078629432, + "grad_norm": 2.4842397385833155, + "learning_rate": 1.809559628328889e-05, + "loss": 0.3728, + "step": 2824 + }, + { + "epoch": 0.2238066943949297, + "grad_norm": 2.748658895467443, + "learning_rate": 1.8094089659525274e-05, + "loss": 0.2689, + "step": 2825 + }, + { + "epoch": 0.22388591800356505, + "grad_norm": 2.432126920062806, + "learning_rate": 1.8092582502804793e-05, + "loss": 0.3129, + "step": 2826 + }, + { + "epoch": 0.22396514161220044, + "grad_norm": 2.480648062131945, + "learning_rate": 1.8091074813226696e-05, + "loss": 0.3336, + "step": 2827 + }, + { + "epoch": 0.22404436522083582, + "grad_norm": 2.439638467305275, + "learning_rate": 1.8089566590890253e-05, + "loss": 0.2972, + "step": 2828 + }, + { + "epoch": 0.22412358882947117, + "grad_norm": 1.9591642001963525, + "learning_rate": 1.8088057835894775e-05, + "loss": 0.2406, + "step": 2829 + }, + { + "epoch": 0.22420281243810655, + "grad_norm": 1.9939341737486795, + "learning_rate": 1.8086548548339604e-05, + "loss": 0.3465, + "step": 2830 + }, + { + "epoch": 0.22428203604674193, + "grad_norm": 2.522427302150872, + "learning_rate": 1.8085038728324123e-05, + "loss": 0.4351, + "step": 2831 + }, + { + "epoch": 0.22436125965537732, + "grad_norm": 2.4295206811675967, + "learning_rate": 1.8083528375947744e-05, + "loss": 0.3199, + "step": 2832 + }, + { + "epoch": 0.22444048326401267, + "grad_norm": 2.316738940300317, + "learning_rate": 1.808201749130992e-05, + "loss": 0.4538, + "step": 2833 + }, + { + "epoch": 0.22451970687264805, + "grad_norm": 2.2986382210777183, + "learning_rate": 1.8080506074510128e-05, + "loss": 0.3835, + "step": 2834 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 1.905129539710133, + "learning_rate": 1.8078994125647896e-05, + "loss": 0.2711, + "step": 2835 + }, + { + "epoch": 0.22467815408991879, + "grad_norm": 1.8255445852897605, + "learning_rate": 1.807748164482277e-05, + "loss": 0.2794, + "step": 2836 + }, + { + "epoch": 0.22475737769855417, + "grad_norm": 2.0904691221940537, + "learning_rate": 1.8075968632134343e-05, + "loss": 0.3683, + "step": 2837 + }, + { + "epoch": 0.22483660130718955, + "grad_norm": 1.905015642312692, + "learning_rate": 1.8074455087682247e-05, + "loss": 0.275, + "step": 2838 + }, + { + "epoch": 0.2249158249158249, + "grad_norm": 2.2645497651893267, + "learning_rate": 1.8072941011566133e-05, + "loss": 0.3043, + "step": 2839 + }, + { + "epoch": 0.22499504852446028, + "grad_norm": 2.3235871738430007, + "learning_rate": 1.8071426403885698e-05, + "loss": 0.3646, + "step": 2840 + }, + { + "epoch": 0.22507427213309567, + "grad_norm": 2.2783148377895204, + "learning_rate": 1.8069911264740667e-05, + "loss": 0.4315, + "step": 2841 + }, + { + "epoch": 0.22515349574173105, + "grad_norm": 1.6764015652476059, + "learning_rate": 1.8068395594230815e-05, + "loss": 0.2442, + "step": 2842 + }, + { + "epoch": 0.2252327193503664, + "grad_norm": 2.1534012111542484, + "learning_rate": 1.8066879392455932e-05, + "loss": 0.2912, + "step": 2843 + }, + { + "epoch": 0.22531194295900178, + "grad_norm": 1.8884219546090963, + "learning_rate": 1.8065362659515856e-05, + "loss": 0.3183, + "step": 2844 + }, + { + "epoch": 0.22539116656763716, + "grad_norm": 1.5850970041712427, + "learning_rate": 1.806384539551046e-05, + "loss": 0.1986, + "step": 2845 + }, + { + "epoch": 0.22547039017627252, + "grad_norm": 2.0475579202181295, + "learning_rate": 1.8062327600539643e-05, + "loss": 0.4377, + "step": 2846 + }, + { + "epoch": 0.2255496137849079, + "grad_norm": 1.9607631637609906, + "learning_rate": 1.8060809274703352e-05, + "loss": 0.2975, + "step": 2847 + }, + { + "epoch": 0.22562883739354328, + "grad_norm": 2.2140828779319746, + "learning_rate": 1.805929041810155e-05, + "loss": 0.2717, + "step": 2848 + }, + { + "epoch": 0.22570806100217866, + "grad_norm": 1.7974567746149128, + "learning_rate": 1.8057771030834255e-05, + "loss": 0.2751, + "step": 2849 + }, + { + "epoch": 0.22578728461081402, + "grad_norm": 2.386475211878843, + "learning_rate": 1.8056251113001508e-05, + "loss": 0.3615, + "step": 2850 + }, + { + "epoch": 0.2258665082194494, + "grad_norm": 2.49752307099072, + "learning_rate": 1.8054730664703393e-05, + "loss": 0.3746, + "step": 2851 + }, + { + "epoch": 0.22594573182808478, + "grad_norm": 2.7967442450321873, + "learning_rate": 1.8053209686040017e-05, + "loss": 0.3949, + "step": 2852 + }, + { + "epoch": 0.22602495543672013, + "grad_norm": 1.619050510468112, + "learning_rate": 1.8051688177111532e-05, + "loss": 0.2148, + "step": 2853 + }, + { + "epoch": 0.2261041790453555, + "grad_norm": 1.7740582455891074, + "learning_rate": 1.805016613801813e-05, + "loss": 0.2236, + "step": 2854 + }, + { + "epoch": 0.2261834026539909, + "grad_norm": 2.4837249942069204, + "learning_rate": 1.8048643568860015e-05, + "loss": 0.4577, + "step": 2855 + }, + { + "epoch": 0.22626262626262628, + "grad_norm": 2.2016832637598642, + "learning_rate": 1.804712046973745e-05, + "loss": 0.3829, + "step": 2856 + }, + { + "epoch": 0.22634184987126163, + "grad_norm": 2.438095061444324, + "learning_rate": 1.8045596840750722e-05, + "loss": 0.4635, + "step": 2857 + }, + { + "epoch": 0.226421073479897, + "grad_norm": 2.3786191331942166, + "learning_rate": 1.804407268200016e-05, + "loss": 0.5126, + "step": 2858 + }, + { + "epoch": 0.2265002970885324, + "grad_norm": 1.7382877157962437, + "learning_rate": 1.8042547993586114e-05, + "loss": 0.2779, + "step": 2859 + }, + { + "epoch": 0.22657952069716775, + "grad_norm": 2.2580177520879516, + "learning_rate": 1.8041022775608977e-05, + "loss": 0.348, + "step": 2860 + }, + { + "epoch": 0.22665874430580313, + "grad_norm": 2.1685683161357328, + "learning_rate": 1.803949702816919e-05, + "loss": 0.426, + "step": 2861 + }, + { + "epoch": 0.2267379679144385, + "grad_norm": 2.1460350431188933, + "learning_rate": 1.80379707513672e-05, + "loss": 0.3913, + "step": 2862 + }, + { + "epoch": 0.22681719152307386, + "grad_norm": 2.138814703615336, + "learning_rate": 1.8036443945303514e-05, + "loss": 0.3307, + "step": 2863 + }, + { + "epoch": 0.22689641513170924, + "grad_norm": 2.5653723267752233, + "learning_rate": 1.8034916610078665e-05, + "loss": 0.3779, + "step": 2864 + }, + { + "epoch": 0.22697563874034463, + "grad_norm": 2.4518404752264624, + "learning_rate": 1.8033388745793218e-05, + "loss": 0.3643, + "step": 2865 + }, + { + "epoch": 0.22705486234898, + "grad_norm": 2.62512043388419, + "learning_rate": 1.8031860352547777e-05, + "loss": 0.5207, + "step": 2866 + }, + { + "epoch": 0.22713408595761536, + "grad_norm": 2.3470155365932794, + "learning_rate": 1.8030331430442974e-05, + "loss": 0.421, + "step": 2867 + }, + { + "epoch": 0.22721330956625074, + "grad_norm": 1.8310345204256284, + "learning_rate": 1.8028801979579487e-05, + "loss": 0.3591, + "step": 2868 + }, + { + "epoch": 0.22729253317488612, + "grad_norm": 3.251820785918148, + "learning_rate": 1.8027272000058028e-05, + "loss": 0.3722, + "step": 2869 + }, + { + "epoch": 0.22737175678352148, + "grad_norm": 2.0047297137284215, + "learning_rate": 1.8025741491979326e-05, + "loss": 0.3068, + "step": 2870 + }, + { + "epoch": 0.22745098039215686, + "grad_norm": 2.103550627420197, + "learning_rate": 1.8024210455444168e-05, + "loss": 0.3744, + "step": 2871 + }, + { + "epoch": 0.22753020400079224, + "grad_norm": 2.3269590023557325, + "learning_rate": 1.8022678890553364e-05, + "loss": 0.3646, + "step": 2872 + }, + { + "epoch": 0.22760942760942762, + "grad_norm": 2.0459352564615263, + "learning_rate": 1.8021146797407752e-05, + "loss": 0.3454, + "step": 2873 + }, + { + "epoch": 0.22768865121806298, + "grad_norm": 1.9884501680386073, + "learning_rate": 1.801961417610822e-05, + "loss": 0.3498, + "step": 2874 + }, + { + "epoch": 0.22776787482669836, + "grad_norm": 2.473846253022974, + "learning_rate": 1.801808102675568e-05, + "loss": 0.4022, + "step": 2875 + }, + { + "epoch": 0.22784709843533374, + "grad_norm": 1.5653616473294338, + "learning_rate": 1.801654734945109e-05, + "loss": 0.2476, + "step": 2876 + }, + { + "epoch": 0.2279263220439691, + "grad_norm": 2.468943502463552, + "learning_rate": 1.801501314429543e-05, + "loss": 0.333, + "step": 2877 + }, + { + "epoch": 0.22800554565260447, + "grad_norm": 1.8586663229271703, + "learning_rate": 1.801347841138972e-05, + "loss": 0.3301, + "step": 2878 + }, + { + "epoch": 0.22808476926123986, + "grad_norm": 2.293706848926581, + "learning_rate": 1.8011943150835013e-05, + "loss": 0.3425, + "step": 2879 + }, + { + "epoch": 0.2281639928698752, + "grad_norm": 2.32557040060527, + "learning_rate": 1.80104073627324e-05, + "loss": 0.3143, + "step": 2880 + }, + { + "epoch": 0.2282432164785106, + "grad_norm": 2.542357365686963, + "learning_rate": 1.8008871047183005e-05, + "loss": 0.3799, + "step": 2881 + }, + { + "epoch": 0.22832244008714597, + "grad_norm": 1.6391160903639728, + "learning_rate": 1.800733420428799e-05, + "loss": 0.2466, + "step": 2882 + }, + { + "epoch": 0.22840166369578135, + "grad_norm": 2.2224153235736717, + "learning_rate": 1.8005796834148545e-05, + "loss": 0.246, + "step": 2883 + }, + { + "epoch": 0.2284808873044167, + "grad_norm": 2.1309638642878945, + "learning_rate": 1.8004258936865902e-05, + "loss": 0.3976, + "step": 2884 + }, + { + "epoch": 0.2285601109130521, + "grad_norm": 2.4139720687491937, + "learning_rate": 1.800272051254132e-05, + "loss": 0.3553, + "step": 2885 + }, + { + "epoch": 0.22863933452168747, + "grad_norm": 2.435499357951839, + "learning_rate": 1.80011815612761e-05, + "loss": 0.3988, + "step": 2886 + }, + { + "epoch": 0.22871855813032282, + "grad_norm": 1.879262567029515, + "learning_rate": 1.7999642083171576e-05, + "loss": 0.2874, + "step": 2887 + }, + { + "epoch": 0.2287977817389582, + "grad_norm": 2.023245868222001, + "learning_rate": 1.799810207832911e-05, + "loss": 0.2938, + "step": 2888 + }, + { + "epoch": 0.2288770053475936, + "grad_norm": 2.6685608718201426, + "learning_rate": 1.7996561546850105e-05, + "loss": 0.3572, + "step": 2889 + }, + { + "epoch": 0.22895622895622897, + "grad_norm": 2.2130170651991192, + "learning_rate": 1.7995020488836e-05, + "loss": 0.233, + "step": 2890 + }, + { + "epoch": 0.22903545256486432, + "grad_norm": 2.154666875445534, + "learning_rate": 1.799347890438827e-05, + "loss": 0.3709, + "step": 2891 + }, + { + "epoch": 0.2291146761734997, + "grad_norm": 2.0188603554095077, + "learning_rate": 1.799193679360841e-05, + "loss": 0.3068, + "step": 2892 + }, + { + "epoch": 0.22919389978213509, + "grad_norm": 1.818362308464351, + "learning_rate": 1.799039415659797e-05, + "loss": 0.2834, + "step": 2893 + }, + { + "epoch": 0.22927312339077044, + "grad_norm": 2.235979602354852, + "learning_rate": 1.798885099345852e-05, + "loss": 0.3462, + "step": 2894 + }, + { + "epoch": 0.22935234699940582, + "grad_norm": 2.4960063989619035, + "learning_rate": 1.7987307304291676e-05, + "loss": 0.3284, + "step": 2895 + }, + { + "epoch": 0.2294315706080412, + "grad_norm": 2.7434744305484506, + "learning_rate": 1.7985763089199073e-05, + "loss": 0.3514, + "step": 2896 + }, + { + "epoch": 0.22951079421667656, + "grad_norm": 1.9845034730050122, + "learning_rate": 1.79842183482824e-05, + "loss": 0.2858, + "step": 2897 + }, + { + "epoch": 0.22959001782531194, + "grad_norm": 1.9666854595762389, + "learning_rate": 1.7982673081643364e-05, + "loss": 0.216, + "step": 2898 + }, + { + "epoch": 0.22966924143394732, + "grad_norm": 1.9169888753826316, + "learning_rate": 1.7981127289383718e-05, + "loss": 0.2515, + "step": 2899 + }, + { + "epoch": 0.2297484650425827, + "grad_norm": 2.332755771807327, + "learning_rate": 1.797958097160524e-05, + "loss": 0.3947, + "step": 2900 + }, + { + "epoch": 0.22982768865121805, + "grad_norm": 2.158902626130824, + "learning_rate": 1.797803412840975e-05, + "loss": 0.3433, + "step": 2901 + }, + { + "epoch": 0.22990691225985344, + "grad_norm": 1.9914693487620776, + "learning_rate": 1.7976486759899103e-05, + "loss": 0.3463, + "step": 2902 + }, + { + "epoch": 0.22998613586848882, + "grad_norm": 2.2033191304166104, + "learning_rate": 1.797493886617518e-05, + "loss": 0.3565, + "step": 2903 + }, + { + "epoch": 0.23006535947712417, + "grad_norm": 2.0778455581291357, + "learning_rate": 1.797339044733991e-05, + "loss": 0.3573, + "step": 2904 + }, + { + "epoch": 0.23014458308575955, + "grad_norm": 2.4994002279753267, + "learning_rate": 1.797184150349524e-05, + "loss": 0.3831, + "step": 2905 + }, + { + "epoch": 0.23022380669439493, + "grad_norm": 1.9710331465542317, + "learning_rate": 1.7970292034743172e-05, + "loss": 0.2739, + "step": 2906 + }, + { + "epoch": 0.23030303030303031, + "grad_norm": 2.231762668543122, + "learning_rate": 1.7968742041185718e-05, + "loss": 0.3545, + "step": 2907 + }, + { + "epoch": 0.23038225391166567, + "grad_norm": 2.4829822610232313, + "learning_rate": 1.7967191522924946e-05, + "loss": 0.3644, + "step": 2908 + }, + { + "epoch": 0.23046147752030105, + "grad_norm": 2.4002838104282223, + "learning_rate": 1.7965640480062945e-05, + "loss": 0.4051, + "step": 2909 + }, + { + "epoch": 0.23054070112893643, + "grad_norm": 2.2386224198187556, + "learning_rate": 1.796408891270185e-05, + "loss": 0.3959, + "step": 2910 + }, + { + "epoch": 0.23061992473757179, + "grad_norm": 2.4165159631742323, + "learning_rate": 1.7962536820943822e-05, + "loss": 0.3215, + "step": 2911 + }, + { + "epoch": 0.23069914834620717, + "grad_norm": 2.71360000118413, + "learning_rate": 1.7960984204891055e-05, + "loss": 0.471, + "step": 2912 + }, + { + "epoch": 0.23077837195484255, + "grad_norm": 2.3426652598697677, + "learning_rate": 1.7959431064645786e-05, + "loss": 0.3228, + "step": 2913 + }, + { + "epoch": 0.23085759556347793, + "grad_norm": 2.193463315427334, + "learning_rate": 1.7957877400310275e-05, + "loss": 0.4157, + "step": 2914 + }, + { + "epoch": 0.23093681917211328, + "grad_norm": 2.136276471583766, + "learning_rate": 1.7956323211986833e-05, + "loss": 0.3445, + "step": 2915 + }, + { + "epoch": 0.23101604278074866, + "grad_norm": 1.7871354320514736, + "learning_rate": 1.795476849977779e-05, + "loss": 0.3168, + "step": 2916 + }, + { + "epoch": 0.23109526638938405, + "grad_norm": 2.011408081414906, + "learning_rate": 1.7953213263785513e-05, + "loss": 0.3476, + "step": 2917 + }, + { + "epoch": 0.2311744899980194, + "grad_norm": 2.0562473538107606, + "learning_rate": 1.7951657504112416e-05, + "loss": 0.3409, + "step": 2918 + }, + { + "epoch": 0.23125371360665478, + "grad_norm": 2.1752303766096164, + "learning_rate": 1.795010122086093e-05, + "loss": 0.4139, + "step": 2919 + }, + { + "epoch": 0.23133293721529016, + "grad_norm": 1.8107573650843167, + "learning_rate": 1.7948544414133534e-05, + "loss": 0.32, + "step": 2920 + }, + { + "epoch": 0.23141216082392552, + "grad_norm": 1.5762747209352221, + "learning_rate": 1.7946987084032733e-05, + "loss": 0.2817, + "step": 2921 + }, + { + "epoch": 0.2314913844325609, + "grad_norm": 2.0999905667464174, + "learning_rate": 1.794542923066107e-05, + "loss": 0.3092, + "step": 2922 + }, + { + "epoch": 0.23157060804119628, + "grad_norm": 2.6584197408651806, + "learning_rate": 1.7943870854121126e-05, + "loss": 0.3684, + "step": 2923 + }, + { + "epoch": 0.23164983164983166, + "grad_norm": 1.9513772617706642, + "learning_rate": 1.794231195451551e-05, + "loss": 0.2871, + "step": 2924 + }, + { + "epoch": 0.23172905525846701, + "grad_norm": 4.495199451645402, + "learning_rate": 1.7940752531946867e-05, + "loss": 0.3833, + "step": 2925 + }, + { + "epoch": 0.2318082788671024, + "grad_norm": 2.1564859049950815, + "learning_rate": 1.793919258651788e-05, + "loss": 0.3519, + "step": 2926 + }, + { + "epoch": 0.23188750247573778, + "grad_norm": 1.979307986257514, + "learning_rate": 1.7937632118331255e-05, + "loss": 0.3346, + "step": 2927 + }, + { + "epoch": 0.23196672608437313, + "grad_norm": 2.261647990004656, + "learning_rate": 1.7936071127489755e-05, + "loss": 0.4053, + "step": 2928 + }, + { + "epoch": 0.2320459496930085, + "grad_norm": 2.0955501333327318, + "learning_rate": 1.7934509614096156e-05, + "loss": 0.285, + "step": 2929 + }, + { + "epoch": 0.2321251733016439, + "grad_norm": 1.9447321125912682, + "learning_rate": 1.7932947578253273e-05, + "loss": 0.2999, + "step": 2930 + }, + { + "epoch": 0.23220439691027928, + "grad_norm": 2.3961018801608054, + "learning_rate": 1.793138502006397e-05, + "loss": 0.3123, + "step": 2931 + }, + { + "epoch": 0.23228362051891463, + "grad_norm": 2.248735843211351, + "learning_rate": 1.792982193963112e-05, + "loss": 0.2409, + "step": 2932 + }, + { + "epoch": 0.23236284412755, + "grad_norm": 2.1466220519787744, + "learning_rate": 1.7928258337057657e-05, + "loss": 0.3785, + "step": 2933 + }, + { + "epoch": 0.2324420677361854, + "grad_norm": 2.3267080861466907, + "learning_rate": 1.792669421244653e-05, + "loss": 0.3328, + "step": 2934 + }, + { + "epoch": 0.23252129134482075, + "grad_norm": 2.191770307734201, + "learning_rate": 1.7925129565900728e-05, + "loss": 0.3971, + "step": 2935 + }, + { + "epoch": 0.23260051495345613, + "grad_norm": 2.241527801496312, + "learning_rate": 1.792356439752328e-05, + "loss": 0.3226, + "step": 2936 + }, + { + "epoch": 0.2326797385620915, + "grad_norm": 2.235423542564452, + "learning_rate": 1.792199870741724e-05, + "loss": 0.3113, + "step": 2937 + }, + { + "epoch": 0.23275896217072686, + "grad_norm": 2.2830173602699246, + "learning_rate": 1.79204324956857e-05, + "loss": 0.3799, + "step": 2938 + }, + { + "epoch": 0.23283818577936224, + "grad_norm": 2.084657360357404, + "learning_rate": 1.7918865762431794e-05, + "loss": 0.2937, + "step": 2939 + }, + { + "epoch": 0.23291740938799763, + "grad_norm": 2.3708023727606014, + "learning_rate": 1.7917298507758684e-05, + "loss": 0.2781, + "step": 2940 + }, + { + "epoch": 0.232996632996633, + "grad_norm": 1.9726311656189268, + "learning_rate": 1.7915730731769558e-05, + "loss": 0.2762, + "step": 2941 + }, + { + "epoch": 0.23307585660526836, + "grad_norm": 2.559575870281985, + "learning_rate": 1.7914162434567653e-05, + "loss": 0.4123, + "step": 2942 + }, + { + "epoch": 0.23315508021390374, + "grad_norm": 2.2646105556653824, + "learning_rate": 1.791259361625623e-05, + "loss": 0.4099, + "step": 2943 + }, + { + "epoch": 0.23323430382253912, + "grad_norm": 1.975575846869981, + "learning_rate": 1.7911024276938595e-05, + "loss": 0.3123, + "step": 2944 + }, + { + "epoch": 0.23331352743117448, + "grad_norm": 2.2585021786528845, + "learning_rate": 1.7909454416718075e-05, + "loss": 0.3184, + "step": 2945 + }, + { + "epoch": 0.23339275103980986, + "grad_norm": 2.0979119072745047, + "learning_rate": 1.790788403569804e-05, + "loss": 0.3558, + "step": 2946 + }, + { + "epoch": 0.23347197464844524, + "grad_norm": 2.191737209229685, + "learning_rate": 1.7906313133981887e-05, + "loss": 0.3658, + "step": 2947 + }, + { + "epoch": 0.23355119825708062, + "grad_norm": 2.040410647235041, + "learning_rate": 1.7904741711673064e-05, + "loss": 0.3607, + "step": 2948 + }, + { + "epoch": 0.23363042186571598, + "grad_norm": 1.83934056089197, + "learning_rate": 1.790316976887503e-05, + "loss": 0.2585, + "step": 2949 + }, + { + "epoch": 0.23370964547435136, + "grad_norm": 2.19022147701637, + "learning_rate": 1.7901597305691294e-05, + "loss": 0.305, + "step": 2950 + }, + { + "epoch": 0.23378886908298674, + "grad_norm": 2.200737740426933, + "learning_rate": 1.7900024322225394e-05, + "loss": 0.3319, + "step": 2951 + }, + { + "epoch": 0.2338680926916221, + "grad_norm": 2.024506765875974, + "learning_rate": 1.789845081858091e-05, + "loss": 0.393, + "step": 2952 + }, + { + "epoch": 0.23394731630025747, + "grad_norm": 2.130022689939545, + "learning_rate": 1.7896876794861443e-05, + "loss": 0.3281, + "step": 2953 + }, + { + "epoch": 0.23402653990889286, + "grad_norm": 1.9970757082441841, + "learning_rate": 1.7895302251170636e-05, + "loss": 0.3131, + "step": 2954 + }, + { + "epoch": 0.23410576351752824, + "grad_norm": 2.630158114518891, + "learning_rate": 1.789372718761216e-05, + "loss": 0.3403, + "step": 2955 + }, + { + "epoch": 0.2341849871261636, + "grad_norm": 2.426182844574413, + "learning_rate": 1.7892151604289738e-05, + "loss": 0.4409, + "step": 2956 + }, + { + "epoch": 0.23426421073479897, + "grad_norm": 2.5788186681048573, + "learning_rate": 1.7890575501307105e-05, + "loss": 0.3494, + "step": 2957 + }, + { + "epoch": 0.23434343434343435, + "grad_norm": 2.2183709054173018, + "learning_rate": 1.7888998878768045e-05, + "loss": 0.3267, + "step": 2958 + }, + { + "epoch": 0.2344226579520697, + "grad_norm": 1.8539802798757794, + "learning_rate": 1.7887421736776364e-05, + "loss": 0.2039, + "step": 2959 + }, + { + "epoch": 0.2345018815607051, + "grad_norm": 2.0528093366755304, + "learning_rate": 1.7885844075435915e-05, + "loss": 0.3669, + "step": 2960 + }, + { + "epoch": 0.23458110516934047, + "grad_norm": 2.6408979751390036, + "learning_rate": 1.788426589485058e-05, + "loss": 0.3789, + "step": 2961 + }, + { + "epoch": 0.23466032877797582, + "grad_norm": 2.314651527578581, + "learning_rate": 1.788268719512427e-05, + "loss": 0.3333, + "step": 2962 + }, + { + "epoch": 0.2347395523866112, + "grad_norm": 1.9330681335377402, + "learning_rate": 1.788110797636094e-05, + "loss": 0.3406, + "step": 2963 + }, + { + "epoch": 0.2348187759952466, + "grad_norm": 2.151811889862483, + "learning_rate": 1.7879528238664567e-05, + "loss": 0.3437, + "step": 2964 + }, + { + "epoch": 0.23489799960388197, + "grad_norm": 2.103264771537712, + "learning_rate": 1.7877947982139177e-05, + "loss": 0.3799, + "step": 2965 + }, + { + "epoch": 0.23497722321251732, + "grad_norm": 2.2414401511680517, + "learning_rate": 1.7876367206888817e-05, + "loss": 0.383, + "step": 2966 + }, + { + "epoch": 0.2350564468211527, + "grad_norm": 2.026698573553762, + "learning_rate": 1.7874785913017575e-05, + "loss": 0.2826, + "step": 2967 + }, + { + "epoch": 0.23513567042978809, + "grad_norm": 2.0734253480544504, + "learning_rate": 1.7873204100629572e-05, + "loss": 0.3227, + "step": 2968 + }, + { + "epoch": 0.23521489403842344, + "grad_norm": 2.240976789977317, + "learning_rate": 1.7871621769828965e-05, + "loss": 0.433, + "step": 2969 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 2.3453524362665417, + "learning_rate": 1.7870038920719935e-05, + "loss": 0.2354, + "step": 2970 + }, + { + "epoch": 0.2353733412556942, + "grad_norm": 2.4563098030517856, + "learning_rate": 1.7868455553406713e-05, + "loss": 0.467, + "step": 2971 + }, + { + "epoch": 0.23545256486432958, + "grad_norm": 2.0869761006063157, + "learning_rate": 1.7866871667993554e-05, + "loss": 0.3801, + "step": 2972 + }, + { + "epoch": 0.23553178847296494, + "grad_norm": 2.103244513662612, + "learning_rate": 1.786528726458475e-05, + "loss": 0.3371, + "step": 2973 + }, + { + "epoch": 0.23561101208160032, + "grad_norm": 1.763642524301725, + "learning_rate": 1.786370234328462e-05, + "loss": 0.3112, + "step": 2974 + }, + { + "epoch": 0.2356902356902357, + "grad_norm": 2.259186435823224, + "learning_rate": 1.7862116904197534e-05, + "loss": 0.2928, + "step": 2975 + }, + { + "epoch": 0.23576945929887105, + "grad_norm": 2.194096374360557, + "learning_rate": 1.7860530947427878e-05, + "loss": 0.4277, + "step": 2976 + }, + { + "epoch": 0.23584868290750644, + "grad_norm": 2.4190719272387464, + "learning_rate": 1.785894447308008e-05, + "loss": 0.4561, + "step": 2977 + }, + { + "epoch": 0.23592790651614182, + "grad_norm": 2.204531321281794, + "learning_rate": 1.7857357481258603e-05, + "loss": 0.2377, + "step": 2978 + }, + { + "epoch": 0.23600713012477717, + "grad_norm": 1.7940348682390632, + "learning_rate": 1.7855769972067944e-05, + "loss": 0.2291, + "step": 2979 + }, + { + "epoch": 0.23608635373341255, + "grad_norm": 2.155516029656373, + "learning_rate": 1.785418194561263e-05, + "loss": 0.3892, + "step": 2980 + }, + { + "epoch": 0.23616557734204793, + "grad_norm": 2.2386092723063276, + "learning_rate": 1.7852593401997232e-05, + "loss": 0.3022, + "step": 2981 + }, + { + "epoch": 0.23624480095068331, + "grad_norm": 2.469883286408481, + "learning_rate": 1.785100434132634e-05, + "loss": 0.3573, + "step": 2982 + }, + { + "epoch": 0.23632402455931867, + "grad_norm": 1.8210031716987176, + "learning_rate": 1.7849414763704587e-05, + "loss": 0.3088, + "step": 2983 + }, + { + "epoch": 0.23640324816795405, + "grad_norm": 1.959552410119995, + "learning_rate": 1.7847824669236643e-05, + "loss": 0.2877, + "step": 2984 + }, + { + "epoch": 0.23648247177658943, + "grad_norm": 2.4431766471431065, + "learning_rate": 1.7846234058027207e-05, + "loss": 0.3203, + "step": 2985 + }, + { + "epoch": 0.23656169538522479, + "grad_norm": 2.118968639306495, + "learning_rate": 1.7844642930181008e-05, + "loss": 0.3997, + "step": 2986 + }, + { + "epoch": 0.23664091899386017, + "grad_norm": 2.204189394400891, + "learning_rate": 1.7843051285802823e-05, + "loss": 0.4061, + "step": 2987 + }, + { + "epoch": 0.23672014260249555, + "grad_norm": 2.232015235732415, + "learning_rate": 1.7841459124997445e-05, + "loss": 0.364, + "step": 2988 + }, + { + "epoch": 0.23679936621113093, + "grad_norm": 1.9942646681901783, + "learning_rate": 1.7839866447869717e-05, + "loss": 0.3405, + "step": 2989 + }, + { + "epoch": 0.23687858981976628, + "grad_norm": 2.3141503160082544, + "learning_rate": 1.7838273254524505e-05, + "loss": 0.2969, + "step": 2990 + }, + { + "epoch": 0.23695781342840166, + "grad_norm": 2.670787782891005, + "learning_rate": 1.7836679545066712e-05, + "loss": 0.3358, + "step": 2991 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 2.306282591921337, + "learning_rate": 1.7835085319601283e-05, + "loss": 0.3328, + "step": 2992 + }, + { + "epoch": 0.2371162606456724, + "grad_norm": 2.715330238933233, + "learning_rate": 1.783349057823318e-05, + "loss": 0.3557, + "step": 2993 + }, + { + "epoch": 0.23719548425430778, + "grad_norm": 2.3837024061971426, + "learning_rate": 1.783189532106742e-05, + "loss": 0.4084, + "step": 2994 + }, + { + "epoch": 0.23727470786294316, + "grad_norm": 2.1521504224066206, + "learning_rate": 1.783029954820904e-05, + "loss": 0.4635, + "step": 2995 + }, + { + "epoch": 0.23735393147157854, + "grad_norm": 2.1020779633639086, + "learning_rate": 1.7828703259763107e-05, + "loss": 0.2583, + "step": 2996 + }, + { + "epoch": 0.2374331550802139, + "grad_norm": 2.3296842155950053, + "learning_rate": 1.782710645583473e-05, + "loss": 0.3461, + "step": 2997 + }, + { + "epoch": 0.23751237868884928, + "grad_norm": 2.220311054136137, + "learning_rate": 1.7825509136529065e-05, + "loss": 0.3565, + "step": 2998 + }, + { + "epoch": 0.23759160229748466, + "grad_norm": 2.16336576802573, + "learning_rate": 1.782391130195127e-05, + "loss": 0.3771, + "step": 2999 + }, + { + "epoch": 0.23767082590612001, + "grad_norm": 2.5511785599205323, + "learning_rate": 1.7822312952206565e-05, + "loss": 0.3897, + "step": 3000 + }, + { + "epoch": 0.2377500495147554, + "grad_norm": 1.9360200712131914, + "learning_rate": 1.782071408740019e-05, + "loss": 0.3399, + "step": 3001 + }, + { + "epoch": 0.23782927312339078, + "grad_norm": 2.3419628859501405, + "learning_rate": 1.781911470763742e-05, + "loss": 0.3839, + "step": 3002 + }, + { + "epoch": 0.23790849673202613, + "grad_norm": 2.202891523914208, + "learning_rate": 1.7817514813023577e-05, + "loss": 0.4076, + "step": 3003 + }, + { + "epoch": 0.2379877203406615, + "grad_norm": 3.0492547362517453, + "learning_rate": 1.781591440366399e-05, + "loss": 0.3353, + "step": 3004 + }, + { + "epoch": 0.2380669439492969, + "grad_norm": 2.058581745043511, + "learning_rate": 1.7814313479664054e-05, + "loss": 0.3043, + "step": 3005 + }, + { + "epoch": 0.23814616755793228, + "grad_norm": 2.1482913033857245, + "learning_rate": 1.781271204112917e-05, + "loss": 0.3708, + "step": 3006 + }, + { + "epoch": 0.23822539116656763, + "grad_norm": 2.594643206242862, + "learning_rate": 1.7811110088164797e-05, + "loss": 0.2876, + "step": 3007 + }, + { + "epoch": 0.238304614775203, + "grad_norm": 2.43691657155438, + "learning_rate": 1.7809507620876406e-05, + "loss": 0.275, + "step": 3008 + }, + { + "epoch": 0.2383838383838384, + "grad_norm": 2.4233811864736032, + "learning_rate": 1.7807904639369512e-05, + "loss": 0.4782, + "step": 3009 + }, + { + "epoch": 0.23846306199247375, + "grad_norm": 1.708587861395923, + "learning_rate": 1.7806301143749672e-05, + "loss": 0.2687, + "step": 3010 + }, + { + "epoch": 0.23854228560110913, + "grad_norm": 1.8973464408153016, + "learning_rate": 1.780469713412246e-05, + "loss": 0.2567, + "step": 3011 + }, + { + "epoch": 0.2386215092097445, + "grad_norm": 2.3199391900538306, + "learning_rate": 1.78030926105935e-05, + "loss": 0.3851, + "step": 3012 + }, + { + "epoch": 0.2387007328183799, + "grad_norm": 2.0066302029158525, + "learning_rate": 1.7801487573268433e-05, + "loss": 0.3211, + "step": 3013 + }, + { + "epoch": 0.23877995642701524, + "grad_norm": 2.30231325136663, + "learning_rate": 1.7799882022252948e-05, + "loss": 0.3048, + "step": 3014 + }, + { + "epoch": 0.23885918003565063, + "grad_norm": 2.746329081825597, + "learning_rate": 1.7798275957652764e-05, + "loss": 0.3981, + "step": 3015 + }, + { + "epoch": 0.238938403644286, + "grad_norm": 2.2620152695474403, + "learning_rate": 1.779666937957363e-05, + "loss": 0.3382, + "step": 3016 + }, + { + "epoch": 0.23901762725292136, + "grad_norm": 2.1045241580740015, + "learning_rate": 1.7795062288121335e-05, + "loss": 0.3244, + "step": 3017 + }, + { + "epoch": 0.23909685086155674, + "grad_norm": 2.4100173773549582, + "learning_rate": 1.7793454683401692e-05, + "loss": 0.3069, + "step": 3018 + }, + { + "epoch": 0.23917607447019212, + "grad_norm": 2.1408494116591266, + "learning_rate": 1.779184656552056e-05, + "loss": 0.2665, + "step": 3019 + }, + { + "epoch": 0.23925529807882748, + "grad_norm": 2.0235463672632874, + "learning_rate": 1.7790237934583824e-05, + "loss": 0.2889, + "step": 3020 + }, + { + "epoch": 0.23933452168746286, + "grad_norm": 2.365147754866758, + "learning_rate": 1.7788628790697404e-05, + "loss": 0.3066, + "step": 3021 + }, + { + "epoch": 0.23941374529609824, + "grad_norm": 2.0202504708071753, + "learning_rate": 1.7787019133967252e-05, + "loss": 0.299, + "step": 3022 + }, + { + "epoch": 0.23949296890473362, + "grad_norm": 2.7561703045597525, + "learning_rate": 1.778540896449936e-05, + "loss": 0.43, + "step": 3023 + }, + { + "epoch": 0.23957219251336898, + "grad_norm": 2.1886780721159167, + "learning_rate": 1.778379828239975e-05, + "loss": 0.3512, + "step": 3024 + }, + { + "epoch": 0.23965141612200436, + "grad_norm": 2.534728288934522, + "learning_rate": 1.778218708777448e-05, + "loss": 0.241, + "step": 3025 + }, + { + "epoch": 0.23973063973063974, + "grad_norm": 1.9806599458068723, + "learning_rate": 1.7780575380729626e-05, + "loss": 0.3173, + "step": 3026 + }, + { + "epoch": 0.2398098633392751, + "grad_norm": 2.1423951082006987, + "learning_rate": 1.777896316137133e-05, + "loss": 0.2625, + "step": 3027 + }, + { + "epoch": 0.23988908694791047, + "grad_norm": 2.0676769322150794, + "learning_rate": 1.7777350429805734e-05, + "loss": 0.3346, + "step": 3028 + }, + { + "epoch": 0.23996831055654586, + "grad_norm": 2.7227403699979034, + "learning_rate": 1.777573718613904e-05, + "loss": 0.3865, + "step": 3029 + }, + { + "epoch": 0.24004753416518124, + "grad_norm": 2.090600867992233, + "learning_rate": 1.7774123430477464e-05, + "loss": 0.3416, + "step": 3030 + }, + { + "epoch": 0.2401267577738166, + "grad_norm": 2.2450937188770856, + "learning_rate": 1.7772509162927266e-05, + "loss": 0.3113, + "step": 3031 + }, + { + "epoch": 0.24020598138245197, + "grad_norm": 2.2750319212185977, + "learning_rate": 1.7770894383594737e-05, + "loss": 0.3003, + "step": 3032 + }, + { + "epoch": 0.24028520499108735, + "grad_norm": 2.243129801107735, + "learning_rate": 1.7769279092586205e-05, + "loss": 0.3222, + "step": 3033 + }, + { + "epoch": 0.2403644285997227, + "grad_norm": 1.8822340273952287, + "learning_rate": 1.776766329000803e-05, + "loss": 0.2315, + "step": 3034 + }, + { + "epoch": 0.2404436522083581, + "grad_norm": 2.035766777504126, + "learning_rate": 1.7766046975966603e-05, + "loss": 0.2886, + "step": 3035 + }, + { + "epoch": 0.24052287581699347, + "grad_norm": 2.507059382117977, + "learning_rate": 1.7764430150568347e-05, + "loss": 0.371, + "step": 3036 + }, + { + "epoch": 0.24060209942562882, + "grad_norm": 2.068633899948601, + "learning_rate": 1.776281281391973e-05, + "loss": 0.365, + "step": 3037 + }, + { + "epoch": 0.2406813230342642, + "grad_norm": 2.063740568498424, + "learning_rate": 1.776119496612724e-05, + "loss": 0.3588, + "step": 3038 + }, + { + "epoch": 0.2407605466428996, + "grad_norm": 2.0993081533658153, + "learning_rate": 1.7759576607297405e-05, + "loss": 0.3467, + "step": 3039 + }, + { + "epoch": 0.24083977025153497, + "grad_norm": 2.218139493602942, + "learning_rate": 1.7757957737536785e-05, + "loss": 0.3084, + "step": 3040 + }, + { + "epoch": 0.24091899386017032, + "grad_norm": 2.4767433228236504, + "learning_rate": 1.775633835695198e-05, + "loss": 0.4666, + "step": 3041 + }, + { + "epoch": 0.2409982174688057, + "grad_norm": 2.3302377873706135, + "learning_rate": 1.7754718465649618e-05, + "loss": 0.3576, + "step": 3042 + }, + { + "epoch": 0.24107744107744108, + "grad_norm": 2.1386286466831876, + "learning_rate": 1.7753098063736355e-05, + "loss": 0.3528, + "step": 3043 + }, + { + "epoch": 0.24115666468607644, + "grad_norm": 2.3938446628579446, + "learning_rate": 1.775147715131889e-05, + "loss": 0.3669, + "step": 3044 + }, + { + "epoch": 0.24123588829471182, + "grad_norm": 2.151310289353262, + "learning_rate": 1.7749855728503952e-05, + "loss": 0.266, + "step": 3045 + }, + { + "epoch": 0.2413151119033472, + "grad_norm": 2.3082344293082038, + "learning_rate": 1.7748233795398308e-05, + "loss": 0.3789, + "step": 3046 + }, + { + "epoch": 0.24139433551198258, + "grad_norm": 2.2287592132944476, + "learning_rate": 1.7746611352108744e-05, + "loss": 0.3057, + "step": 3047 + }, + { + "epoch": 0.24147355912061794, + "grad_norm": 2.2708771080614985, + "learning_rate": 1.7744988398742102e-05, + "loss": 0.3123, + "step": 3048 + }, + { + "epoch": 0.24155278272925332, + "grad_norm": 1.9809143706483787, + "learning_rate": 1.7743364935405238e-05, + "loss": 0.406, + "step": 3049 + }, + { + "epoch": 0.2416320063378887, + "grad_norm": 2.455023056700634, + "learning_rate": 1.7741740962205053e-05, + "loss": 0.2639, + "step": 3050 + }, + { + "epoch": 0.24171122994652405, + "grad_norm": 1.9763886126881152, + "learning_rate": 1.7740116479248474e-05, + "loss": 0.3797, + "step": 3051 + }, + { + "epoch": 0.24179045355515943, + "grad_norm": 2.2670810089222986, + "learning_rate": 1.773849148664247e-05, + "loss": 0.3623, + "step": 3052 + }, + { + "epoch": 0.24186967716379482, + "grad_norm": 2.278500974717792, + "learning_rate": 1.773686598449404e-05, + "loss": 0.3534, + "step": 3053 + }, + { + "epoch": 0.2419489007724302, + "grad_norm": 2.3668784101903046, + "learning_rate": 1.7735239972910208e-05, + "loss": 0.3174, + "step": 3054 + }, + { + "epoch": 0.24202812438106555, + "grad_norm": 2.3356926312568165, + "learning_rate": 1.7733613451998043e-05, + "loss": 0.2694, + "step": 3055 + }, + { + "epoch": 0.24210734798970093, + "grad_norm": 2.3631825641365802, + "learning_rate": 1.7731986421864645e-05, + "loss": 0.4334, + "step": 3056 + }, + { + "epoch": 0.24218657159833631, + "grad_norm": 2.2723942755041717, + "learning_rate": 1.7730358882617148e-05, + "loss": 0.4883, + "step": 3057 + }, + { + "epoch": 0.24226579520697167, + "grad_norm": 1.8995735427260922, + "learning_rate": 1.772873083436271e-05, + "loss": 0.282, + "step": 3058 + }, + { + "epoch": 0.24234501881560705, + "grad_norm": 1.7995097592245324, + "learning_rate": 1.7727102277208538e-05, + "loss": 0.2518, + "step": 3059 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 2.093629870468095, + "learning_rate": 1.772547321126186e-05, + "loss": 0.2826, + "step": 3060 + }, + { + "epoch": 0.24250346603287778, + "grad_norm": 2.0709187442149433, + "learning_rate": 1.7723843636629945e-05, + "loss": 0.2707, + "step": 3061 + }, + { + "epoch": 0.24258268964151317, + "grad_norm": 2.1257309057385902, + "learning_rate": 1.772221355342009e-05, + "loss": 0.2861, + "step": 3062 + }, + { + "epoch": 0.24266191325014855, + "grad_norm": 1.9676148121551202, + "learning_rate": 1.7720582961739628e-05, + "loss": 0.367, + "step": 3063 + }, + { + "epoch": 0.24274113685878393, + "grad_norm": 1.7386659213282003, + "learning_rate": 1.771895186169593e-05, + "loss": 0.2479, + "step": 3064 + }, + { + "epoch": 0.24282036046741928, + "grad_norm": 1.9995906052795158, + "learning_rate": 1.7717320253396393e-05, + "loss": 0.2695, + "step": 3065 + }, + { + "epoch": 0.24289958407605466, + "grad_norm": 2.194517693241196, + "learning_rate": 1.771568813694845e-05, + "loss": 0.3283, + "step": 3066 + }, + { + "epoch": 0.24297880768469005, + "grad_norm": 2.3772568656152013, + "learning_rate": 1.771405551245957e-05, + "loss": 0.2864, + "step": 3067 + }, + { + "epoch": 0.2430580312933254, + "grad_norm": 2.2628394684525666, + "learning_rate": 1.771242238003725e-05, + "loss": 0.4026, + "step": 3068 + }, + { + "epoch": 0.24313725490196078, + "grad_norm": 2.203306792565748, + "learning_rate": 1.7710788739789025e-05, + "loss": 0.2789, + "step": 3069 + }, + { + "epoch": 0.24321647851059616, + "grad_norm": 2.147749712669287, + "learning_rate": 1.7709154591822466e-05, + "loss": 0.3099, + "step": 3070 + }, + { + "epoch": 0.24329570211923154, + "grad_norm": 2.3707556466582265, + "learning_rate": 1.770751993624517e-05, + "loss": 0.3582, + "step": 3071 + }, + { + "epoch": 0.2433749257278669, + "grad_norm": 2.295430963197724, + "learning_rate": 1.770588477316477e-05, + "loss": 0.307, + "step": 3072 + }, + { + "epoch": 0.24345414933650228, + "grad_norm": 2.340305414586974, + "learning_rate": 1.770424910268894e-05, + "loss": 0.3244, + "step": 3073 + }, + { + "epoch": 0.24353337294513766, + "grad_norm": 2.6355225182577775, + "learning_rate": 1.7702612924925377e-05, + "loss": 0.3358, + "step": 3074 + }, + { + "epoch": 0.24361259655377301, + "grad_norm": 2.295335817741724, + "learning_rate": 1.7700976239981815e-05, + "loss": 0.3329, + "step": 3075 + }, + { + "epoch": 0.2436918201624084, + "grad_norm": 2.0403956444284344, + "learning_rate": 1.769933904796602e-05, + "loss": 0.2732, + "step": 3076 + }, + { + "epoch": 0.24377104377104378, + "grad_norm": 2.3723541641072567, + "learning_rate": 1.76977013489858e-05, + "loss": 0.3892, + "step": 3077 + }, + { + "epoch": 0.24385026737967913, + "grad_norm": 1.8792625787878923, + "learning_rate": 1.7696063143148982e-05, + "loss": 0.2877, + "step": 3078 + }, + { + "epoch": 0.2439294909883145, + "grad_norm": 2.046997372938021, + "learning_rate": 1.7694424430563436e-05, + "loss": 0.3901, + "step": 3079 + }, + { + "epoch": 0.2440087145969499, + "grad_norm": 1.9778716937543714, + "learning_rate": 1.769278521133707e-05, + "loss": 0.2999, + "step": 3080 + }, + { + "epoch": 0.24408793820558528, + "grad_norm": 1.7475003812751837, + "learning_rate": 1.769114548557781e-05, + "loss": 0.2259, + "step": 3081 + }, + { + "epoch": 0.24416716181422063, + "grad_norm": 1.8632732087521158, + "learning_rate": 1.768950525339362e-05, + "loss": 0.2879, + "step": 3082 + }, + { + "epoch": 0.244246385422856, + "grad_norm": 1.751176214358662, + "learning_rate": 1.7687864514892516e-05, + "loss": 0.2833, + "step": 3083 + }, + { + "epoch": 0.2443256090314914, + "grad_norm": 2.5221530820825264, + "learning_rate": 1.7686223270182524e-05, + "loss": 0.3853, + "step": 3084 + }, + { + "epoch": 0.24440483264012675, + "grad_norm": 2.2674015377016343, + "learning_rate": 1.7684581519371714e-05, + "loss": 0.3143, + "step": 3085 + }, + { + "epoch": 0.24448405624876213, + "grad_norm": 2.296367686739741, + "learning_rate": 1.768293926256819e-05, + "loss": 0.4126, + "step": 3086 + }, + { + "epoch": 0.2445632798573975, + "grad_norm": 1.9070545827895027, + "learning_rate": 1.7681296499880077e-05, + "loss": 0.295, + "step": 3087 + }, + { + "epoch": 0.2446425034660329, + "grad_norm": 1.949526017970783, + "learning_rate": 1.767965323141555e-05, + "loss": 0.2968, + "step": 3088 + }, + { + "epoch": 0.24472172707466824, + "grad_norm": 1.9152766318108931, + "learning_rate": 1.7678009457282816e-05, + "loss": 0.2791, + "step": 3089 + }, + { + "epoch": 0.24480095068330363, + "grad_norm": 2.409317888612591, + "learning_rate": 1.7676365177590097e-05, + "loss": 0.2913, + "step": 3090 + }, + { + "epoch": 0.244880174291939, + "grad_norm": 2.0151659724570608, + "learning_rate": 1.7674720392445672e-05, + "loss": 0.2866, + "step": 3091 + }, + { + "epoch": 0.24495939790057436, + "grad_norm": 2.128844799648101, + "learning_rate": 1.7673075101957837e-05, + "loss": 0.4401, + "step": 3092 + }, + { + "epoch": 0.24503862150920974, + "grad_norm": 2.589473703920038, + "learning_rate": 1.7671429306234924e-05, + "loss": 0.3183, + "step": 3093 + }, + { + "epoch": 0.24511784511784512, + "grad_norm": 1.9695204570266884, + "learning_rate": 1.7669783005385305e-05, + "loss": 0.2936, + "step": 3094 + }, + { + "epoch": 0.2451970687264805, + "grad_norm": 2.1502707856048096, + "learning_rate": 1.766813619951738e-05, + "loss": 0.3922, + "step": 3095 + }, + { + "epoch": 0.24527629233511586, + "grad_norm": 2.004752127042267, + "learning_rate": 1.7666488888739587e-05, + "loss": 0.3082, + "step": 3096 + }, + { + "epoch": 0.24535551594375124, + "grad_norm": 2.4496955910922007, + "learning_rate": 1.7664841073160383e-05, + "loss": 0.4009, + "step": 3097 + }, + { + "epoch": 0.24543473955238662, + "grad_norm": 2.0343759382155935, + "learning_rate": 1.766319275288828e-05, + "loss": 0.38, + "step": 3098 + }, + { + "epoch": 0.24551396316102198, + "grad_norm": 2.0328476675122316, + "learning_rate": 1.7661543928031802e-05, + "loss": 0.2678, + "step": 3099 + }, + { + "epoch": 0.24559318676965736, + "grad_norm": 2.072807095377211, + "learning_rate": 1.7659894598699527e-05, + "loss": 0.3471, + "step": 3100 + }, + { + "epoch": 0.24567241037829274, + "grad_norm": 1.9629062381503406, + "learning_rate": 1.765824476500005e-05, + "loss": 0.3397, + "step": 3101 + }, + { + "epoch": 0.2457516339869281, + "grad_norm": 2.2431491849824563, + "learning_rate": 1.7656594427041997e-05, + "loss": 0.3726, + "step": 3102 + }, + { + "epoch": 0.24583085759556347, + "grad_norm": 2.5833704438715817, + "learning_rate": 1.765494358493405e-05, + "loss": 0.3105, + "step": 3103 + }, + { + "epoch": 0.24591008120419885, + "grad_norm": 1.7363134813358179, + "learning_rate": 1.7653292238784897e-05, + "loss": 0.3231, + "step": 3104 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 2.0652764244106976, + "learning_rate": 1.7651640388703275e-05, + "loss": 0.3944, + "step": 3105 + }, + { + "epoch": 0.2460685284214696, + "grad_norm": 1.9505530580115473, + "learning_rate": 1.7649988034797952e-05, + "loss": 0.3854, + "step": 3106 + }, + { + "epoch": 0.24614775203010497, + "grad_norm": 2.435976544033609, + "learning_rate": 1.7648335177177725e-05, + "loss": 0.2442, + "step": 3107 + }, + { + "epoch": 0.24622697563874035, + "grad_norm": 2.2550878285340716, + "learning_rate": 1.764668181595143e-05, + "loss": 0.3357, + "step": 3108 + }, + { + "epoch": 0.2463061992473757, + "grad_norm": 2.2145810238031007, + "learning_rate": 1.764502795122793e-05, + "loss": 0.5642, + "step": 3109 + }, + { + "epoch": 0.2463854228560111, + "grad_norm": 2.7841251270083096, + "learning_rate": 1.7643373583116123e-05, + "loss": 0.3687, + "step": 3110 + }, + { + "epoch": 0.24646464646464647, + "grad_norm": 2.4664735729556955, + "learning_rate": 1.7641718711724947e-05, + "loss": 0.3189, + "step": 3111 + }, + { + "epoch": 0.24654387007328185, + "grad_norm": 1.7583655464796075, + "learning_rate": 1.764006333716336e-05, + "loss": 0.2496, + "step": 3112 + }, + { + "epoch": 0.2466230936819172, + "grad_norm": 2.0988095088574545, + "learning_rate": 1.7638407459540364e-05, + "loss": 0.3793, + "step": 3113 + }, + { + "epoch": 0.2467023172905526, + "grad_norm": 2.385704840297801, + "learning_rate": 1.7636751078964995e-05, + "loss": 0.3551, + "step": 3114 + }, + { + "epoch": 0.24678154089918797, + "grad_norm": 2.0017480505349097, + "learning_rate": 1.763509419554631e-05, + "loss": 0.309, + "step": 3115 + }, + { + "epoch": 0.24686076450782332, + "grad_norm": 2.0401873798694736, + "learning_rate": 1.763343680939341e-05, + "loss": 0.2781, + "step": 3116 + }, + { + "epoch": 0.2469399881164587, + "grad_norm": 2.029527798423958, + "learning_rate": 1.7631778920615427e-05, + "loss": 0.3596, + "step": 3117 + }, + { + "epoch": 0.24701921172509408, + "grad_norm": 2.3325494116650636, + "learning_rate": 1.7630120529321518e-05, + "loss": 0.3624, + "step": 3118 + }, + { + "epoch": 0.24709843533372944, + "grad_norm": 1.8482318735712697, + "learning_rate": 1.7628461635620895e-05, + "loss": 0.2841, + "step": 3119 + }, + { + "epoch": 0.24717765894236482, + "grad_norm": 1.9049012351755752, + "learning_rate": 1.7626802239622772e-05, + "loss": 0.3095, + "step": 3120 + }, + { + "epoch": 0.2472568825510002, + "grad_norm": 2.2047090684422312, + "learning_rate": 1.7625142341436423e-05, + "loss": 0.3357, + "step": 3121 + }, + { + "epoch": 0.24733610615963558, + "grad_norm": 2.0725570285729544, + "learning_rate": 1.762348194117114e-05, + "loss": 0.2257, + "step": 3122 + }, + { + "epoch": 0.24741532976827094, + "grad_norm": 2.134085836909251, + "learning_rate": 1.7621821038936257e-05, + "loss": 0.3571, + "step": 3123 + }, + { + "epoch": 0.24749455337690632, + "grad_norm": 2.1048871477271964, + "learning_rate": 1.7620159634841127e-05, + "loss": 0.3935, + "step": 3124 + }, + { + "epoch": 0.2475737769855417, + "grad_norm": 2.495189390379055, + "learning_rate": 1.761849772899515e-05, + "loss": 0.3773, + "step": 3125 + }, + { + "epoch": 0.24765300059417705, + "grad_norm": 2.4230035380106365, + "learning_rate": 1.7616835321507757e-05, + "loss": 0.3151, + "step": 3126 + }, + { + "epoch": 0.24773222420281243, + "grad_norm": 1.9173416923560322, + "learning_rate": 1.761517241248841e-05, + "loss": 0.1984, + "step": 3127 + }, + { + "epoch": 0.24781144781144782, + "grad_norm": 2.1033695195367734, + "learning_rate": 1.76135090020466e-05, + "loss": 0.2874, + "step": 3128 + }, + { + "epoch": 0.2478906714200832, + "grad_norm": 2.187014033601868, + "learning_rate": 1.7611845090291858e-05, + "loss": 0.4129, + "step": 3129 + }, + { + "epoch": 0.24796989502871855, + "grad_norm": 2.2368177220868253, + "learning_rate": 1.761018067733374e-05, + "loss": 0.3864, + "step": 3130 + }, + { + "epoch": 0.24804911863735393, + "grad_norm": 2.200626684729094, + "learning_rate": 1.7608515763281843e-05, + "loss": 0.351, + "step": 3131 + }, + { + "epoch": 0.24812834224598931, + "grad_norm": 2.970326823529292, + "learning_rate": 1.760685034824579e-05, + "loss": 0.3877, + "step": 3132 + }, + { + "epoch": 0.24820756585462467, + "grad_norm": 2.2838672524593404, + "learning_rate": 1.760518443233525e-05, + "loss": 0.3372, + "step": 3133 + }, + { + "epoch": 0.24828678946326005, + "grad_norm": 2.135225685427978, + "learning_rate": 1.7603518015659905e-05, + "loss": 0.4513, + "step": 3134 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 1.7243109491541246, + "learning_rate": 1.7601851098329484e-05, + "loss": 0.3649, + "step": 3135 + }, + { + "epoch": 0.24844523668053078, + "grad_norm": 2.1079321137847726, + "learning_rate": 1.7600183680453745e-05, + "loss": 0.3258, + "step": 3136 + }, + { + "epoch": 0.24852446028916617, + "grad_norm": 2.2520346222407084, + "learning_rate": 1.7598515762142484e-05, + "loss": 0.2979, + "step": 3137 + }, + { + "epoch": 0.24860368389780155, + "grad_norm": 2.20246326597006, + "learning_rate": 1.759684734350552e-05, + "loss": 0.4521, + "step": 3138 + }, + { + "epoch": 0.24868290750643693, + "grad_norm": 1.9578842163574657, + "learning_rate": 1.759517842465271e-05, + "loss": 0.2906, + "step": 3139 + }, + { + "epoch": 0.24876213111507228, + "grad_norm": 1.9412696546710464, + "learning_rate": 1.759350900569395e-05, + "loss": 0.3337, + "step": 3140 + }, + { + "epoch": 0.24884135472370766, + "grad_norm": 2.0308340112722765, + "learning_rate": 1.759183908673916e-05, + "loss": 0.3341, + "step": 3141 + }, + { + "epoch": 0.24892057833234305, + "grad_norm": 2.194955702738754, + "learning_rate": 1.759016866789829e-05, + "loss": 0.2867, + "step": 3142 + }, + { + "epoch": 0.2489998019409784, + "grad_norm": 2.6247322127504145, + "learning_rate": 1.7588497749281338e-05, + "loss": 0.3331, + "step": 3143 + }, + { + "epoch": 0.24907902554961378, + "grad_norm": 2.373571487890507, + "learning_rate": 1.7586826330998324e-05, + "loss": 0.332, + "step": 3144 + }, + { + "epoch": 0.24915824915824916, + "grad_norm": 2.101409320869915, + "learning_rate": 1.7585154413159304e-05, + "loss": 0.3089, + "step": 3145 + }, + { + "epoch": 0.24923747276688454, + "grad_norm": 2.584851109657674, + "learning_rate": 1.758348199587436e-05, + "loss": 0.3053, + "step": 3146 + }, + { + "epoch": 0.2493166963755199, + "grad_norm": 1.9170160278008994, + "learning_rate": 1.7581809079253616e-05, + "loss": 0.297, + "step": 3147 + }, + { + "epoch": 0.24939591998415528, + "grad_norm": 2.276804387586002, + "learning_rate": 1.7580135663407226e-05, + "loss": 0.4009, + "step": 3148 + }, + { + "epoch": 0.24947514359279066, + "grad_norm": 2.269897095319302, + "learning_rate": 1.7578461748445374e-05, + "loss": 0.3911, + "step": 3149 + }, + { + "epoch": 0.24955436720142601, + "grad_norm": 2.194189906855524, + "learning_rate": 1.7576787334478283e-05, + "loss": 0.3068, + "step": 3150 + }, + { + "epoch": 0.2496335908100614, + "grad_norm": 1.8365439412557574, + "learning_rate": 1.7575112421616203e-05, + "loss": 0.2874, + "step": 3151 + }, + { + "epoch": 0.24971281441869678, + "grad_norm": 2.3217891060378557, + "learning_rate": 1.757343700996942e-05, + "loss": 0.3028, + "step": 3152 + }, + { + "epoch": 0.24979203802733216, + "grad_norm": 2.3486808110748414, + "learning_rate": 1.757176109964825e-05, + "loss": 0.3674, + "step": 3153 + }, + { + "epoch": 0.2498712616359675, + "grad_norm": 2.492330136611301, + "learning_rate": 1.7570084690763042e-05, + "loss": 0.4028, + "step": 3154 + }, + { + "epoch": 0.2499504852446029, + "grad_norm": 1.991474236755262, + "learning_rate": 1.7568407783424187e-05, + "loss": 0.3106, + "step": 3155 + }, + { + "epoch": 0.2500297088532383, + "grad_norm": 2.0594099059834066, + "learning_rate": 1.7566730377742093e-05, + "loss": 0.3264, + "step": 3156 + }, + { + "epoch": 0.25010893246187366, + "grad_norm": 1.9855669044250501, + "learning_rate": 1.7565052473827213e-05, + "loss": 0.283, + "step": 3157 + }, + { + "epoch": 0.25018815607050904, + "grad_norm": 1.9828299502358042, + "learning_rate": 1.7563374071790028e-05, + "loss": 0.2596, + "step": 3158 + }, + { + "epoch": 0.25026737967914436, + "grad_norm": 2.528195858559667, + "learning_rate": 1.7561695171741054e-05, + "loss": 0.3769, + "step": 3159 + }, + { + "epoch": 0.25034660328777975, + "grad_norm": 1.788766226959051, + "learning_rate": 1.7560015773790837e-05, + "loss": 0.3699, + "step": 3160 + }, + { + "epoch": 0.2504258268964151, + "grad_norm": 2.210841811229164, + "learning_rate": 1.7558335878049955e-05, + "loss": 0.4443, + "step": 3161 + }, + { + "epoch": 0.2505050505050505, + "grad_norm": 1.860121709352534, + "learning_rate": 1.7556655484629028e-05, + "loss": 0.3621, + "step": 3162 + }, + { + "epoch": 0.2505842741136859, + "grad_norm": 2.663115144540321, + "learning_rate": 1.7554974593638697e-05, + "loss": 0.3224, + "step": 3163 + }, + { + "epoch": 0.25066349772232127, + "grad_norm": 1.670794074242099, + "learning_rate": 1.755329320518964e-05, + "loss": 0.3041, + "step": 3164 + }, + { + "epoch": 0.25074272133095665, + "grad_norm": 2.152913748232404, + "learning_rate": 1.7551611319392573e-05, + "loss": 0.375, + "step": 3165 + }, + { + "epoch": 0.250821944939592, + "grad_norm": 1.5217735878440846, + "learning_rate": 1.7549928936358232e-05, + "loss": 0.2081, + "step": 3166 + }, + { + "epoch": 0.25090116854822736, + "grad_norm": 2.3463524928828225, + "learning_rate": 1.75482460561974e-05, + "loss": 0.3585, + "step": 3167 + }, + { + "epoch": 0.25098039215686274, + "grad_norm": 2.572257685272147, + "learning_rate": 1.7546562679020884e-05, + "loss": 0.2567, + "step": 3168 + }, + { + "epoch": 0.2510596157654981, + "grad_norm": 2.392915260597958, + "learning_rate": 1.7544878804939528e-05, + "loss": 0.2652, + "step": 3169 + }, + { + "epoch": 0.2511388393741335, + "grad_norm": 1.9444629666606124, + "learning_rate": 1.7543194434064208e-05, + "loss": 0.3158, + "step": 3170 + }, + { + "epoch": 0.2512180629827689, + "grad_norm": 1.8543573155822384, + "learning_rate": 1.754150956650583e-05, + "loss": 0.3228, + "step": 3171 + }, + { + "epoch": 0.2512972865914042, + "grad_norm": 2.066453649492031, + "learning_rate": 1.753982420237533e-05, + "loss": 0.3562, + "step": 3172 + }, + { + "epoch": 0.2513765102000396, + "grad_norm": 2.3144003054391384, + "learning_rate": 1.753813834178369e-05, + "loss": 0.4167, + "step": 3173 + }, + { + "epoch": 0.251455733808675, + "grad_norm": 1.8389229039143093, + "learning_rate": 1.753645198484191e-05, + "loss": 0.2271, + "step": 3174 + }, + { + "epoch": 0.25153495741731036, + "grad_norm": 1.7919746495512738, + "learning_rate": 1.753476513166103e-05, + "loss": 0.3353, + "step": 3175 + }, + { + "epoch": 0.25161418102594574, + "grad_norm": 1.7598198919129016, + "learning_rate": 1.7533077782352123e-05, + "loss": 0.3617, + "step": 3176 + }, + { + "epoch": 0.2516934046345811, + "grad_norm": 1.8798753175127672, + "learning_rate": 1.753138993702629e-05, + "loss": 0.2751, + "step": 3177 + }, + { + "epoch": 0.2517726282432165, + "grad_norm": 2.189914940785449, + "learning_rate": 1.752970159579467e-05, + "loss": 0.3447, + "step": 3178 + }, + { + "epoch": 0.2518518518518518, + "grad_norm": 2.114285745548243, + "learning_rate": 1.7528012758768426e-05, + "loss": 0.3189, + "step": 3179 + }, + { + "epoch": 0.2519310754604872, + "grad_norm": 2.2412589062856867, + "learning_rate": 1.7526323426058767e-05, + "loss": 0.3778, + "step": 3180 + }, + { + "epoch": 0.2520102990691226, + "grad_norm": 2.1606364122220993, + "learning_rate": 1.7524633597776923e-05, + "loss": 0.4242, + "step": 3181 + }, + { + "epoch": 0.25208952267775797, + "grad_norm": 1.9567124486250131, + "learning_rate": 1.7522943274034165e-05, + "loss": 0.3176, + "step": 3182 + }, + { + "epoch": 0.25216874628639335, + "grad_norm": 1.8611110382284815, + "learning_rate": 1.752125245494179e-05, + "loss": 0.3322, + "step": 3183 + }, + { + "epoch": 0.25224796989502873, + "grad_norm": 1.9982039060348933, + "learning_rate": 1.751956114061113e-05, + "loss": 0.3835, + "step": 3184 + }, + { + "epoch": 0.2523271935036641, + "grad_norm": 2.8968457584128906, + "learning_rate": 1.751786933115355e-05, + "loss": 0.4097, + "step": 3185 + }, + { + "epoch": 0.25240641711229944, + "grad_norm": 2.2111841014606206, + "learning_rate": 1.751617702668045e-05, + "loss": 0.2665, + "step": 3186 + }, + { + "epoch": 0.2524856407209348, + "grad_norm": 2.425423174013597, + "learning_rate": 1.751448422730326e-05, + "loss": 0.3385, + "step": 3187 + }, + { + "epoch": 0.2525648643295702, + "grad_norm": 2.180026919812749, + "learning_rate": 1.7512790933133435e-05, + "loss": 0.3916, + "step": 3188 + }, + { + "epoch": 0.2526440879382056, + "grad_norm": 2.0805844298936225, + "learning_rate": 1.7511097144282482e-05, + "loss": 0.3591, + "step": 3189 + }, + { + "epoch": 0.25272331154684097, + "grad_norm": 2.2950658830861177, + "learning_rate": 1.7509402860861923e-05, + "loss": 0.3946, + "step": 3190 + }, + { + "epoch": 0.25280253515547635, + "grad_norm": 2.3868853083364723, + "learning_rate": 1.7507708082983313e-05, + "loss": 0.3427, + "step": 3191 + }, + { + "epoch": 0.25288175876411173, + "grad_norm": 2.402135313830431, + "learning_rate": 1.7506012810758254e-05, + "loss": 0.4316, + "step": 3192 + }, + { + "epoch": 0.25296098237274706, + "grad_norm": 1.932140722868322, + "learning_rate": 1.750431704429837e-05, + "loss": 0.3912, + "step": 3193 + }, + { + "epoch": 0.25304020598138244, + "grad_norm": 2.0683311609142434, + "learning_rate": 1.7502620783715316e-05, + "loss": 0.3283, + "step": 3194 + }, + { + "epoch": 0.2531194295900178, + "grad_norm": 2.458022152617356, + "learning_rate": 1.7500924029120782e-05, + "loss": 0.3316, + "step": 3195 + }, + { + "epoch": 0.2531986531986532, + "grad_norm": 2.165077050442694, + "learning_rate": 1.7499226780626494e-05, + "loss": 0.3481, + "step": 3196 + }, + { + "epoch": 0.2532778768072886, + "grad_norm": 2.0504403696436744, + "learning_rate": 1.7497529038344208e-05, + "loss": 0.2968, + "step": 3197 + }, + { + "epoch": 0.25335710041592396, + "grad_norm": 2.0919997919238678, + "learning_rate": 1.7495830802385707e-05, + "loss": 0.3292, + "step": 3198 + }, + { + "epoch": 0.25343632402455935, + "grad_norm": 2.1356896504056566, + "learning_rate": 1.7494132072862818e-05, + "loss": 0.3231, + "step": 3199 + }, + { + "epoch": 0.25351554763319467, + "grad_norm": 2.0703937912551407, + "learning_rate": 1.7492432849887387e-05, + "loss": 0.3043, + "step": 3200 + }, + { + "epoch": 0.25359477124183005, + "grad_norm": 2.283695389893035, + "learning_rate": 1.749073313357131e-05, + "loss": 0.3684, + "step": 3201 + }, + { + "epoch": 0.25367399485046543, + "grad_norm": 1.6313716388147497, + "learning_rate": 1.7489032924026496e-05, + "loss": 0.2225, + "step": 3202 + }, + { + "epoch": 0.2537532184591008, + "grad_norm": 1.7962770489180542, + "learning_rate": 1.74873322213649e-05, + "loss": 0.2627, + "step": 3203 + }, + { + "epoch": 0.2538324420677362, + "grad_norm": 2.105269362629646, + "learning_rate": 1.7485631025698504e-05, + "loss": 0.3818, + "step": 3204 + }, + { + "epoch": 0.2539116656763716, + "grad_norm": 2.4373811253813904, + "learning_rate": 1.7483929337139326e-05, + "loss": 0.3456, + "step": 3205 + }, + { + "epoch": 0.2539908892850069, + "grad_norm": 1.8757043676973533, + "learning_rate": 1.748222715579941e-05, + "loss": 0.2276, + "step": 3206 + }, + { + "epoch": 0.2540701128936423, + "grad_norm": 2.212246242481772, + "learning_rate": 1.7480524481790835e-05, + "loss": 0.3669, + "step": 3207 + }, + { + "epoch": 0.25414933650227767, + "grad_norm": 1.9697219569320832, + "learning_rate": 1.7478821315225717e-05, + "loss": 0.2792, + "step": 3208 + }, + { + "epoch": 0.25422856011091305, + "grad_norm": 2.428866513284939, + "learning_rate": 1.7477117656216206e-05, + "loss": 0.5144, + "step": 3209 + }, + { + "epoch": 0.25430778371954843, + "grad_norm": 1.9880183386562895, + "learning_rate": 1.7475413504874474e-05, + "loss": 0.2765, + "step": 3210 + }, + { + "epoch": 0.2543870073281838, + "grad_norm": 2.3965006100719215, + "learning_rate": 1.7473708861312727e-05, + "loss": 0.3426, + "step": 3211 + }, + { + "epoch": 0.2544662309368192, + "grad_norm": 2.14779450836371, + "learning_rate": 1.7472003725643215e-05, + "loss": 0.3278, + "step": 3212 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 1.9644775960544885, + "learning_rate": 1.747029809797821e-05, + "loss": 0.2905, + "step": 3213 + }, + { + "epoch": 0.2546246781540899, + "grad_norm": 1.7695218249493199, + "learning_rate": 1.7468591978430024e-05, + "loss": 0.2559, + "step": 3214 + }, + { + "epoch": 0.2547039017627253, + "grad_norm": 1.9945889070636953, + "learning_rate": 1.746688536711099e-05, + "loss": 0.2843, + "step": 3215 + }, + { + "epoch": 0.25478312537136066, + "grad_norm": 2.0425649140663507, + "learning_rate": 1.7465178264133482e-05, + "loss": 0.3341, + "step": 3216 + }, + { + "epoch": 0.25486234897999605, + "grad_norm": 2.0042442532245652, + "learning_rate": 1.7463470669609907e-05, + "loss": 0.3296, + "step": 3217 + }, + { + "epoch": 0.2549415725886314, + "grad_norm": 2.081881892438971, + "learning_rate": 1.74617625836527e-05, + "loss": 0.3071, + "step": 3218 + }, + { + "epoch": 0.2550207961972668, + "grad_norm": 2.231791062220781, + "learning_rate": 1.746005400637433e-05, + "loss": 0.3477, + "step": 3219 + }, + { + "epoch": 0.25510001980590213, + "grad_norm": 1.9695744281777843, + "learning_rate": 1.74583449378873e-05, + "loss": 0.2839, + "step": 3220 + }, + { + "epoch": 0.2551792434145375, + "grad_norm": 1.7860489643023751, + "learning_rate": 1.7456635378304143e-05, + "loss": 0.3131, + "step": 3221 + }, + { + "epoch": 0.2552584670231729, + "grad_norm": 2.24608401418775, + "learning_rate": 1.7454925327737426e-05, + "loss": 0.3816, + "step": 3222 + }, + { + "epoch": 0.2553376906318083, + "grad_norm": 2.0872890517461014, + "learning_rate": 1.7453214786299746e-05, + "loss": 0.3019, + "step": 3223 + }, + { + "epoch": 0.25541691424044366, + "grad_norm": 2.0836811010193275, + "learning_rate": 1.7451503754103735e-05, + "loss": 0.3271, + "step": 3224 + }, + { + "epoch": 0.25549613784907904, + "grad_norm": 2.894784140181905, + "learning_rate": 1.7449792231262056e-05, + "loss": 0.3491, + "step": 3225 + }, + { + "epoch": 0.2555753614577144, + "grad_norm": 2.443183334778466, + "learning_rate": 1.7448080217887403e-05, + "loss": 0.3695, + "step": 3226 + }, + { + "epoch": 0.25565458506634975, + "grad_norm": 2.2047669206530878, + "learning_rate": 1.7446367714092508e-05, + "loss": 0.3523, + "step": 3227 + }, + { + "epoch": 0.25573380867498513, + "grad_norm": 1.8939969596676325, + "learning_rate": 1.7444654719990128e-05, + "loss": 0.2027, + "step": 3228 + }, + { + "epoch": 0.2558130322836205, + "grad_norm": 2.105530670544525, + "learning_rate": 1.7442941235693058e-05, + "loss": 0.4269, + "step": 3229 + }, + { + "epoch": 0.2558922558922559, + "grad_norm": 2.1493057193863403, + "learning_rate": 1.744122726131412e-05, + "loss": 0.3556, + "step": 3230 + }, + { + "epoch": 0.2559714795008913, + "grad_norm": 2.5560916004327727, + "learning_rate": 1.7439512796966165e-05, + "loss": 0.384, + "step": 3231 + }, + { + "epoch": 0.25605070310952666, + "grad_norm": 2.0765856221477232, + "learning_rate": 1.7437797842762098e-05, + "loss": 0.3012, + "step": 3232 + }, + { + "epoch": 0.25612992671816204, + "grad_norm": 2.2611887844681755, + "learning_rate": 1.743608239881483e-05, + "loss": 0.2891, + "step": 3233 + }, + { + "epoch": 0.25620915032679736, + "grad_norm": 1.5392627594504231, + "learning_rate": 1.7434366465237312e-05, + "loss": 0.2233, + "step": 3234 + }, + { + "epoch": 0.25628837393543275, + "grad_norm": 2.0866414541682583, + "learning_rate": 1.7432650042142535e-05, + "loss": 0.3982, + "step": 3235 + }, + { + "epoch": 0.2563675975440681, + "grad_norm": 2.43862585588995, + "learning_rate": 1.743093312964352e-05, + "loss": 0.4161, + "step": 3236 + }, + { + "epoch": 0.2564468211527035, + "grad_norm": 2.207354380002613, + "learning_rate": 1.742921572785331e-05, + "loss": 0.4492, + "step": 3237 + }, + { + "epoch": 0.2565260447613389, + "grad_norm": 1.7832572052544342, + "learning_rate": 1.7427497836884995e-05, + "loss": 0.3152, + "step": 3238 + }, + { + "epoch": 0.25660526836997427, + "grad_norm": 2.0479439293624937, + "learning_rate": 1.7425779456851683e-05, + "loss": 0.272, + "step": 3239 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 2.1496191465684693, + "learning_rate": 1.7424060587866526e-05, + "loss": 0.4464, + "step": 3240 + }, + { + "epoch": 0.256763715587245, + "grad_norm": 2.535979390949642, + "learning_rate": 1.74223412300427e-05, + "loss": 0.3754, + "step": 3241 + }, + { + "epoch": 0.25684293919588036, + "grad_norm": 1.8068336484004952, + "learning_rate": 1.7420621383493423e-05, + "loss": 0.2138, + "step": 3242 + }, + { + "epoch": 0.25692216280451574, + "grad_norm": 2.1913731656884368, + "learning_rate": 1.7418901048331927e-05, + "loss": 0.3604, + "step": 3243 + }, + { + "epoch": 0.2570013864131511, + "grad_norm": 2.3797284152895735, + "learning_rate": 1.7417180224671497e-05, + "loss": 0.3969, + "step": 3244 + }, + { + "epoch": 0.2570806100217865, + "grad_norm": 1.8754716905679043, + "learning_rate": 1.741545891262544e-05, + "loss": 0.3134, + "step": 3245 + }, + { + "epoch": 0.2571598336304219, + "grad_norm": 2.302959913561448, + "learning_rate": 1.7413737112307092e-05, + "loss": 0.4423, + "step": 3246 + }, + { + "epoch": 0.2572390572390572, + "grad_norm": 1.7052146413781168, + "learning_rate": 1.741201482382983e-05, + "loss": 0.3037, + "step": 3247 + }, + { + "epoch": 0.2573182808476926, + "grad_norm": 1.943526165275058, + "learning_rate": 1.7410292047307054e-05, + "loss": 0.3148, + "step": 3248 + }, + { + "epoch": 0.257397504456328, + "grad_norm": 1.8051129337523533, + "learning_rate": 1.7408568782852204e-05, + "loss": 0.258, + "step": 3249 + }, + { + "epoch": 0.25747672806496336, + "grad_norm": 2.330297661958351, + "learning_rate": 1.7406845030578747e-05, + "loss": 0.4081, + "step": 3250 + }, + { + "epoch": 0.25755595167359874, + "grad_norm": 2.1945372742787206, + "learning_rate": 1.7405120790600185e-05, + "loss": 0.2748, + "step": 3251 + }, + { + "epoch": 0.2576351752822341, + "grad_norm": 2.038801311904655, + "learning_rate": 1.740339606303005e-05, + "loss": 0.315, + "step": 3252 + }, + { + "epoch": 0.2577143988908695, + "grad_norm": 1.5262391755761546, + "learning_rate": 1.7401670847981906e-05, + "loss": 0.2739, + "step": 3253 + }, + { + "epoch": 0.2577936224995048, + "grad_norm": 1.98721405649299, + "learning_rate": 1.7399945145569353e-05, + "loss": 0.3137, + "step": 3254 + }, + { + "epoch": 0.2578728461081402, + "grad_norm": 2.1136067175226882, + "learning_rate": 1.7398218955906017e-05, + "loss": 0.323, + "step": 3255 + }, + { + "epoch": 0.2579520697167756, + "grad_norm": 2.1009780117040586, + "learning_rate": 1.7396492279105562e-05, + "loss": 0.3522, + "step": 3256 + }, + { + "epoch": 0.25803129332541097, + "grad_norm": 1.8529069187609732, + "learning_rate": 1.7394765115281678e-05, + "loss": 0.3032, + "step": 3257 + }, + { + "epoch": 0.25811051693404635, + "grad_norm": 2.260794264150886, + "learning_rate": 1.7393037464548094e-05, + "loss": 0.2797, + "step": 3258 + }, + { + "epoch": 0.25818974054268173, + "grad_norm": 2.2931706300882664, + "learning_rate": 1.7391309327018566e-05, + "loss": 0.2881, + "step": 3259 + }, + { + "epoch": 0.2582689641513171, + "grad_norm": 1.6868311582189306, + "learning_rate": 1.7389580702806884e-05, + "loss": 0.2794, + "step": 3260 + }, + { + "epoch": 0.25834818775995244, + "grad_norm": 2.392280611340802, + "learning_rate": 1.7387851592026868e-05, + "loss": 0.3857, + "step": 3261 + }, + { + "epoch": 0.2584274113685878, + "grad_norm": 2.0771443378287326, + "learning_rate": 1.738612199479237e-05, + "loss": 0.3751, + "step": 3262 + }, + { + "epoch": 0.2585066349772232, + "grad_norm": 2.413822918861285, + "learning_rate": 1.7384391911217283e-05, + "loss": 0.4064, + "step": 3263 + }, + { + "epoch": 0.2585858585858586, + "grad_norm": 2.062026400168057, + "learning_rate": 1.738266134141552e-05, + "loss": 0.4244, + "step": 3264 + }, + { + "epoch": 0.25866508219449397, + "grad_norm": 2.4641907316013016, + "learning_rate": 1.738093028550103e-05, + "loss": 0.4091, + "step": 3265 + }, + { + "epoch": 0.25874430580312935, + "grad_norm": 2.2061444371890433, + "learning_rate": 1.7379198743587794e-05, + "loss": 0.3615, + "step": 3266 + }, + { + "epoch": 0.25882352941176473, + "grad_norm": 2.0898042935744257, + "learning_rate": 1.7377466715789828e-05, + "loss": 0.271, + "step": 3267 + }, + { + "epoch": 0.25890275302040006, + "grad_norm": 1.9304710185047331, + "learning_rate": 1.7375734202221174e-05, + "loss": 0.2474, + "step": 3268 + }, + { + "epoch": 0.25898197662903544, + "grad_norm": 1.9235989854758635, + "learning_rate": 1.7374001202995918e-05, + "loss": 0.2316, + "step": 3269 + }, + { + "epoch": 0.2590612002376708, + "grad_norm": 1.8484448547070276, + "learning_rate": 1.7372267718228163e-05, + "loss": 0.2325, + "step": 3270 + }, + { + "epoch": 0.2591404238463062, + "grad_norm": 2.5725275746219034, + "learning_rate": 1.7370533748032047e-05, + "loss": 0.2746, + "step": 3271 + }, + { + "epoch": 0.2592196474549416, + "grad_norm": 1.8631633864043184, + "learning_rate": 1.7368799292521754e-05, + "loss": 0.3036, + "step": 3272 + }, + { + "epoch": 0.25929887106357696, + "grad_norm": 2.1290448621478633, + "learning_rate": 1.736706435181148e-05, + "loss": 0.4418, + "step": 3273 + }, + { + "epoch": 0.25937809467221234, + "grad_norm": 1.9321493077490677, + "learning_rate": 1.736532892601547e-05, + "loss": 0.3132, + "step": 3274 + }, + { + "epoch": 0.25945731828084767, + "grad_norm": 2.187271121697484, + "learning_rate": 1.7363593015247987e-05, + "loss": 0.3549, + "step": 3275 + }, + { + "epoch": 0.25953654188948305, + "grad_norm": 1.8224622208624737, + "learning_rate": 1.7361856619623338e-05, + "loss": 0.2881, + "step": 3276 + }, + { + "epoch": 0.25961576549811843, + "grad_norm": 1.6987950303485146, + "learning_rate": 1.736011973925585e-05, + "loss": 0.2133, + "step": 3277 + }, + { + "epoch": 0.2596949891067538, + "grad_norm": 2.3325877398108625, + "learning_rate": 1.7358382374259895e-05, + "loss": 0.3981, + "step": 3278 + }, + { + "epoch": 0.2597742127153892, + "grad_norm": 3.2234901755688954, + "learning_rate": 1.7356644524749867e-05, + "loss": 0.4248, + "step": 3279 + }, + { + "epoch": 0.2598534363240246, + "grad_norm": 1.9142990830082351, + "learning_rate": 1.7354906190840194e-05, + "loss": 0.3691, + "step": 3280 + }, + { + "epoch": 0.25993265993265996, + "grad_norm": 2.0571695158798238, + "learning_rate": 1.7353167372645337e-05, + "loss": 0.3494, + "step": 3281 + }, + { + "epoch": 0.2600118835412953, + "grad_norm": 1.8267685585052116, + "learning_rate": 1.735142807027979e-05, + "loss": 0.3425, + "step": 3282 + }, + { + "epoch": 0.26009110714993067, + "grad_norm": 2.2844424109868924, + "learning_rate": 1.734968828385808e-05, + "loss": 0.3369, + "step": 3283 + }, + { + "epoch": 0.26017033075856605, + "grad_norm": 1.8169862733311553, + "learning_rate": 1.7347948013494758e-05, + "loss": 0.2921, + "step": 3284 + }, + { + "epoch": 0.26024955436720143, + "grad_norm": 1.9677950581977128, + "learning_rate": 1.7346207259304415e-05, + "loss": 0.3779, + "step": 3285 + }, + { + "epoch": 0.2603287779758368, + "grad_norm": 1.8586838384070026, + "learning_rate": 1.7344466021401673e-05, + "loss": 0.3858, + "step": 3286 + }, + { + "epoch": 0.2604080015844722, + "grad_norm": 2.102587123892838, + "learning_rate": 1.734272429990118e-05, + "loss": 0.2525, + "step": 3287 + }, + { + "epoch": 0.2604872251931075, + "grad_norm": 2.0240899817033053, + "learning_rate": 1.7340982094917627e-05, + "loss": 0.3671, + "step": 3288 + }, + { + "epoch": 0.2605664488017429, + "grad_norm": 2.1359480032432083, + "learning_rate": 1.7339239406565723e-05, + "loss": 0.2414, + "step": 3289 + }, + { + "epoch": 0.2606456724103783, + "grad_norm": 1.8244353527446417, + "learning_rate": 1.733749623496022e-05, + "loss": 0.2628, + "step": 3290 + }, + { + "epoch": 0.26072489601901366, + "grad_norm": 1.995623870348072, + "learning_rate": 1.7335752580215898e-05, + "loss": 0.2407, + "step": 3291 + }, + { + "epoch": 0.26080411962764904, + "grad_norm": 1.9265589608442888, + "learning_rate": 1.733400844244756e-05, + "loss": 0.3299, + "step": 3292 + }, + { + "epoch": 0.2608833432362844, + "grad_norm": 2.0373546699445115, + "learning_rate": 1.733226382177006e-05, + "loss": 0.277, + "step": 3293 + }, + { + "epoch": 0.2609625668449198, + "grad_norm": 2.443805810877364, + "learning_rate": 1.7330518718298263e-05, + "loss": 0.352, + "step": 3294 + }, + { + "epoch": 0.26104179045355513, + "grad_norm": 2.102354642128771, + "learning_rate": 1.7328773132147086e-05, + "loss": 0.335, + "step": 3295 + }, + { + "epoch": 0.2611210140621905, + "grad_norm": 2.1423854862913654, + "learning_rate": 1.732702706343146e-05, + "loss": 0.388, + "step": 3296 + }, + { + "epoch": 0.2612002376708259, + "grad_norm": 2.0592330857010164, + "learning_rate": 1.7325280512266357e-05, + "loss": 0.3831, + "step": 3297 + }, + { + "epoch": 0.2612794612794613, + "grad_norm": 2.0217267599216697, + "learning_rate": 1.7323533478766777e-05, + "loss": 0.2938, + "step": 3298 + }, + { + "epoch": 0.26135868488809666, + "grad_norm": 2.504916276778093, + "learning_rate": 1.732178596304776e-05, + "loss": 0.3584, + "step": 3299 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 1.8943535790300179, + "learning_rate": 1.7320037965224365e-05, + "loss": 0.2428, + "step": 3300 + }, + { + "epoch": 0.2615171321053674, + "grad_norm": 1.7202663161805765, + "learning_rate": 1.731828948541169e-05, + "loss": 0.2775, + "step": 3301 + }, + { + "epoch": 0.26159635571400275, + "grad_norm": 2.067244447280836, + "learning_rate": 1.731654052372487e-05, + "loss": 0.2909, + "step": 3302 + }, + { + "epoch": 0.26167557932263813, + "grad_norm": 1.8371874674562263, + "learning_rate": 1.731479108027906e-05, + "loss": 0.368, + "step": 3303 + }, + { + "epoch": 0.2617548029312735, + "grad_norm": 1.9860994597058739, + "learning_rate": 1.7313041155189454e-05, + "loss": 0.4676, + "step": 3304 + }, + { + "epoch": 0.2618340265399089, + "grad_norm": 1.9991120807496603, + "learning_rate": 1.7311290748571273e-05, + "loss": 0.3268, + "step": 3305 + }, + { + "epoch": 0.2619132501485443, + "grad_norm": 2.0800115964919854, + "learning_rate": 1.7309539860539783e-05, + "loss": 0.306, + "step": 3306 + }, + { + "epoch": 0.26199247375717966, + "grad_norm": 1.7009326575219386, + "learning_rate": 1.7307788491210257e-05, + "loss": 0.2824, + "step": 3307 + }, + { + "epoch": 0.26207169736581504, + "grad_norm": 2.1062715591386185, + "learning_rate": 1.7306036640698024e-05, + "loss": 0.2678, + "step": 3308 + }, + { + "epoch": 0.26215092097445036, + "grad_norm": 2.185510246191739, + "learning_rate": 1.7304284309118436e-05, + "loss": 0.4491, + "step": 3309 + }, + { + "epoch": 0.26223014458308574, + "grad_norm": 2.1919386935161085, + "learning_rate": 1.7302531496586866e-05, + "loss": 0.3958, + "step": 3310 + }, + { + "epoch": 0.2623093681917211, + "grad_norm": 1.934797454249626, + "learning_rate": 1.730077820321874e-05, + "loss": 0.3354, + "step": 3311 + }, + { + "epoch": 0.2623885918003565, + "grad_norm": 1.799694206858261, + "learning_rate": 1.7299024429129497e-05, + "loss": 0.2744, + "step": 3312 + }, + { + "epoch": 0.2624678154089919, + "grad_norm": 2.038853195647611, + "learning_rate": 1.7297270174434613e-05, + "loss": 0.3197, + "step": 3313 + }, + { + "epoch": 0.26254703901762727, + "grad_norm": 2.116760797685011, + "learning_rate": 1.7295515439249608e-05, + "loss": 0.3337, + "step": 3314 + }, + { + "epoch": 0.26262626262626265, + "grad_norm": 2.4294043123625544, + "learning_rate": 1.7293760223690008e-05, + "loss": 0.4728, + "step": 3315 + }, + { + "epoch": 0.262705486234898, + "grad_norm": 1.8635766098439808, + "learning_rate": 1.729200452787139e-05, + "loss": 0.3491, + "step": 3316 + }, + { + "epoch": 0.26278470984353336, + "grad_norm": 1.7346554430970518, + "learning_rate": 1.729024835190937e-05, + "loss": 0.2755, + "step": 3317 + }, + { + "epoch": 0.26286393345216874, + "grad_norm": 1.942318264523939, + "learning_rate": 1.7288491695919567e-05, + "loss": 0.2305, + "step": 3318 + }, + { + "epoch": 0.2629431570608041, + "grad_norm": 2.3059979514013382, + "learning_rate": 1.728673456001766e-05, + "loss": 0.2954, + "step": 3319 + }, + { + "epoch": 0.2630223806694395, + "grad_norm": 1.9388207109981859, + "learning_rate": 1.728497694431934e-05, + "loss": 0.2637, + "step": 3320 + }, + { + "epoch": 0.2631016042780749, + "grad_norm": 2.0770342811848495, + "learning_rate": 1.7283218848940344e-05, + "loss": 0.3618, + "step": 3321 + }, + { + "epoch": 0.26318082788671027, + "grad_norm": 2.1631574225529735, + "learning_rate": 1.728146027399643e-05, + "loss": 0.3476, + "step": 3322 + }, + { + "epoch": 0.2632600514953456, + "grad_norm": 2.038833819636668, + "learning_rate": 1.7279701219603394e-05, + "loss": 0.3631, + "step": 3323 + }, + { + "epoch": 0.263339275103981, + "grad_norm": 2.132490739011031, + "learning_rate": 1.727794168587706e-05, + "loss": 0.2823, + "step": 3324 + }, + { + "epoch": 0.26341849871261636, + "grad_norm": 2.0932111609772344, + "learning_rate": 1.7276181672933287e-05, + "loss": 0.3566, + "step": 3325 + }, + { + "epoch": 0.26349772232125174, + "grad_norm": 2.158072948934231, + "learning_rate": 1.7274421180887958e-05, + "loss": 0.3005, + "step": 3326 + }, + { + "epoch": 0.2635769459298871, + "grad_norm": 2.0368869102115577, + "learning_rate": 1.7272660209857e-05, + "loss": 0.3139, + "step": 3327 + }, + { + "epoch": 0.2636561695385225, + "grad_norm": 2.215593103630561, + "learning_rate": 1.727089875995636e-05, + "loss": 0.3541, + "step": 3328 + }, + { + "epoch": 0.2637353931471578, + "grad_norm": 2.697015147530889, + "learning_rate": 1.726913683130202e-05, + "loss": 0.4182, + "step": 3329 + }, + { + "epoch": 0.2638146167557932, + "grad_norm": 2.5116736595514166, + "learning_rate": 1.7267374424009998e-05, + "loss": 0.3195, + "step": 3330 + }, + { + "epoch": 0.2638938403644286, + "grad_norm": 2.3595958089491433, + "learning_rate": 1.726561153819634e-05, + "loss": 0.3875, + "step": 3331 + }, + { + "epoch": 0.26397306397306397, + "grad_norm": 1.901354202275452, + "learning_rate": 1.7263848173977122e-05, + "loss": 0.2701, + "step": 3332 + }, + { + "epoch": 0.26405228758169935, + "grad_norm": 2.522415618531674, + "learning_rate": 1.726208433146845e-05, + "loss": 0.3083, + "step": 3333 + }, + { + "epoch": 0.26413151119033473, + "grad_norm": 1.9211384918597951, + "learning_rate": 1.726032001078647e-05, + "loss": 0.2899, + "step": 3334 + }, + { + "epoch": 0.2642107347989701, + "grad_norm": 1.9647184220259637, + "learning_rate": 1.725855521204735e-05, + "loss": 0.2816, + "step": 3335 + }, + { + "epoch": 0.26428995840760544, + "grad_norm": 2.165193659503709, + "learning_rate": 1.7256789935367296e-05, + "loss": 0.2738, + "step": 3336 + }, + { + "epoch": 0.2643691820162408, + "grad_norm": 1.9607703731106676, + "learning_rate": 1.7255024180862546e-05, + "loss": 0.2549, + "step": 3337 + }, + { + "epoch": 0.2644484056248762, + "grad_norm": 2.0194548077496033, + "learning_rate": 1.7253257948649357e-05, + "loss": 0.2944, + "step": 3338 + }, + { + "epoch": 0.2645276292335116, + "grad_norm": 1.8841420416239127, + "learning_rate": 1.7251491238844038e-05, + "loss": 0.2976, + "step": 3339 + }, + { + "epoch": 0.26460685284214697, + "grad_norm": 1.7858477313144323, + "learning_rate": 1.7249724051562905e-05, + "loss": 0.2512, + "step": 3340 + }, + { + "epoch": 0.26468607645078235, + "grad_norm": 2.128248672173283, + "learning_rate": 1.7247956386922334e-05, + "loss": 0.2976, + "step": 3341 + }, + { + "epoch": 0.26476530005941773, + "grad_norm": 2.2043898618326576, + "learning_rate": 1.7246188245038705e-05, + "loss": 0.3567, + "step": 3342 + }, + { + "epoch": 0.26484452366805306, + "grad_norm": 2.35282305092463, + "learning_rate": 1.7244419626028454e-05, + "loss": 0.4477, + "step": 3343 + }, + { + "epoch": 0.26492374727668844, + "grad_norm": 1.617635459364415, + "learning_rate": 1.724265053000802e-05, + "loss": 0.2292, + "step": 3344 + }, + { + "epoch": 0.2650029708853238, + "grad_norm": 2.125265322866548, + "learning_rate": 1.7240880957093903e-05, + "loss": 0.2552, + "step": 3345 + }, + { + "epoch": 0.2650821944939592, + "grad_norm": 1.7969324592980613, + "learning_rate": 1.7239110907402615e-05, + "loss": 0.2744, + "step": 3346 + }, + { + "epoch": 0.2651614181025946, + "grad_norm": 2.391008221371728, + "learning_rate": 1.72373403810507e-05, + "loss": 0.3207, + "step": 3347 + }, + { + "epoch": 0.26524064171122996, + "grad_norm": 2.1596446537617835, + "learning_rate": 1.7235569378154752e-05, + "loss": 0.2656, + "step": 3348 + }, + { + "epoch": 0.26531986531986534, + "grad_norm": 1.893349642320206, + "learning_rate": 1.7233797898831376e-05, + "loss": 0.3016, + "step": 3349 + }, + { + "epoch": 0.26539908892850067, + "grad_norm": 2.0026866978649767, + "learning_rate": 1.7232025943197213e-05, + "loss": 0.286, + "step": 3350 + }, + { + "epoch": 0.26547831253713605, + "grad_norm": 1.7355436574030136, + "learning_rate": 1.723025351136894e-05, + "loss": 0.2244, + "step": 3351 + }, + { + "epoch": 0.26555753614577143, + "grad_norm": 2.2079376495757885, + "learning_rate": 1.722848060346326e-05, + "loss": 0.3482, + "step": 3352 + }, + { + "epoch": 0.2656367597544068, + "grad_norm": 1.6954301381879433, + "learning_rate": 1.7226707219596918e-05, + "loss": 0.3574, + "step": 3353 + }, + { + "epoch": 0.2657159833630422, + "grad_norm": 2.777001468568871, + "learning_rate": 1.7224933359886676e-05, + "loss": 0.4344, + "step": 3354 + }, + { + "epoch": 0.2657952069716776, + "grad_norm": 2.9241073423726687, + "learning_rate": 1.7223159024449338e-05, + "loss": 0.4517, + "step": 3355 + }, + { + "epoch": 0.26587443058031296, + "grad_norm": 2.399511530399849, + "learning_rate": 1.7221384213401732e-05, + "loss": 0.3225, + "step": 3356 + }, + { + "epoch": 0.2659536541889483, + "grad_norm": 1.728416325304639, + "learning_rate": 1.7219608926860726e-05, + "loss": 0.2689, + "step": 3357 + }, + { + "epoch": 0.26603287779758367, + "grad_norm": 1.735648579420402, + "learning_rate": 1.721783316494321e-05, + "loss": 0.3004, + "step": 3358 + }, + { + "epoch": 0.26611210140621905, + "grad_norm": 2.3216093410646064, + "learning_rate": 1.7216056927766106e-05, + "loss": 0.3843, + "step": 3359 + }, + { + "epoch": 0.26619132501485443, + "grad_norm": 2.761399788877409, + "learning_rate": 1.721428021544638e-05, + "loss": 0.3857, + "step": 3360 + }, + { + "epoch": 0.2662705486234898, + "grad_norm": 2.2120445712875902, + "learning_rate": 1.7212503028101012e-05, + "loss": 0.4201, + "step": 3361 + }, + { + "epoch": 0.2663497722321252, + "grad_norm": 2.4005627922450055, + "learning_rate": 1.721072536584702e-05, + "loss": 0.4349, + "step": 3362 + }, + { + "epoch": 0.2664289958407606, + "grad_norm": 1.8909743152480871, + "learning_rate": 1.7208947228801464e-05, + "loss": 0.267, + "step": 3363 + }, + { + "epoch": 0.2665082194493959, + "grad_norm": 1.8126766367599259, + "learning_rate": 1.7207168617081418e-05, + "loss": 0.2903, + "step": 3364 + }, + { + "epoch": 0.2665874430580313, + "grad_norm": 1.740841195438343, + "learning_rate": 1.7205389530804e-05, + "loss": 0.2232, + "step": 3365 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8321824172234051, + "learning_rate": 1.7203609970086347e-05, + "loss": 0.264, + "step": 3366 + }, + { + "epoch": 0.26674589027530204, + "grad_norm": 2.359089679248261, + "learning_rate": 1.720182993504564e-05, + "loss": 0.3437, + "step": 3367 + }, + { + "epoch": 0.2668251138839374, + "grad_norm": 1.9946362292912054, + "learning_rate": 1.7200049425799087e-05, + "loss": 0.3399, + "step": 3368 + }, + { + "epoch": 0.2669043374925728, + "grad_norm": 1.775687053524371, + "learning_rate": 1.7198268442463923e-05, + "loss": 0.3073, + "step": 3369 + }, + { + "epoch": 0.26698356110120813, + "grad_norm": 1.6446176168511335, + "learning_rate": 1.719648698515742e-05, + "loss": 0.2388, + "step": 3370 + }, + { + "epoch": 0.2670627847098435, + "grad_norm": 2.169796884925276, + "learning_rate": 1.7194705053996873e-05, + "loss": 0.2728, + "step": 3371 + }, + { + "epoch": 0.2671420083184789, + "grad_norm": 1.853749628602967, + "learning_rate": 1.719292264909962e-05, + "loss": 0.3543, + "step": 3372 + }, + { + "epoch": 0.2672212319271143, + "grad_norm": 2.2220969340241745, + "learning_rate": 1.7191139770583015e-05, + "loss": 0.3358, + "step": 3373 + }, + { + "epoch": 0.26730045553574966, + "grad_norm": 1.9016013931004871, + "learning_rate": 1.7189356418564463e-05, + "loss": 0.3122, + "step": 3374 + }, + { + "epoch": 0.26737967914438504, + "grad_norm": 1.811533174902071, + "learning_rate": 1.7187572593161382e-05, + "loss": 0.2395, + "step": 3375 + }, + { + "epoch": 0.2674589027530204, + "grad_norm": 1.9815976462937936, + "learning_rate": 1.7185788294491232e-05, + "loss": 0.3322, + "step": 3376 + }, + { + "epoch": 0.26753812636165575, + "grad_norm": 1.8483588231747987, + "learning_rate": 1.7184003522671497e-05, + "loss": 0.2476, + "step": 3377 + }, + { + "epoch": 0.26761734997029113, + "grad_norm": 1.858329517675542, + "learning_rate": 1.7182218277819697e-05, + "loss": 0.2063, + "step": 3378 + }, + { + "epoch": 0.2676965735789265, + "grad_norm": 2.0331968250360397, + "learning_rate": 1.718043256005338e-05, + "loss": 0.3271, + "step": 3379 + }, + { + "epoch": 0.2677757971875619, + "grad_norm": 1.8281770863363087, + "learning_rate": 1.717864636949013e-05, + "loss": 0.2449, + "step": 3380 + }, + { + "epoch": 0.2678550207961973, + "grad_norm": 1.9349374504083263, + "learning_rate": 1.7176859706247563e-05, + "loss": 0.2794, + "step": 3381 + }, + { + "epoch": 0.26793424440483266, + "grad_norm": 1.8730970808507403, + "learning_rate": 1.717507257044331e-05, + "loss": 0.361, + "step": 3382 + }, + { + "epoch": 0.26801346801346804, + "grad_norm": 1.8493380523649967, + "learning_rate": 1.717328496219506e-05, + "loss": 0.1875, + "step": 3383 + }, + { + "epoch": 0.26809269162210336, + "grad_norm": 2.473703148725078, + "learning_rate": 1.7171496881620507e-05, + "loss": 0.4314, + "step": 3384 + }, + { + "epoch": 0.26817191523073874, + "grad_norm": 1.8371546951078674, + "learning_rate": 1.716970832883739e-05, + "loss": 0.2863, + "step": 3385 + }, + { + "epoch": 0.2682511388393741, + "grad_norm": 2.2521562254570346, + "learning_rate": 1.716791930396348e-05, + "loss": 0.3446, + "step": 3386 + }, + { + "epoch": 0.2683303624480095, + "grad_norm": 2.1877287676789647, + "learning_rate": 1.716612980711657e-05, + "loss": 0.3864, + "step": 3387 + }, + { + "epoch": 0.2684095860566449, + "grad_norm": 2.216438671179258, + "learning_rate": 1.7164339838414496e-05, + "loss": 0.2763, + "step": 3388 + }, + { + "epoch": 0.26848880966528027, + "grad_norm": 2.173025537378721, + "learning_rate": 1.7162549397975118e-05, + "loss": 0.2324, + "step": 3389 + }, + { + "epoch": 0.26856803327391565, + "grad_norm": 1.7275553775691295, + "learning_rate": 1.7160758485916325e-05, + "loss": 0.206, + "step": 3390 + }, + { + "epoch": 0.268647256882551, + "grad_norm": 2.109874916464338, + "learning_rate": 1.715896710235604e-05, + "loss": 0.3108, + "step": 3391 + }, + { + "epoch": 0.26872648049118636, + "grad_norm": 2.063356303116072, + "learning_rate": 1.715717524741222e-05, + "loss": 0.3225, + "step": 3392 + }, + { + "epoch": 0.26880570409982174, + "grad_norm": 2.074422457560864, + "learning_rate": 1.7155382921202844e-05, + "loss": 0.4281, + "step": 3393 + }, + { + "epoch": 0.2688849277084571, + "grad_norm": 2.3564315078913594, + "learning_rate": 1.7153590123845938e-05, + "loss": 0.4054, + "step": 3394 + }, + { + "epoch": 0.2689641513170925, + "grad_norm": 2.015368707310253, + "learning_rate": 1.715179685545954e-05, + "loss": 0.3914, + "step": 3395 + }, + { + "epoch": 0.2690433749257279, + "grad_norm": 1.8676094100668932, + "learning_rate": 1.7150003116161734e-05, + "loss": 0.2481, + "step": 3396 + }, + { + "epoch": 0.26912259853436327, + "grad_norm": 1.9978192501374776, + "learning_rate": 1.714820890607062e-05, + "loss": 0.3082, + "step": 3397 + }, + { + "epoch": 0.2692018221429986, + "grad_norm": 2.0549641598061825, + "learning_rate": 1.714641422530435e-05, + "loss": 0.4326, + "step": 3398 + }, + { + "epoch": 0.269281045751634, + "grad_norm": 1.7291000095708726, + "learning_rate": 1.7144619073981088e-05, + "loss": 0.2368, + "step": 3399 + }, + { + "epoch": 0.26936026936026936, + "grad_norm": 1.5685858177501182, + "learning_rate": 1.7142823452219036e-05, + "loss": 0.2623, + "step": 3400 + }, + { + "epoch": 0.26943949296890474, + "grad_norm": 1.8022665585665223, + "learning_rate": 1.714102736013643e-05, + "loss": 0.2953, + "step": 3401 + }, + { + "epoch": 0.2695187165775401, + "grad_norm": 1.7148333665239874, + "learning_rate": 1.7139230797851537e-05, + "loss": 0.2899, + "step": 3402 + }, + { + "epoch": 0.2695979401861755, + "grad_norm": 2.0486075979032226, + "learning_rate": 1.7137433765482644e-05, + "loss": 0.3429, + "step": 3403 + }, + { + "epoch": 0.2696771637948109, + "grad_norm": 2.254638670894289, + "learning_rate": 1.713563626314808e-05, + "loss": 0.4674, + "step": 3404 + }, + { + "epoch": 0.2697563874034462, + "grad_norm": 1.9844839294437284, + "learning_rate": 1.71338382909662e-05, + "loss": 0.2142, + "step": 3405 + }, + { + "epoch": 0.2698356110120816, + "grad_norm": 1.9967467790777143, + "learning_rate": 1.71320398490554e-05, + "loss": 0.3291, + "step": 3406 + }, + { + "epoch": 0.26991483462071697, + "grad_norm": 1.750876537251268, + "learning_rate": 1.713024093753409e-05, + "loss": 0.2859, + "step": 3407 + }, + { + "epoch": 0.26999405822935235, + "grad_norm": 2.0531637004843155, + "learning_rate": 1.7128441556520723e-05, + "loss": 0.3012, + "step": 3408 + }, + { + "epoch": 0.27007328183798773, + "grad_norm": 2.029446732194121, + "learning_rate": 1.7126641706133782e-05, + "loss": 0.2994, + "step": 3409 + }, + { + "epoch": 0.2701525054466231, + "grad_norm": 1.825503137882742, + "learning_rate": 1.7124841386491774e-05, + "loss": 0.325, + "step": 3410 + }, + { + "epoch": 0.27023172905525844, + "grad_norm": 1.8663217216949353, + "learning_rate": 1.7123040597713242e-05, + "loss": 0.2081, + "step": 3411 + }, + { + "epoch": 0.2703109526638938, + "grad_norm": 2.1946547168324386, + "learning_rate": 1.7121239339916763e-05, + "loss": 0.2858, + "step": 3412 + }, + { + "epoch": 0.2703901762725292, + "grad_norm": 1.785401676136347, + "learning_rate": 1.7119437613220936e-05, + "loss": 0.2338, + "step": 3413 + }, + { + "epoch": 0.2704693998811646, + "grad_norm": 2.0546556830274647, + "learning_rate": 1.71176354177444e-05, + "loss": 0.3663, + "step": 3414 + }, + { + "epoch": 0.27054862348979997, + "grad_norm": 1.9598841949012182, + "learning_rate": 1.711583275360582e-05, + "loss": 0.2318, + "step": 3415 + }, + { + "epoch": 0.27062784709843535, + "grad_norm": 1.7839842431481236, + "learning_rate": 1.711402962092389e-05, + "loss": 0.2699, + "step": 3416 + }, + { + "epoch": 0.27070707070707073, + "grad_norm": 1.9928719154903598, + "learning_rate": 1.7112226019817345e-05, + "loss": 0.2359, + "step": 3417 + }, + { + "epoch": 0.27078629431570606, + "grad_norm": 2.515487231914646, + "learning_rate": 1.7110421950404935e-05, + "loss": 0.3807, + "step": 3418 + }, + { + "epoch": 0.27086551792434144, + "grad_norm": 2.063975860768467, + "learning_rate": 1.710861741280545e-05, + "loss": 0.3594, + "step": 3419 + }, + { + "epoch": 0.2709447415329768, + "grad_norm": 1.9150625017258538, + "learning_rate": 1.710681240713772e-05, + "loss": 0.2379, + "step": 3420 + }, + { + "epoch": 0.2710239651416122, + "grad_norm": 2.449227820414286, + "learning_rate": 1.7105006933520584e-05, + "loss": 0.2713, + "step": 3421 + }, + { + "epoch": 0.2711031887502476, + "grad_norm": 2.0171076153944782, + "learning_rate": 1.710320099207293e-05, + "loss": 0.229, + "step": 3422 + }, + { + "epoch": 0.27118241235888296, + "grad_norm": 1.7603283630385689, + "learning_rate": 1.7101394582913667e-05, + "loss": 0.2705, + "step": 3423 + }, + { + "epoch": 0.27126163596751834, + "grad_norm": 1.8706520400199922, + "learning_rate": 1.709958770616174e-05, + "loss": 0.2379, + "step": 3424 + }, + { + "epoch": 0.27134085957615367, + "grad_norm": 1.6082111530086896, + "learning_rate": 1.7097780361936128e-05, + "loss": 0.1881, + "step": 3425 + }, + { + "epoch": 0.27142008318478905, + "grad_norm": 1.4648527812333807, + "learning_rate": 1.709597255035583e-05, + "loss": 0.2438, + "step": 3426 + }, + { + "epoch": 0.27149930679342443, + "grad_norm": 1.9096550438093076, + "learning_rate": 1.709416427153988e-05, + "loss": 0.3507, + "step": 3427 + }, + { + "epoch": 0.2715785304020598, + "grad_norm": 2.006438980160681, + "learning_rate": 1.7092355525607352e-05, + "loss": 0.3203, + "step": 3428 + }, + { + "epoch": 0.2716577540106952, + "grad_norm": 2.116227302303852, + "learning_rate": 1.7090546312677335e-05, + "loss": 0.2967, + "step": 3429 + }, + { + "epoch": 0.2717369776193306, + "grad_norm": 2.0026112783102747, + "learning_rate": 1.7088736632868964e-05, + "loss": 0.3262, + "step": 3430 + }, + { + "epoch": 0.27181620122796596, + "grad_norm": 2.1452967188624887, + "learning_rate": 1.7086926486301393e-05, + "loss": 0.2768, + "step": 3431 + }, + { + "epoch": 0.2718954248366013, + "grad_norm": 2.417920698203675, + "learning_rate": 1.7085115873093814e-05, + "loss": 0.3042, + "step": 3432 + }, + { + "epoch": 0.27197464844523667, + "grad_norm": 1.7361940364004071, + "learning_rate": 1.7083304793365445e-05, + "loss": 0.2848, + "step": 3433 + }, + { + "epoch": 0.27205387205387205, + "grad_norm": 2.0124176725836906, + "learning_rate": 1.7081493247235537e-05, + "loss": 0.3293, + "step": 3434 + }, + { + "epoch": 0.27213309566250743, + "grad_norm": 2.1813643158396885, + "learning_rate": 1.7079681234823374e-05, + "loss": 0.3486, + "step": 3435 + }, + { + "epoch": 0.2722123192711428, + "grad_norm": 2.034921035954899, + "learning_rate": 1.7077868756248265e-05, + "loss": 0.377, + "step": 3436 + }, + { + "epoch": 0.2722915428797782, + "grad_norm": 2.3344973685258665, + "learning_rate": 1.7076055811629556e-05, + "loss": 0.4017, + "step": 3437 + }, + { + "epoch": 0.2723707664884136, + "grad_norm": 2.3327574830878017, + "learning_rate": 1.7074242401086623e-05, + "loss": 0.3413, + "step": 3438 + }, + { + "epoch": 0.2724499900970489, + "grad_norm": 2.1733727982323003, + "learning_rate": 1.7072428524738865e-05, + "loss": 0.3579, + "step": 3439 + }, + { + "epoch": 0.2725292137056843, + "grad_norm": 1.6974348724238708, + "learning_rate": 1.707061418270572e-05, + "loss": 0.3138, + "step": 3440 + }, + { + "epoch": 0.27260843731431966, + "grad_norm": 1.9907008970891409, + "learning_rate": 1.706879937510665e-05, + "loss": 0.3513, + "step": 3441 + }, + { + "epoch": 0.27268766092295504, + "grad_norm": 2.1046107430848573, + "learning_rate": 1.7066984102061155e-05, + "loss": 0.2928, + "step": 3442 + }, + { + "epoch": 0.2727668845315904, + "grad_norm": 2.1076791732420763, + "learning_rate": 1.706516836368876e-05, + "loss": 0.3933, + "step": 3443 + }, + { + "epoch": 0.2728461081402258, + "grad_norm": 1.8503842878987184, + "learning_rate": 1.7063352160109026e-05, + "loss": 0.3153, + "step": 3444 + }, + { + "epoch": 0.27292533174886113, + "grad_norm": 1.8954500335377484, + "learning_rate": 1.7061535491441538e-05, + "loss": 0.422, + "step": 3445 + }, + { + "epoch": 0.2730045553574965, + "grad_norm": 2.8329061978914694, + "learning_rate": 1.7059718357805915e-05, + "loss": 0.4148, + "step": 3446 + }, + { + "epoch": 0.2730837789661319, + "grad_norm": 1.9058951118772942, + "learning_rate": 1.705790075932181e-05, + "loss": 0.3352, + "step": 3447 + }, + { + "epoch": 0.2731630025747673, + "grad_norm": 2.1506428320789692, + "learning_rate": 1.7056082696108896e-05, + "loss": 0.3541, + "step": 3448 + }, + { + "epoch": 0.27324222618340266, + "grad_norm": 2.29124772955635, + "learning_rate": 1.7054264168286892e-05, + "loss": 0.2718, + "step": 3449 + }, + { + "epoch": 0.27332144979203804, + "grad_norm": 1.8496851493923643, + "learning_rate": 1.7052445175975533e-05, + "loss": 0.2461, + "step": 3450 + }, + { + "epoch": 0.2734006734006734, + "grad_norm": 1.8939713897535773, + "learning_rate": 1.7050625719294593e-05, + "loss": 0.3335, + "step": 3451 + }, + { + "epoch": 0.27347989700930875, + "grad_norm": 1.9409859724876397, + "learning_rate": 1.7048805798363876e-05, + "loss": 0.2514, + "step": 3452 + }, + { + "epoch": 0.27355912061794413, + "grad_norm": 2.290938856530663, + "learning_rate": 1.7046985413303215e-05, + "loss": 0.3822, + "step": 3453 + }, + { + "epoch": 0.2736383442265795, + "grad_norm": 2.0332328663757298, + "learning_rate": 1.7045164564232474e-05, + "loss": 0.3248, + "step": 3454 + }, + { + "epoch": 0.2737175678352149, + "grad_norm": 2.7222764485818844, + "learning_rate": 1.704334325127154e-05, + "loss": 0.2372, + "step": 3455 + }, + { + "epoch": 0.2737967914438503, + "grad_norm": 1.9436490829624704, + "learning_rate": 1.704152147454035e-05, + "loss": 0.241, + "step": 3456 + }, + { + "epoch": 0.27387601505248566, + "grad_norm": 1.8552986674436382, + "learning_rate": 1.7039699234158846e-05, + "loss": 0.3725, + "step": 3457 + }, + { + "epoch": 0.27395523866112104, + "grad_norm": 1.803009481218911, + "learning_rate": 1.7037876530247025e-05, + "loss": 0.2819, + "step": 3458 + }, + { + "epoch": 0.27403446226975636, + "grad_norm": 2.2327289539249717, + "learning_rate": 1.7036053362924896e-05, + "loss": 0.2886, + "step": 3459 + }, + { + "epoch": 0.27411368587839174, + "grad_norm": 1.7654127900726622, + "learning_rate": 1.7034229732312512e-05, + "loss": 0.3399, + "step": 3460 + }, + { + "epoch": 0.2741929094870271, + "grad_norm": 2.001618149518056, + "learning_rate": 1.703240563852994e-05, + "loss": 0.3341, + "step": 3461 + }, + { + "epoch": 0.2742721330956625, + "grad_norm": 1.8275278815146694, + "learning_rate": 1.70305810816973e-05, + "loss": 0.2905, + "step": 3462 + }, + { + "epoch": 0.2743513567042979, + "grad_norm": 1.9754697629304463, + "learning_rate": 1.7028756061934722e-05, + "loss": 0.2721, + "step": 3463 + }, + { + "epoch": 0.27443058031293327, + "grad_norm": 1.9224899832519573, + "learning_rate": 1.702693057936238e-05, + "loss": 0.3465, + "step": 3464 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 2.047121478483813, + "learning_rate": 1.702510463410047e-05, + "loss": 0.375, + "step": 3465 + }, + { + "epoch": 0.274589027530204, + "grad_norm": 1.9609023286637646, + "learning_rate": 1.7023278226269222e-05, + "loss": 0.2976, + "step": 3466 + }, + { + "epoch": 0.27466825113883936, + "grad_norm": 2.2701917237976623, + "learning_rate": 1.7021451355988895e-05, + "loss": 0.3709, + "step": 3467 + }, + { + "epoch": 0.27474747474747474, + "grad_norm": 1.6581893154818668, + "learning_rate": 1.7019624023379784e-05, + "loss": 0.2215, + "step": 3468 + }, + { + "epoch": 0.2748266983561101, + "grad_norm": 2.1512060926941223, + "learning_rate": 1.7017796228562206e-05, + "loss": 0.4825, + "step": 3469 + }, + { + "epoch": 0.2749059219647455, + "grad_norm": 1.9605498989527748, + "learning_rate": 1.7015967971656513e-05, + "loss": 0.3221, + "step": 3470 + }, + { + "epoch": 0.2749851455733809, + "grad_norm": 1.7094513779059308, + "learning_rate": 1.7014139252783092e-05, + "loss": 0.2783, + "step": 3471 + }, + { + "epoch": 0.27506436918201627, + "grad_norm": 1.715976794116535, + "learning_rate": 1.7012310072062348e-05, + "loss": 0.2278, + "step": 3472 + }, + { + "epoch": 0.2751435927906516, + "grad_norm": 1.904943137397442, + "learning_rate": 1.7010480429614726e-05, + "loss": 0.2834, + "step": 3473 + }, + { + "epoch": 0.275222816399287, + "grad_norm": 2.402731313763493, + "learning_rate": 1.70086503255607e-05, + "loss": 0.2803, + "step": 3474 + }, + { + "epoch": 0.27530204000792236, + "grad_norm": 2.219686760281645, + "learning_rate": 1.7006819760020773e-05, + "loss": 0.3458, + "step": 3475 + }, + { + "epoch": 0.27538126361655774, + "grad_norm": 1.9100238600024793, + "learning_rate": 1.700498873311548e-05, + "loss": 0.2849, + "step": 3476 + }, + { + "epoch": 0.2754604872251931, + "grad_norm": 2.1779120871548026, + "learning_rate": 1.7003157244965387e-05, + "loss": 0.3716, + "step": 3477 + }, + { + "epoch": 0.2755397108338285, + "grad_norm": 1.7042861256404576, + "learning_rate": 1.700132529569109e-05, + "loss": 0.3119, + "step": 3478 + }, + { + "epoch": 0.2756189344424639, + "grad_norm": 1.9961802261549253, + "learning_rate": 1.69994928854132e-05, + "loss": 0.3712, + "step": 3479 + }, + { + "epoch": 0.2756981580510992, + "grad_norm": 2.2530353620611097, + "learning_rate": 1.6997660014252392e-05, + "loss": 0.3452, + "step": 3480 + }, + { + "epoch": 0.2757773816597346, + "grad_norm": 2.1210282783750998, + "learning_rate": 1.699582668232934e-05, + "loss": 0.3903, + "step": 3481 + }, + { + "epoch": 0.27585660526836997, + "grad_norm": 2.1661304360605502, + "learning_rate": 1.6993992889764758e-05, + "loss": 0.291, + "step": 3482 + }, + { + "epoch": 0.27593582887700535, + "grad_norm": 2.0454860038864764, + "learning_rate": 1.69921586366794e-05, + "loss": 0.3169, + "step": 3483 + }, + { + "epoch": 0.27601505248564073, + "grad_norm": 2.061118649960762, + "learning_rate": 1.6990323923194042e-05, + "loss": 0.3644, + "step": 3484 + }, + { + "epoch": 0.2760942760942761, + "grad_norm": 1.981208434375285, + "learning_rate": 1.698848874942949e-05, + "loss": 0.3909, + "step": 3485 + }, + { + "epoch": 0.27617349970291144, + "grad_norm": 2.045603357192902, + "learning_rate": 1.698665311550658e-05, + "loss": 0.2978, + "step": 3486 + }, + { + "epoch": 0.2762527233115468, + "grad_norm": 1.9548500614536992, + "learning_rate": 1.6984817021546177e-05, + "loss": 0.2961, + "step": 3487 + }, + { + "epoch": 0.2763319469201822, + "grad_norm": 2.4851479274008104, + "learning_rate": 1.6982980467669183e-05, + "loss": 0.4116, + "step": 3488 + }, + { + "epoch": 0.2764111705288176, + "grad_norm": 2.1283429585508493, + "learning_rate": 1.6981143453996524e-05, + "loss": 0.2032, + "step": 3489 + }, + { + "epoch": 0.27649039413745297, + "grad_norm": 1.9797279961550915, + "learning_rate": 1.697930598064916e-05, + "loss": 0.3148, + "step": 3490 + }, + { + "epoch": 0.27656961774608835, + "grad_norm": 2.2805286707108, + "learning_rate": 1.697746804774808e-05, + "loss": 0.3736, + "step": 3491 + }, + { + "epoch": 0.27664884135472373, + "grad_norm": 1.9757464079684457, + "learning_rate": 1.6975629655414304e-05, + "loss": 0.2633, + "step": 3492 + }, + { + "epoch": 0.27672806496335906, + "grad_norm": 2.0193121205070956, + "learning_rate": 1.6973790803768875e-05, + "loss": 0.3611, + "step": 3493 + }, + { + "epoch": 0.27680728857199444, + "grad_norm": 1.8007626832785526, + "learning_rate": 1.6971951492932882e-05, + "loss": 0.2225, + "step": 3494 + }, + { + "epoch": 0.2768865121806298, + "grad_norm": 1.966187112911452, + "learning_rate": 1.697011172302743e-05, + "loss": 0.3494, + "step": 3495 + }, + { + "epoch": 0.2769657357892652, + "grad_norm": 2.2795464880490215, + "learning_rate": 1.696827149417366e-05, + "loss": 0.3866, + "step": 3496 + }, + { + "epoch": 0.2770449593979006, + "grad_norm": 1.8932000201309704, + "learning_rate": 1.696643080649274e-05, + "loss": 0.2862, + "step": 3497 + }, + { + "epoch": 0.27712418300653596, + "grad_norm": 2.3551986748591407, + "learning_rate": 1.696458966010587e-05, + "loss": 0.4194, + "step": 3498 + }, + { + "epoch": 0.27720340661517134, + "grad_norm": 2.1182925030402417, + "learning_rate": 1.6962748055134283e-05, + "loss": 0.329, + "step": 3499 + }, + { + "epoch": 0.27728263022380667, + "grad_norm": 2.2859125334664787, + "learning_rate": 1.696090599169924e-05, + "loss": 0.3628, + "step": 3500 + }, + { + "epoch": 0.27736185383244205, + "grad_norm": 2.4958878560634714, + "learning_rate": 1.695906346992203e-05, + "loss": 0.3302, + "step": 3501 + }, + { + "epoch": 0.27744107744107743, + "grad_norm": 2.2141659017644715, + "learning_rate": 1.6957220489923978e-05, + "loss": 0.2901, + "step": 3502 + }, + { + "epoch": 0.2775203010497128, + "grad_norm": 1.7824419867104577, + "learning_rate": 1.695537705182643e-05, + "loss": 0.2348, + "step": 3503 + }, + { + "epoch": 0.2775995246583482, + "grad_norm": 2.3098994777762134, + "learning_rate": 1.695353315575077e-05, + "loss": 0.2679, + "step": 3504 + }, + { + "epoch": 0.2776787482669836, + "grad_norm": 1.8635940905020172, + "learning_rate": 1.6951688801818413e-05, + "loss": 0.2092, + "step": 3505 + }, + { + "epoch": 0.27775797187561896, + "grad_norm": 2.073096902943524, + "learning_rate": 1.6949843990150798e-05, + "loss": 0.3515, + "step": 3506 + }, + { + "epoch": 0.2778371954842543, + "grad_norm": 1.8106237182660103, + "learning_rate": 1.6947998720869394e-05, + "loss": 0.3101, + "step": 3507 + }, + { + "epoch": 0.27791641909288967, + "grad_norm": 1.9537670109286247, + "learning_rate": 1.6946152994095705e-05, + "loss": 0.2823, + "step": 3508 + }, + { + "epoch": 0.27799564270152505, + "grad_norm": 1.8528504962789316, + "learning_rate": 1.6944306809951264e-05, + "loss": 0.2335, + "step": 3509 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 2.248412889345186, + "learning_rate": 1.694246016855764e-05, + "loss": 0.3095, + "step": 3510 + }, + { + "epoch": 0.2781540899187958, + "grad_norm": 1.6960367712117062, + "learning_rate": 1.694061307003641e-05, + "loss": 0.2585, + "step": 3511 + }, + { + "epoch": 0.2782333135274312, + "grad_norm": 2.045105421530441, + "learning_rate": 1.693876551450921e-05, + "loss": 0.33, + "step": 3512 + }, + { + "epoch": 0.2783125371360666, + "grad_norm": 1.9009183683308206, + "learning_rate": 1.693691750209769e-05, + "loss": 0.275, + "step": 3513 + }, + { + "epoch": 0.2783917607447019, + "grad_norm": 1.8786818326446126, + "learning_rate": 1.6935069032923525e-05, + "loss": 0.3702, + "step": 3514 + }, + { + "epoch": 0.2784709843533373, + "grad_norm": 2.21713110251996, + "learning_rate": 1.6933220107108438e-05, + "loss": 0.4924, + "step": 3515 + }, + { + "epoch": 0.27855020796197266, + "grad_norm": 2.2291293085755255, + "learning_rate": 1.6931370724774166e-05, + "loss": 0.2728, + "step": 3516 + }, + { + "epoch": 0.27862943157060804, + "grad_norm": 1.8046522243481335, + "learning_rate": 1.6929520886042486e-05, + "loss": 0.2754, + "step": 3517 + }, + { + "epoch": 0.2787086551792434, + "grad_norm": 1.7487421348460166, + "learning_rate": 1.6927670591035195e-05, + "loss": 0.3436, + "step": 3518 + }, + { + "epoch": 0.2787878787878788, + "grad_norm": 2.019776805383985, + "learning_rate": 1.692581983987413e-05, + "loss": 0.3243, + "step": 3519 + }, + { + "epoch": 0.2788671023965142, + "grad_norm": 2.110360785216861, + "learning_rate": 1.6923968632681155e-05, + "loss": 0.3165, + "step": 3520 + }, + { + "epoch": 0.2789463260051495, + "grad_norm": 2.0722510875379916, + "learning_rate": 1.6922116969578163e-05, + "loss": 0.1952, + "step": 3521 + }, + { + "epoch": 0.2790255496137849, + "grad_norm": 2.093271655456249, + "learning_rate": 1.692026485068707e-05, + "loss": 0.3892, + "step": 3522 + }, + { + "epoch": 0.2791047732224203, + "grad_norm": 2.2802937614085166, + "learning_rate": 1.6918412276129837e-05, + "loss": 0.3902, + "step": 3523 + }, + { + "epoch": 0.27918399683105566, + "grad_norm": 1.8380856505009215, + "learning_rate": 1.691655924602845e-05, + "loss": 0.3321, + "step": 3524 + }, + { + "epoch": 0.27926322043969104, + "grad_norm": 2.577987441071288, + "learning_rate": 1.6914705760504913e-05, + "loss": 0.4003, + "step": 3525 + }, + { + "epoch": 0.2793424440483264, + "grad_norm": 1.8455243983091896, + "learning_rate": 1.6912851819681272e-05, + "loss": 0.2099, + "step": 3526 + }, + { + "epoch": 0.27942166765696175, + "grad_norm": 2.574855512597409, + "learning_rate": 1.69109974236796e-05, + "loss": 0.334, + "step": 3527 + }, + { + "epoch": 0.27950089126559713, + "grad_norm": 2.189702092016546, + "learning_rate": 1.6909142572622003e-05, + "loss": 0.2827, + "step": 3528 + }, + { + "epoch": 0.2795801148742325, + "grad_norm": 2.0898676765935984, + "learning_rate": 1.6907287266630614e-05, + "loss": 0.351, + "step": 3529 + }, + { + "epoch": 0.2796593384828679, + "grad_norm": 2.4892231804154767, + "learning_rate": 1.6905431505827595e-05, + "loss": 0.287, + "step": 3530 + }, + { + "epoch": 0.2797385620915033, + "grad_norm": 1.9382705291067408, + "learning_rate": 1.6903575290335136e-05, + "loss": 0.2526, + "step": 3531 + }, + { + "epoch": 0.27981778570013865, + "grad_norm": 1.9735049122877046, + "learning_rate": 1.690171862027546e-05, + "loss": 0.2289, + "step": 3532 + }, + { + "epoch": 0.27989700930877404, + "grad_norm": 4.74144127134824, + "learning_rate": 1.6899861495770827e-05, + "loss": 0.3942, + "step": 3533 + }, + { + "epoch": 0.27997623291740936, + "grad_norm": 2.504025500575295, + "learning_rate": 1.689800391694351e-05, + "loss": 0.2323, + "step": 3534 + }, + { + "epoch": 0.28005545652604474, + "grad_norm": 2.363072999087013, + "learning_rate": 1.689614588391583e-05, + "loss": 0.3983, + "step": 3535 + }, + { + "epoch": 0.2801346801346801, + "grad_norm": 1.9585205896537063, + "learning_rate": 1.689428739681012e-05, + "loss": 0.2833, + "step": 3536 + }, + { + "epoch": 0.2802139037433155, + "grad_norm": 4.073854755311286, + "learning_rate": 1.6892428455748762e-05, + "loss": 0.2544, + "step": 3537 + }, + { + "epoch": 0.2802931273519509, + "grad_norm": 2.100093931788127, + "learning_rate": 1.6890569060854156e-05, + "loss": 0.219, + "step": 3538 + }, + { + "epoch": 0.28037235096058627, + "grad_norm": 2.893602540009141, + "learning_rate": 1.6888709212248728e-05, + "loss": 0.3959, + "step": 3539 + }, + { + "epoch": 0.28045157456922165, + "grad_norm": 2.501694315304918, + "learning_rate": 1.6886848910054947e-05, + "loss": 0.4967, + "step": 3540 + }, + { + "epoch": 0.280530798177857, + "grad_norm": 2.301233341164083, + "learning_rate": 1.6884988154395304e-05, + "loss": 0.3129, + "step": 3541 + }, + { + "epoch": 0.28061002178649236, + "grad_norm": 2.2660350398429236, + "learning_rate": 1.688312694539232e-05, + "loss": 0.4105, + "step": 3542 + }, + { + "epoch": 0.28068924539512774, + "grad_norm": 2.5343703470780365, + "learning_rate": 1.6881265283168543e-05, + "loss": 0.432, + "step": 3543 + }, + { + "epoch": 0.2807684690037631, + "grad_norm": 2.064622891027119, + "learning_rate": 1.6879403167846556e-05, + "loss": 0.3034, + "step": 3544 + }, + { + "epoch": 0.2808476926123985, + "grad_norm": 1.7379244947562247, + "learning_rate": 1.6877540599548977e-05, + "loss": 0.409, + "step": 3545 + }, + { + "epoch": 0.2809269162210339, + "grad_norm": 2.1850962461437002, + "learning_rate": 1.6875677578398442e-05, + "loss": 0.3902, + "step": 3546 + }, + { + "epoch": 0.28100613982966927, + "grad_norm": 2.6374808658118476, + "learning_rate": 1.6873814104517617e-05, + "loss": 0.375, + "step": 3547 + }, + { + "epoch": 0.2810853634383046, + "grad_norm": 2.0622907379385906, + "learning_rate": 1.6871950178029216e-05, + "loss": 0.2744, + "step": 3548 + }, + { + "epoch": 0.28116458704694, + "grad_norm": 2.121968929902448, + "learning_rate": 1.6870085799055956e-05, + "loss": 0.3439, + "step": 3549 + }, + { + "epoch": 0.28124381065557535, + "grad_norm": 2.386560246097312, + "learning_rate": 1.6868220967720604e-05, + "loss": 0.2962, + "step": 3550 + }, + { + "epoch": 0.28132303426421074, + "grad_norm": 2.109175502399617, + "learning_rate": 1.686635568414595e-05, + "loss": 0.4228, + "step": 3551 + }, + { + "epoch": 0.2814022578728461, + "grad_norm": 2.106910782322327, + "learning_rate": 1.686448994845481e-05, + "loss": 0.3233, + "step": 3552 + }, + { + "epoch": 0.2814814814814815, + "grad_norm": 2.3877406921161426, + "learning_rate": 1.6862623760770038e-05, + "loss": 0.5141, + "step": 3553 + }, + { + "epoch": 0.2815607050901169, + "grad_norm": 1.8639037639411236, + "learning_rate": 1.6860757121214513e-05, + "loss": 0.2009, + "step": 3554 + }, + { + "epoch": 0.2816399286987522, + "grad_norm": 2.1914663337553226, + "learning_rate": 1.685889002991114e-05, + "loss": 0.4505, + "step": 3555 + }, + { + "epoch": 0.2817191523073876, + "grad_norm": 2.232641874872952, + "learning_rate": 1.6857022486982865e-05, + "loss": 0.3326, + "step": 3556 + }, + { + "epoch": 0.28179837591602297, + "grad_norm": 2.5640197816652854, + "learning_rate": 1.6855154492552656e-05, + "loss": 0.3061, + "step": 3557 + }, + { + "epoch": 0.28187759952465835, + "grad_norm": 1.668174839635691, + "learning_rate": 1.6853286046743505e-05, + "loss": 0.2929, + "step": 3558 + }, + { + "epoch": 0.28195682313329373, + "grad_norm": 1.8556588588344165, + "learning_rate": 1.6851417149678442e-05, + "loss": 0.3338, + "step": 3559 + }, + { + "epoch": 0.2820360467419291, + "grad_norm": 1.987759747378164, + "learning_rate": 1.684954780148053e-05, + "loss": 0.3289, + "step": 3560 + }, + { + "epoch": 0.2821152703505645, + "grad_norm": 1.83485460760186, + "learning_rate": 1.684767800227285e-05, + "loss": 0.2603, + "step": 3561 + }, + { + "epoch": 0.2821944939591998, + "grad_norm": 1.7141895985015019, + "learning_rate": 1.6845807752178528e-05, + "loss": 0.3035, + "step": 3562 + }, + { + "epoch": 0.2822737175678352, + "grad_norm": 2.375681914953145, + "learning_rate": 1.68439370513207e-05, + "loss": 0.2527, + "step": 3563 + }, + { + "epoch": 0.2823529411764706, + "grad_norm": 1.7910327977988199, + "learning_rate": 1.6842065899822548e-05, + "loss": 0.3252, + "step": 3564 + }, + { + "epoch": 0.28243216478510597, + "grad_norm": 2.2763206240222145, + "learning_rate": 1.6840194297807283e-05, + "loss": 0.3886, + "step": 3565 + }, + { + "epoch": 0.28251138839374135, + "grad_norm": 2.1690502030834433, + "learning_rate": 1.6838322245398135e-05, + "loss": 0.3165, + "step": 3566 + }, + { + "epoch": 0.28259061200237673, + "grad_norm": 2.3205598436667763, + "learning_rate": 1.6836449742718367e-05, + "loss": 0.3334, + "step": 3567 + }, + { + "epoch": 0.28266983561101205, + "grad_norm": 2.322813610233888, + "learning_rate": 1.6834576789891282e-05, + "loss": 0.3761, + "step": 3568 + }, + { + "epoch": 0.28274905921964744, + "grad_norm": 2.0865471060637093, + "learning_rate": 1.68327033870402e-05, + "loss": 0.415, + "step": 3569 + }, + { + "epoch": 0.2828282828282828, + "grad_norm": 1.9750981776735403, + "learning_rate": 1.6830829534288475e-05, + "loss": 0.2678, + "step": 3570 + }, + { + "epoch": 0.2829075064369182, + "grad_norm": 2.1200493395005857, + "learning_rate": 1.6828955231759495e-05, + "loss": 0.2851, + "step": 3571 + }, + { + "epoch": 0.2829867300455536, + "grad_norm": 2.0286185811001203, + "learning_rate": 1.682708047957667e-05, + "loss": 0.3328, + "step": 3572 + }, + { + "epoch": 0.28306595365418896, + "grad_norm": 2.02119392039722, + "learning_rate": 1.682520527786345e-05, + "loss": 0.3359, + "step": 3573 + }, + { + "epoch": 0.28314517726282434, + "grad_norm": 1.998318348475542, + "learning_rate": 1.6823329626743298e-05, + "loss": 0.2564, + "step": 3574 + }, + { + "epoch": 0.28322440087145967, + "grad_norm": 1.96852727328102, + "learning_rate": 1.6821453526339727e-05, + "loss": 0.2832, + "step": 3575 + }, + { + "epoch": 0.28330362448009505, + "grad_norm": 2.224176902081358, + "learning_rate": 1.6819576976776262e-05, + "loss": 0.3629, + "step": 3576 + }, + { + "epoch": 0.28338284808873043, + "grad_norm": 2.3555540355175495, + "learning_rate": 1.6817699978176464e-05, + "loss": 0.431, + "step": 3577 + }, + { + "epoch": 0.2834620716973658, + "grad_norm": 2.0133080493539444, + "learning_rate": 1.681582253066393e-05, + "loss": 0.3447, + "step": 3578 + }, + { + "epoch": 0.2835412953060012, + "grad_norm": 1.884766970419607, + "learning_rate": 1.681394463436228e-05, + "loss": 0.2601, + "step": 3579 + }, + { + "epoch": 0.2836205189146366, + "grad_norm": 2.4606668834815237, + "learning_rate": 1.6812066289395157e-05, + "loss": 0.4795, + "step": 3580 + }, + { + "epoch": 0.28369974252327196, + "grad_norm": 1.910508258048931, + "learning_rate": 1.681018749588625e-05, + "loss": 0.2632, + "step": 3581 + }, + { + "epoch": 0.2837789661319073, + "grad_norm": 2.3432004006672256, + "learning_rate": 1.6808308253959263e-05, + "loss": 0.2895, + "step": 3582 + }, + { + "epoch": 0.28385818974054267, + "grad_norm": 2.059271374398235, + "learning_rate": 1.680642856373794e-05, + "loss": 0.3668, + "step": 3583 + }, + { + "epoch": 0.28393741334917805, + "grad_norm": 1.9232457541921084, + "learning_rate": 1.680454842534604e-05, + "loss": 0.329, + "step": 3584 + }, + { + "epoch": 0.28401663695781343, + "grad_norm": 2.193321808622056, + "learning_rate": 1.6802667838907374e-05, + "loss": 0.3295, + "step": 3585 + }, + { + "epoch": 0.2840958605664488, + "grad_norm": 2.306313480534004, + "learning_rate": 1.680078680454576e-05, + "loss": 0.4148, + "step": 3586 + }, + { + "epoch": 0.2841750841750842, + "grad_norm": 1.7855198358027773, + "learning_rate": 1.6798905322385063e-05, + "loss": 0.2983, + "step": 3587 + }, + { + "epoch": 0.2842543077837196, + "grad_norm": 2.1383344587956556, + "learning_rate": 1.6797023392549157e-05, + "loss": 0.4068, + "step": 3588 + }, + { + "epoch": 0.2843335313923549, + "grad_norm": 1.8516225810619964, + "learning_rate": 1.679514101516197e-05, + "loss": 0.2654, + "step": 3589 + }, + { + "epoch": 0.2844127550009903, + "grad_norm": 1.8688152557506754, + "learning_rate": 1.6793258190347445e-05, + "loss": 0.3171, + "step": 3590 + }, + { + "epoch": 0.28449197860962566, + "grad_norm": 1.854097322738989, + "learning_rate": 1.679137491822955e-05, + "loss": 0.2939, + "step": 3591 + }, + { + "epoch": 0.28457120221826104, + "grad_norm": 2.3661591801846695, + "learning_rate": 1.6789491198932302e-05, + "loss": 0.2824, + "step": 3592 + }, + { + "epoch": 0.2846504258268964, + "grad_norm": 2.1640603983847244, + "learning_rate": 1.6787607032579724e-05, + "loss": 0.3046, + "step": 3593 + }, + { + "epoch": 0.2847296494355318, + "grad_norm": 1.7442931811303524, + "learning_rate": 1.678572241929588e-05, + "loss": 0.2556, + "step": 3594 + }, + { + "epoch": 0.2848088730441672, + "grad_norm": 2.121506717533791, + "learning_rate": 1.6783837359204868e-05, + "loss": 0.4281, + "step": 3595 + }, + { + "epoch": 0.2848880966528025, + "grad_norm": 2.210396454838994, + "learning_rate": 1.6781951852430813e-05, + "loss": 0.3138, + "step": 3596 + }, + { + "epoch": 0.2849673202614379, + "grad_norm": 1.65898651711375, + "learning_rate": 1.6780065899097853e-05, + "loss": 0.2239, + "step": 3597 + }, + { + "epoch": 0.2850465438700733, + "grad_norm": 2.0225418164027547, + "learning_rate": 1.677817949933018e-05, + "loss": 0.3401, + "step": 3598 + }, + { + "epoch": 0.28512576747870866, + "grad_norm": 1.8000689437663295, + "learning_rate": 1.6776292653252e-05, + "loss": 0.2647, + "step": 3599 + }, + { + "epoch": 0.28520499108734404, + "grad_norm": 2.217499317459855, + "learning_rate": 1.6774405360987556e-05, + "loss": 0.3322, + "step": 3600 + }, + { + "epoch": 0.2852842146959794, + "grad_norm": 2.675302464292596, + "learning_rate": 1.6772517622661115e-05, + "loss": 0.2487, + "step": 3601 + }, + { + "epoch": 0.2853634383046148, + "grad_norm": 1.7460731376420404, + "learning_rate": 1.6770629438396973e-05, + "loss": 0.2797, + "step": 3602 + }, + { + "epoch": 0.28544266191325013, + "grad_norm": 1.8089699555304126, + "learning_rate": 1.676874080831947e-05, + "loss": 0.3693, + "step": 3603 + }, + { + "epoch": 0.2855218855218855, + "grad_norm": 2.1212143885633505, + "learning_rate": 1.676685173255294e-05, + "loss": 0.3087, + "step": 3604 + }, + { + "epoch": 0.2856011091305209, + "grad_norm": 2.0775186518972526, + "learning_rate": 1.6764962211221796e-05, + "loss": 0.3754, + "step": 3605 + }, + { + "epoch": 0.2856803327391563, + "grad_norm": 2.0970190248751686, + "learning_rate": 1.6763072244450435e-05, + "loss": 0.336, + "step": 3606 + }, + { + "epoch": 0.28575955634779165, + "grad_norm": 1.898807057326851, + "learning_rate": 1.676118183236331e-05, + "loss": 0.2389, + "step": 3607 + }, + { + "epoch": 0.28583877995642704, + "grad_norm": 1.8566474916308082, + "learning_rate": 1.6759290975084894e-05, + "loss": 0.2653, + "step": 3608 + }, + { + "epoch": 0.28591800356506236, + "grad_norm": 1.809177876070745, + "learning_rate": 1.675739967273969e-05, + "loss": 0.239, + "step": 3609 + }, + { + "epoch": 0.28599722717369774, + "grad_norm": 1.797582595699639, + "learning_rate": 1.675550792545223e-05, + "loss": 0.3648, + "step": 3610 + }, + { + "epoch": 0.2860764507823331, + "grad_norm": 1.68539868626444, + "learning_rate": 1.6753615733347085e-05, + "loss": 0.3728, + "step": 3611 + }, + { + "epoch": 0.2861556743909685, + "grad_norm": 2.322625480430677, + "learning_rate": 1.6751723096548834e-05, + "loss": 0.4333, + "step": 3612 + }, + { + "epoch": 0.2862348979996039, + "grad_norm": 1.5795178072558425, + "learning_rate": 1.6749830015182106e-05, + "loss": 0.2479, + "step": 3613 + }, + { + "epoch": 0.28631412160823927, + "grad_norm": 1.9395875810966259, + "learning_rate": 1.6747936489371552e-05, + "loss": 0.3846, + "step": 3614 + }, + { + "epoch": 0.28639334521687465, + "grad_norm": 2.1451983105362333, + "learning_rate": 1.674604251924185e-05, + "loss": 0.3302, + "step": 3615 + }, + { + "epoch": 0.28647256882551, + "grad_norm": 2.3806039299292823, + "learning_rate": 1.6744148104917705e-05, + "loss": 0.3794, + "step": 3616 + }, + { + "epoch": 0.28655179243414536, + "grad_norm": 2.159648060985913, + "learning_rate": 1.6742253246523856e-05, + "loss": 0.2889, + "step": 3617 + }, + { + "epoch": 0.28663101604278074, + "grad_norm": 1.7232120736412737, + "learning_rate": 1.6740357944185074e-05, + "loss": 0.2689, + "step": 3618 + }, + { + "epoch": 0.2867102396514161, + "grad_norm": 1.494482510615317, + "learning_rate": 1.6738462198026154e-05, + "loss": 0.2683, + "step": 3619 + }, + { + "epoch": 0.2867894632600515, + "grad_norm": 1.8418227970753027, + "learning_rate": 1.6736566008171925e-05, + "loss": 0.3004, + "step": 3620 + }, + { + "epoch": 0.2868686868686869, + "grad_norm": 2.1292358677841037, + "learning_rate": 1.6734669374747237e-05, + "loss": 0.3283, + "step": 3621 + }, + { + "epoch": 0.28694791047732227, + "grad_norm": 1.9393627711781438, + "learning_rate": 1.6732772297876975e-05, + "loss": 0.2932, + "step": 3622 + }, + { + "epoch": 0.2870271340859576, + "grad_norm": 1.7090163594453147, + "learning_rate": 1.6730874777686053e-05, + "loss": 0.2739, + "step": 3623 + }, + { + "epoch": 0.287106357694593, + "grad_norm": 1.7680539257104662, + "learning_rate": 1.6728976814299413e-05, + "loss": 0.2821, + "step": 3624 + }, + { + "epoch": 0.28718558130322835, + "grad_norm": 1.8415989304597915, + "learning_rate": 1.6727078407842028e-05, + "loss": 0.2834, + "step": 3625 + }, + { + "epoch": 0.28726480491186374, + "grad_norm": 2.4013207874511817, + "learning_rate": 1.67251795584389e-05, + "loss": 0.3087, + "step": 3626 + }, + { + "epoch": 0.2873440285204991, + "grad_norm": 1.8725678102029302, + "learning_rate": 1.6723280266215057e-05, + "loss": 0.2968, + "step": 3627 + }, + { + "epoch": 0.2874232521291345, + "grad_norm": 1.6671936102910183, + "learning_rate": 1.672138053129556e-05, + "loss": 0.2754, + "step": 3628 + }, + { + "epoch": 0.2875024757377699, + "grad_norm": 2.0201350153265984, + "learning_rate": 1.6719480353805493e-05, + "loss": 0.3754, + "step": 3629 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 2.154100983446601, + "learning_rate": 1.671757973386998e-05, + "loss": 0.3122, + "step": 3630 + }, + { + "epoch": 0.2876609229550406, + "grad_norm": 1.9757988376227502, + "learning_rate": 1.6715678671614162e-05, + "loss": 0.3703, + "step": 3631 + }, + { + "epoch": 0.28774014656367597, + "grad_norm": 1.785060455468134, + "learning_rate": 1.6713777167163215e-05, + "loss": 0.2668, + "step": 3632 + }, + { + "epoch": 0.28781937017231135, + "grad_norm": 1.8800928803189738, + "learning_rate": 1.6711875220642352e-05, + "loss": 0.3937, + "step": 3633 + }, + { + "epoch": 0.28789859378094673, + "grad_norm": 1.8377484951390568, + "learning_rate": 1.6709972832176797e-05, + "loss": 0.2776, + "step": 3634 + }, + { + "epoch": 0.2879778173895821, + "grad_norm": 2.32823713372016, + "learning_rate": 1.670807000189182e-05, + "loss": 0.3597, + "step": 3635 + }, + { + "epoch": 0.2880570409982175, + "grad_norm": 2.0583496697391555, + "learning_rate": 1.6706166729912712e-05, + "loss": 0.2811, + "step": 3636 + }, + { + "epoch": 0.2881362646068528, + "grad_norm": 2.401963084165021, + "learning_rate": 1.670426301636479e-05, + "loss": 0.3441, + "step": 3637 + }, + { + "epoch": 0.2882154882154882, + "grad_norm": 1.8959326868716981, + "learning_rate": 1.6702358861373408e-05, + "loss": 0.3094, + "step": 3638 + }, + { + "epoch": 0.2882947118241236, + "grad_norm": 2.033529054018751, + "learning_rate": 1.6700454265063943e-05, + "loss": 0.2835, + "step": 3639 + }, + { + "epoch": 0.28837393543275897, + "grad_norm": 2.2388221935456665, + "learning_rate": 1.6698549227561805e-05, + "loss": 0.2869, + "step": 3640 + }, + { + "epoch": 0.28845315904139435, + "grad_norm": 2.2217667724950734, + "learning_rate": 1.6696643748992434e-05, + "loss": 0.276, + "step": 3641 + }, + { + "epoch": 0.28853238265002973, + "grad_norm": 2.041045805619277, + "learning_rate": 1.6694737829481292e-05, + "loss": 0.2934, + "step": 3642 + }, + { + "epoch": 0.2886116062586651, + "grad_norm": 2.2057607575055673, + "learning_rate": 1.669283146915388e-05, + "loss": 0.3095, + "step": 3643 + }, + { + "epoch": 0.28869082986730044, + "grad_norm": 2.1933911590540935, + "learning_rate": 1.6690924668135718e-05, + "loss": 0.289, + "step": 3644 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 1.704166912810074, + "learning_rate": 1.668901742655236e-05, + "loss": 0.2769, + "step": 3645 + }, + { + "epoch": 0.2888492770845712, + "grad_norm": 2.2530365913442916, + "learning_rate": 1.6687109744529394e-05, + "loss": 0.2988, + "step": 3646 + }, + { + "epoch": 0.2889285006932066, + "grad_norm": 2.3520705511536604, + "learning_rate": 1.6685201622192422e-05, + "loss": 0.3506, + "step": 3647 + }, + { + "epoch": 0.28900772430184196, + "grad_norm": 1.9000939170637945, + "learning_rate": 1.6683293059667096e-05, + "loss": 0.2916, + "step": 3648 + }, + { + "epoch": 0.28908694791047734, + "grad_norm": 2.1541870784007275, + "learning_rate": 1.6681384057079076e-05, + "loss": 0.3038, + "step": 3649 + }, + { + "epoch": 0.28916617151911267, + "grad_norm": 2.0023161956333912, + "learning_rate": 1.6679474614554066e-05, + "loss": 0.3088, + "step": 3650 + }, + { + "epoch": 0.28924539512774805, + "grad_norm": 2.1054650941987907, + "learning_rate": 1.667756473221779e-05, + "loss": 0.2875, + "step": 3651 + }, + { + "epoch": 0.28932461873638343, + "grad_norm": 2.123819177462837, + "learning_rate": 1.667565441019601e-05, + "loss": 0.3562, + "step": 3652 + }, + { + "epoch": 0.2894038423450188, + "grad_norm": 1.569177301847685, + "learning_rate": 1.6673743648614507e-05, + "loss": 0.2562, + "step": 3653 + }, + { + "epoch": 0.2894830659536542, + "grad_norm": 1.780503024684975, + "learning_rate": 1.66718324475991e-05, + "loss": 0.3112, + "step": 3654 + }, + { + "epoch": 0.2895622895622896, + "grad_norm": 2.2597996091076364, + "learning_rate": 1.6669920807275622e-05, + "loss": 0.4138, + "step": 3655 + }, + { + "epoch": 0.28964151317092496, + "grad_norm": 2.2363210299760174, + "learning_rate": 1.666800872776996e-05, + "loss": 0.353, + "step": 3656 + }, + { + "epoch": 0.2897207367795603, + "grad_norm": 1.7204035314898236, + "learning_rate": 1.6666096209208e-05, + "loss": 0.2918, + "step": 3657 + }, + { + "epoch": 0.28979996038819567, + "grad_norm": 2.364315481744238, + "learning_rate": 1.6664183251715687e-05, + "loss": 0.4291, + "step": 3658 + }, + { + "epoch": 0.28987918399683105, + "grad_norm": 2.050546981189933, + "learning_rate": 1.666226985541897e-05, + "loss": 0.2882, + "step": 3659 + }, + { + "epoch": 0.28995840760546643, + "grad_norm": 1.8062537696302947, + "learning_rate": 1.666035602044384e-05, + "loss": 0.2331, + "step": 3660 + }, + { + "epoch": 0.2900376312141018, + "grad_norm": 2.297512308887384, + "learning_rate": 1.665844174691631e-05, + "loss": 0.2945, + "step": 3661 + }, + { + "epoch": 0.2901168548227372, + "grad_norm": 2.373566087823118, + "learning_rate": 1.6656527034962433e-05, + "loss": 0.4073, + "step": 3662 + }, + { + "epoch": 0.2901960784313726, + "grad_norm": 2.0224546324253048, + "learning_rate": 1.665461188470828e-05, + "loss": 0.248, + "step": 3663 + }, + { + "epoch": 0.2902753020400079, + "grad_norm": 2.031895320341666, + "learning_rate": 1.6652696296279954e-05, + "loss": 0.311, + "step": 3664 + }, + { + "epoch": 0.2903545256486433, + "grad_norm": 1.76040940252944, + "learning_rate": 1.6650780269803587e-05, + "loss": 0.2771, + "step": 3665 + }, + { + "epoch": 0.29043374925727866, + "grad_norm": 2.0534493421980953, + "learning_rate": 1.664886380540534e-05, + "loss": 0.3281, + "step": 3666 + }, + { + "epoch": 0.29051297286591404, + "grad_norm": 1.8487819573246012, + "learning_rate": 1.664694690321141e-05, + "loss": 0.3537, + "step": 3667 + }, + { + "epoch": 0.2905921964745494, + "grad_norm": 2.4302198185415307, + "learning_rate": 1.6645029563348e-05, + "loss": 0.3363, + "step": 3668 + }, + { + "epoch": 0.2906714200831848, + "grad_norm": 1.8870363659888052, + "learning_rate": 1.6643111785941374e-05, + "loss": 0.2672, + "step": 3669 + }, + { + "epoch": 0.2907506436918202, + "grad_norm": 2.3411402197049536, + "learning_rate": 1.66411935711178e-05, + "loss": 0.3507, + "step": 3670 + }, + { + "epoch": 0.2908298673004555, + "grad_norm": 1.991495491258046, + "learning_rate": 1.6639274919003582e-05, + "loss": 0.2544, + "step": 3671 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 1.8100244496728577, + "learning_rate": 1.6637355829725057e-05, + "loss": 0.3495, + "step": 3672 + }, + { + "epoch": 0.2909883145177263, + "grad_norm": 1.915294909409222, + "learning_rate": 1.663543630340859e-05, + "loss": 0.2296, + "step": 3673 + }, + { + "epoch": 0.29106753812636166, + "grad_norm": 1.7755773252871612, + "learning_rate": 1.6633516340180568e-05, + "loss": 0.3525, + "step": 3674 + }, + { + "epoch": 0.29114676173499704, + "grad_norm": 1.8516287853410067, + "learning_rate": 1.6631595940167416e-05, + "loss": 0.3264, + "step": 3675 + }, + { + "epoch": 0.2912259853436324, + "grad_norm": 2.0458824774406827, + "learning_rate": 1.662967510349558e-05, + "loss": 0.346, + "step": 3676 + }, + { + "epoch": 0.2913052089522678, + "grad_norm": 1.9528731325446507, + "learning_rate": 1.6627753830291536e-05, + "loss": 0.2949, + "step": 3677 + }, + { + "epoch": 0.29138443256090313, + "grad_norm": 1.9595910522122675, + "learning_rate": 1.6625832120681795e-05, + "loss": 0.3277, + "step": 3678 + }, + { + "epoch": 0.2914636561695385, + "grad_norm": 1.7109544432578585, + "learning_rate": 1.6623909974792888e-05, + "loss": 0.323, + "step": 3679 + }, + { + "epoch": 0.2915428797781739, + "grad_norm": 1.9111618608442615, + "learning_rate": 1.6621987392751385e-05, + "loss": 0.3251, + "step": 3680 + }, + { + "epoch": 0.2916221033868093, + "grad_norm": 1.9444067990918976, + "learning_rate": 1.6620064374683874e-05, + "loss": 0.2339, + "step": 3681 + }, + { + "epoch": 0.29170132699544465, + "grad_norm": 1.927498540419202, + "learning_rate": 1.6618140920716976e-05, + "loss": 0.2686, + "step": 3682 + }, + { + "epoch": 0.29178055060408004, + "grad_norm": 1.8758224916618815, + "learning_rate": 1.6616217030977345e-05, + "loss": 0.3062, + "step": 3683 + }, + { + "epoch": 0.29185977421271536, + "grad_norm": 1.9389046409615305, + "learning_rate": 1.6614292705591658e-05, + "loss": 0.3173, + "step": 3684 + }, + { + "epoch": 0.29193899782135074, + "grad_norm": 1.6636008108224105, + "learning_rate": 1.6612367944686617e-05, + "loss": 0.2208, + "step": 3685 + }, + { + "epoch": 0.2920182214299861, + "grad_norm": 1.9698010936256172, + "learning_rate": 1.6610442748388972e-05, + "loss": 0.2766, + "step": 3686 + }, + { + "epoch": 0.2920974450386215, + "grad_norm": 2.4567989572923685, + "learning_rate": 1.6608517116825473e-05, + "loss": 0.2604, + "step": 3687 + }, + { + "epoch": 0.2921766686472569, + "grad_norm": 2.2638678610882215, + "learning_rate": 1.6606591050122924e-05, + "loss": 0.3228, + "step": 3688 + }, + { + "epoch": 0.29225589225589227, + "grad_norm": 2.2446395650968203, + "learning_rate": 1.660466454840814e-05, + "loss": 0.3117, + "step": 3689 + }, + { + "epoch": 0.29233511586452765, + "grad_norm": 1.847599156969952, + "learning_rate": 1.6602737611807975e-05, + "loss": 0.2522, + "step": 3690 + }, + { + "epoch": 0.292414339473163, + "grad_norm": 2.2433120141663165, + "learning_rate": 1.660081024044931e-05, + "loss": 0.2587, + "step": 3691 + }, + { + "epoch": 0.29249356308179836, + "grad_norm": 2.2521143359537272, + "learning_rate": 1.659888243445905e-05, + "loss": 0.3491, + "step": 3692 + }, + { + "epoch": 0.29257278669043374, + "grad_norm": 2.152709080731402, + "learning_rate": 1.6596954193964136e-05, + "loss": 0.281, + "step": 3693 + }, + { + "epoch": 0.2926520102990691, + "grad_norm": 1.9102095064229225, + "learning_rate": 1.659502551909153e-05, + "loss": 0.2164, + "step": 3694 + }, + { + "epoch": 0.2927312339077045, + "grad_norm": 1.8576841611316988, + "learning_rate": 1.6593096409968227e-05, + "loss": 0.3501, + "step": 3695 + }, + { + "epoch": 0.2928104575163399, + "grad_norm": 2.143174160634584, + "learning_rate": 1.6591166866721247e-05, + "loss": 0.2585, + "step": 3696 + }, + { + "epoch": 0.29288968112497527, + "grad_norm": 1.8322238417572019, + "learning_rate": 1.658923688947765e-05, + "loss": 0.3295, + "step": 3697 + }, + { + "epoch": 0.2929689047336106, + "grad_norm": 2.188756027766657, + "learning_rate": 1.6587306478364502e-05, + "loss": 0.2511, + "step": 3698 + }, + { + "epoch": 0.293048128342246, + "grad_norm": 2.0800860540775936, + "learning_rate": 1.658537563350892e-05, + "loss": 0.2951, + "step": 3699 + }, + { + "epoch": 0.29312735195088135, + "grad_norm": 2.120419441630339, + "learning_rate": 1.6583444355038042e-05, + "loss": 0.3684, + "step": 3700 + }, + { + "epoch": 0.29320657555951674, + "grad_norm": 1.5402254572628864, + "learning_rate": 1.6581512643079028e-05, + "loss": 0.1987, + "step": 3701 + }, + { + "epoch": 0.2932857991681521, + "grad_norm": 1.8139736902630164, + "learning_rate": 1.657958049775908e-05, + "loss": 0.2584, + "step": 3702 + }, + { + "epoch": 0.2933650227767875, + "grad_norm": 1.6039216041841664, + "learning_rate": 1.6577647919205407e-05, + "loss": 0.232, + "step": 3703 + }, + { + "epoch": 0.2934442463854229, + "grad_norm": 2.230475177625442, + "learning_rate": 1.6575714907545272e-05, + "loss": 0.3204, + "step": 3704 + }, + { + "epoch": 0.2935234699940582, + "grad_norm": 1.9072050700736505, + "learning_rate": 1.6573781462905954e-05, + "loss": 0.2088, + "step": 3705 + }, + { + "epoch": 0.2936026936026936, + "grad_norm": 2.052871951760179, + "learning_rate": 1.6571847585414754e-05, + "loss": 0.291, + "step": 3706 + }, + { + "epoch": 0.29368191721132897, + "grad_norm": 2.04222020295031, + "learning_rate": 1.6569913275199013e-05, + "loss": 0.3721, + "step": 3707 + }, + { + "epoch": 0.29376114081996435, + "grad_norm": 1.9967492969852367, + "learning_rate": 1.6567978532386094e-05, + "loss": 0.2808, + "step": 3708 + }, + { + "epoch": 0.29384036442859973, + "grad_norm": 2.1316194578421475, + "learning_rate": 1.6566043357103393e-05, + "loss": 0.2834, + "step": 3709 + }, + { + "epoch": 0.2939195880372351, + "grad_norm": 2.151667388197471, + "learning_rate": 1.656410774947833e-05, + "loss": 0.3011, + "step": 3710 + }, + { + "epoch": 0.2939988116458705, + "grad_norm": 2.0424733100685235, + "learning_rate": 1.6562171709638355e-05, + "loss": 0.3393, + "step": 3711 + }, + { + "epoch": 0.2940780352545058, + "grad_norm": 2.2135287829523933, + "learning_rate": 1.656023523771095e-05, + "loss": 0.3026, + "step": 3712 + }, + { + "epoch": 0.2941572588631412, + "grad_norm": 1.9148831504492885, + "learning_rate": 1.655829833382362e-05, + "loss": 0.2804, + "step": 3713 + }, + { + "epoch": 0.2942364824717766, + "grad_norm": 2.185569236254536, + "learning_rate": 1.6556360998103903e-05, + "loss": 0.4382, + "step": 3714 + }, + { + "epoch": 0.29431570608041197, + "grad_norm": 1.9581195776980467, + "learning_rate": 1.655442323067936e-05, + "loss": 0.2919, + "step": 3715 + }, + { + "epoch": 0.29439492968904735, + "grad_norm": 2.3721499609991166, + "learning_rate": 1.6552485031677586e-05, + "loss": 0.3242, + "step": 3716 + }, + { + "epoch": 0.29447415329768273, + "grad_norm": 1.9665744121968807, + "learning_rate": 1.65505464012262e-05, + "loss": 0.267, + "step": 3717 + }, + { + "epoch": 0.2945533769063181, + "grad_norm": 2.041080822940283, + "learning_rate": 1.6548607339452853e-05, + "loss": 0.2669, + "step": 3718 + }, + { + "epoch": 0.29463260051495344, + "grad_norm": 1.9415721706418003, + "learning_rate": 1.6546667846485224e-05, + "loss": 0.3438, + "step": 3719 + }, + { + "epoch": 0.2947118241235888, + "grad_norm": 1.7190684876015117, + "learning_rate": 1.6544727922451014e-05, + "loss": 0.2639, + "step": 3720 + }, + { + "epoch": 0.2947910477322242, + "grad_norm": 2.1085594563983614, + "learning_rate": 1.654278756747796e-05, + "loss": 0.3244, + "step": 3721 + }, + { + "epoch": 0.2948702713408596, + "grad_norm": 1.5993923912005714, + "learning_rate": 1.6540846781693837e-05, + "loss": 0.2671, + "step": 3722 + }, + { + "epoch": 0.29494949494949496, + "grad_norm": 1.9621948229440052, + "learning_rate": 1.6538905565226416e-05, + "loss": 0.2675, + "step": 3723 + }, + { + "epoch": 0.29502871855813034, + "grad_norm": 1.6014012615104973, + "learning_rate": 1.6536963918203532e-05, + "loss": 0.2627, + "step": 3724 + }, + { + "epoch": 0.29510794216676567, + "grad_norm": 2.1318802196688704, + "learning_rate": 1.6535021840753026e-05, + "loss": 0.3222, + "step": 3725 + }, + { + "epoch": 0.29518716577540105, + "grad_norm": 1.7809032779562033, + "learning_rate": 1.6533079333002775e-05, + "loss": 0.4058, + "step": 3726 + }, + { + "epoch": 0.29526638938403643, + "grad_norm": 1.926384773417309, + "learning_rate": 1.6531136395080687e-05, + "loss": 0.3357, + "step": 3727 + }, + { + "epoch": 0.2953456129926718, + "grad_norm": 1.8914574602303056, + "learning_rate": 1.6529193027114692e-05, + "loss": 0.3571, + "step": 3728 + }, + { + "epoch": 0.2954248366013072, + "grad_norm": 2.068030663470857, + "learning_rate": 1.6527249229232754e-05, + "loss": 0.3875, + "step": 3729 + }, + { + "epoch": 0.2955040602099426, + "grad_norm": 1.6786627830455285, + "learning_rate": 1.652530500156286e-05, + "loss": 0.2264, + "step": 3730 + }, + { + "epoch": 0.29558328381857796, + "grad_norm": 2.136329576259284, + "learning_rate": 1.652336034423303e-05, + "loss": 0.4239, + "step": 3731 + }, + { + "epoch": 0.2956625074272133, + "grad_norm": 1.7843910142418054, + "learning_rate": 1.6521415257371312e-05, + "loss": 0.2254, + "step": 3732 + }, + { + "epoch": 0.29574173103584867, + "grad_norm": 2.548053482969655, + "learning_rate": 1.6519469741105777e-05, + "loss": 0.396, + "step": 3733 + }, + { + "epoch": 0.29582095464448405, + "grad_norm": 1.8438518991444643, + "learning_rate": 1.6517523795564527e-05, + "loss": 0.3635, + "step": 3734 + }, + { + "epoch": 0.29590017825311943, + "grad_norm": 1.8387940819084987, + "learning_rate": 1.6515577420875698e-05, + "loss": 0.2378, + "step": 3735 + }, + { + "epoch": 0.2959794018617548, + "grad_norm": 1.8352681078412778, + "learning_rate": 1.6513630617167446e-05, + "loss": 0.3685, + "step": 3736 + }, + { + "epoch": 0.2960586254703902, + "grad_norm": 1.8922883255051839, + "learning_rate": 1.6511683384567957e-05, + "loss": 0.262, + "step": 3737 + }, + { + "epoch": 0.2961378490790256, + "grad_norm": 1.9879691389536898, + "learning_rate": 1.6509735723205453e-05, + "loss": 0.2826, + "step": 3738 + }, + { + "epoch": 0.2962170726876609, + "grad_norm": 2.562638275860836, + "learning_rate": 1.6507787633208173e-05, + "loss": 0.425, + "step": 3739 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 1.8254178860060655, + "learning_rate": 1.650583911470439e-05, + "loss": 0.2448, + "step": 3740 + }, + { + "epoch": 0.29637551990493166, + "grad_norm": 2.1200297511576824, + "learning_rate": 1.6503890167822406e-05, + "loss": 0.3824, + "step": 3741 + }, + { + "epoch": 0.29645474351356704, + "grad_norm": 2.2952838979787935, + "learning_rate": 1.6501940792690547e-05, + "loss": 0.2587, + "step": 3742 + }, + { + "epoch": 0.2965339671222024, + "grad_norm": 2.10791466760142, + "learning_rate": 1.6499990989437177e-05, + "loss": 0.354, + "step": 3743 + }, + { + "epoch": 0.2966131907308378, + "grad_norm": 2.179039323045411, + "learning_rate": 1.6498040758190673e-05, + "loss": 0.2955, + "step": 3744 + }, + { + "epoch": 0.2966924143394732, + "grad_norm": 1.8339928189125239, + "learning_rate": 1.6496090099079452e-05, + "loss": 0.2792, + "step": 3745 + }, + { + "epoch": 0.2967716379481085, + "grad_norm": 1.7664076519134926, + "learning_rate": 1.6494139012231954e-05, + "loss": 0.2318, + "step": 3746 + }, + { + "epoch": 0.2968508615567439, + "grad_norm": 2.476074006188833, + "learning_rate": 1.6492187497776654e-05, + "loss": 0.4235, + "step": 3747 + }, + { + "epoch": 0.2969300851653793, + "grad_norm": 1.8606425938848785, + "learning_rate": 1.6490235555842044e-05, + "loss": 0.2253, + "step": 3748 + }, + { + "epoch": 0.29700930877401466, + "grad_norm": 2.2264343648593554, + "learning_rate": 1.6488283186556648e-05, + "loss": 0.2951, + "step": 3749 + }, + { + "epoch": 0.29708853238265004, + "grad_norm": 2.184795345147084, + "learning_rate": 1.6486330390049027e-05, + "loss": 0.4208, + "step": 3750 + }, + { + "epoch": 0.2971677559912854, + "grad_norm": 2.941243300926836, + "learning_rate": 1.648437716644776e-05, + "loss": 0.2681, + "step": 3751 + }, + { + "epoch": 0.2972469795999208, + "grad_norm": 2.3336301897213256, + "learning_rate": 1.6482423515881455e-05, + "loss": 0.4012, + "step": 3752 + }, + { + "epoch": 0.29732620320855613, + "grad_norm": 1.9028148726439165, + "learning_rate": 1.6480469438478756e-05, + "loss": 0.2722, + "step": 3753 + }, + { + "epoch": 0.2974054268171915, + "grad_norm": 2.21444199732445, + "learning_rate": 1.6478514934368326e-05, + "loss": 0.3385, + "step": 3754 + }, + { + "epoch": 0.2974846504258269, + "grad_norm": 2.1803562965814844, + "learning_rate": 1.647656000367886e-05, + "loss": 0.2866, + "step": 3755 + }, + { + "epoch": 0.2975638740344623, + "grad_norm": 2.3484328394467147, + "learning_rate": 1.647460464653908e-05, + "loss": 0.3471, + "step": 3756 + }, + { + "epoch": 0.29764309764309765, + "grad_norm": 2.068600645448547, + "learning_rate": 1.6472648863077737e-05, + "loss": 0.2878, + "step": 3757 + }, + { + "epoch": 0.29772232125173304, + "grad_norm": 1.8594561751261733, + "learning_rate": 1.6470692653423614e-05, + "loss": 0.2747, + "step": 3758 + }, + { + "epoch": 0.2978015448603684, + "grad_norm": 1.9276118550341557, + "learning_rate": 1.6468736017705515e-05, + "loss": 0.386, + "step": 3759 + }, + { + "epoch": 0.29788076846900374, + "grad_norm": 2.3310174829201693, + "learning_rate": 1.646677895605227e-05, + "loss": 0.2957, + "step": 3760 + }, + { + "epoch": 0.2979599920776391, + "grad_norm": 2.1831717495372347, + "learning_rate": 1.6464821468592748e-05, + "loss": 0.362, + "step": 3761 + }, + { + "epoch": 0.2980392156862745, + "grad_norm": 1.803552518183649, + "learning_rate": 1.646286355545584e-05, + "loss": 0.2752, + "step": 3762 + }, + { + "epoch": 0.2981184392949099, + "grad_norm": 1.7538268891622637, + "learning_rate": 1.6460905216770467e-05, + "loss": 0.278, + "step": 3763 + }, + { + "epoch": 0.29819766290354527, + "grad_norm": 2.091933796751879, + "learning_rate": 1.6458946452665573e-05, + "loss": 0.3804, + "step": 3764 + }, + { + "epoch": 0.29827688651218065, + "grad_norm": 1.8890602600421795, + "learning_rate": 1.6456987263270132e-05, + "loss": 0.3151, + "step": 3765 + }, + { + "epoch": 0.298356110120816, + "grad_norm": 2.1169401638108316, + "learning_rate": 1.645502764871315e-05, + "loss": 0.4476, + "step": 3766 + }, + { + "epoch": 0.29843533372945136, + "grad_norm": 2.01799098792038, + "learning_rate": 1.6453067609123656e-05, + "loss": 0.3729, + "step": 3767 + }, + { + "epoch": 0.29851455733808674, + "grad_norm": 1.8230427343156717, + "learning_rate": 1.6451107144630708e-05, + "loss": 0.3823, + "step": 3768 + }, + { + "epoch": 0.2985937809467221, + "grad_norm": 2.084563839398148, + "learning_rate": 1.6449146255363395e-05, + "loss": 0.2655, + "step": 3769 + }, + { + "epoch": 0.2986730045553575, + "grad_norm": 1.6726094374073701, + "learning_rate": 1.6447184941450833e-05, + "loss": 0.209, + "step": 3770 + }, + { + "epoch": 0.2987522281639929, + "grad_norm": 1.7142935177218888, + "learning_rate": 1.644522320302217e-05, + "loss": 0.3637, + "step": 3771 + }, + { + "epoch": 0.29883145177262826, + "grad_norm": 2.0405684388935055, + "learning_rate": 1.6443261040206566e-05, + "loss": 0.2907, + "step": 3772 + }, + { + "epoch": 0.2989106753812636, + "grad_norm": 2.405996813502771, + "learning_rate": 1.6441298453133224e-05, + "loss": 0.4359, + "step": 3773 + }, + { + "epoch": 0.298989898989899, + "grad_norm": 2.0765645921548774, + "learning_rate": 1.6439335441931376e-05, + "loss": 0.3118, + "step": 3774 + }, + { + "epoch": 0.29906912259853435, + "grad_norm": 1.6769594478279577, + "learning_rate": 1.6437372006730276e-05, + "loss": 0.2352, + "step": 3775 + }, + { + "epoch": 0.29914834620716974, + "grad_norm": 2.144648243792654, + "learning_rate": 1.64354081476592e-05, + "loss": 0.3472, + "step": 3776 + }, + { + "epoch": 0.2992275698158051, + "grad_norm": 1.8940517249820277, + "learning_rate": 1.643344386484746e-05, + "loss": 0.3558, + "step": 3777 + }, + { + "epoch": 0.2993067934244405, + "grad_norm": 1.9081919128302902, + "learning_rate": 1.64314791584244e-05, + "loss": 0.3174, + "step": 3778 + }, + { + "epoch": 0.2993860170330759, + "grad_norm": 2.1805329539112086, + "learning_rate": 1.6429514028519383e-05, + "loss": 0.2623, + "step": 3779 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 1.5074215594426112, + "learning_rate": 1.6427548475261807e-05, + "loss": 0.2614, + "step": 3780 + }, + { + "epoch": 0.2995444642503466, + "grad_norm": 2.062324044594821, + "learning_rate": 1.642558249878109e-05, + "loss": 0.2816, + "step": 3781 + }, + { + "epoch": 0.29962368785898197, + "grad_norm": 2.176309277440953, + "learning_rate": 1.642361609920668e-05, + "loss": 0.2924, + "step": 3782 + }, + { + "epoch": 0.29970291146761735, + "grad_norm": 1.5885254544296088, + "learning_rate": 1.6421649276668065e-05, + "loss": 0.1966, + "step": 3783 + }, + { + "epoch": 0.29978213507625273, + "grad_norm": 2.109245576191315, + "learning_rate": 1.641968203129474e-05, + "loss": 0.326, + "step": 3784 + }, + { + "epoch": 0.2998613586848881, + "grad_norm": 1.7084069328818037, + "learning_rate": 1.641771436321624e-05, + "loss": 0.239, + "step": 3785 + }, + { + "epoch": 0.2999405822935235, + "grad_norm": 2.208359135301702, + "learning_rate": 1.6415746272562133e-05, + "loss": 0.2518, + "step": 3786 + }, + { + "epoch": 0.3000198059021588, + "grad_norm": 2.1678587978477712, + "learning_rate": 1.6413777759462005e-05, + "loss": 0.3437, + "step": 3787 + }, + { + "epoch": 0.3000990295107942, + "grad_norm": 2.112349183380899, + "learning_rate": 1.6411808824045472e-05, + "loss": 0.3893, + "step": 3788 + }, + { + "epoch": 0.3001782531194296, + "grad_norm": 2.1863989847052876, + "learning_rate": 1.640983946644218e-05, + "loss": 0.3599, + "step": 3789 + }, + { + "epoch": 0.30025747672806496, + "grad_norm": 2.1255369291140402, + "learning_rate": 1.64078696867818e-05, + "loss": 0.2255, + "step": 3790 + }, + { + "epoch": 0.30033670033670035, + "grad_norm": 2.007314992038718, + "learning_rate": 1.6405899485194034e-05, + "loss": 0.2372, + "step": 3791 + }, + { + "epoch": 0.3004159239453357, + "grad_norm": 2.101018394324457, + "learning_rate": 1.640392886180861e-05, + "loss": 0.3408, + "step": 3792 + }, + { + "epoch": 0.3004951475539711, + "grad_norm": 2.028528609212532, + "learning_rate": 1.6401957816755286e-05, + "loss": 0.2749, + "step": 3793 + }, + { + "epoch": 0.30057437116260644, + "grad_norm": 1.9926545159975135, + "learning_rate": 1.6399986350163844e-05, + "loss": 0.3149, + "step": 3794 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 2.055560333375988, + "learning_rate": 1.6398014462164093e-05, + "loss": 0.2956, + "step": 3795 + }, + { + "epoch": 0.3007328183798772, + "grad_norm": 2.0950262286077628, + "learning_rate": 1.6396042152885874e-05, + "loss": 0.2574, + "step": 3796 + }, + { + "epoch": 0.3008120419885126, + "grad_norm": 1.9932789912831277, + "learning_rate": 1.639406942245906e-05, + "loss": 0.2998, + "step": 3797 + }, + { + "epoch": 0.30089126559714796, + "grad_norm": 2.5109742138400444, + "learning_rate": 1.639209627101354e-05, + "loss": 0.4138, + "step": 3798 + }, + { + "epoch": 0.30097048920578334, + "grad_norm": 1.9310931595035867, + "learning_rate": 1.6390122698679234e-05, + "loss": 0.2583, + "step": 3799 + }, + { + "epoch": 0.3010497128144187, + "grad_norm": 1.9254912693992525, + "learning_rate": 1.6388148705586097e-05, + "loss": 0.31, + "step": 3800 + }, + { + "epoch": 0.30112893642305405, + "grad_norm": 1.8593630247204218, + "learning_rate": 1.6386174291864106e-05, + "loss": 0.2727, + "step": 3801 + }, + { + "epoch": 0.30120816003168943, + "grad_norm": 2.1329915531258012, + "learning_rate": 1.6384199457643264e-05, + "loss": 0.3383, + "step": 3802 + }, + { + "epoch": 0.3012873836403248, + "grad_norm": 1.824526030686308, + "learning_rate": 1.6382224203053607e-05, + "loss": 0.242, + "step": 3803 + }, + { + "epoch": 0.3013666072489602, + "grad_norm": 1.8891658764309938, + "learning_rate": 1.6380248528225197e-05, + "loss": 0.2673, + "step": 3804 + }, + { + "epoch": 0.3014458308575956, + "grad_norm": 2.113830019227714, + "learning_rate": 1.6378272433288122e-05, + "loss": 0.2756, + "step": 3805 + }, + { + "epoch": 0.30152505446623096, + "grad_norm": 1.9963541226080466, + "learning_rate": 1.6376295918372495e-05, + "loss": 0.384, + "step": 3806 + }, + { + "epoch": 0.3016042780748663, + "grad_norm": 2.198575301925684, + "learning_rate": 1.6374318983608464e-05, + "loss": 0.4783, + "step": 3807 + }, + { + "epoch": 0.30168350168350166, + "grad_norm": 2.0902659992128267, + "learning_rate": 1.63723416291262e-05, + "loss": 0.3417, + "step": 3808 + }, + { + "epoch": 0.30176272529213705, + "grad_norm": 1.9773956755356854, + "learning_rate": 1.63703638550559e-05, + "loss": 0.2949, + "step": 3809 + }, + { + "epoch": 0.3018419489007724, + "grad_norm": 2.143615139345269, + "learning_rate": 1.6368385661527795e-05, + "loss": 0.3105, + "step": 3810 + }, + { + "epoch": 0.3019211725094078, + "grad_norm": 1.9329996022195897, + "learning_rate": 1.6366407048672135e-05, + "loss": 0.2392, + "step": 3811 + }, + { + "epoch": 0.3020003961180432, + "grad_norm": 2.576074000705922, + "learning_rate": 1.6364428016619202e-05, + "loss": 0.2952, + "step": 3812 + }, + { + "epoch": 0.30207961972667857, + "grad_norm": 1.8022116453183235, + "learning_rate": 1.636244856549931e-05, + "loss": 0.2539, + "step": 3813 + }, + { + "epoch": 0.3021588433353139, + "grad_norm": 1.9554280598292493, + "learning_rate": 1.6360468695442797e-05, + "loss": 0.3017, + "step": 3814 + }, + { + "epoch": 0.3022380669439493, + "grad_norm": 2.1463715114075987, + "learning_rate": 1.6358488406580023e-05, + "loss": 0.3389, + "step": 3815 + }, + { + "epoch": 0.30231729055258466, + "grad_norm": 1.9529828410318093, + "learning_rate": 1.635650769904138e-05, + "loss": 0.3936, + "step": 3816 + }, + { + "epoch": 0.30239651416122004, + "grad_norm": 2.2422501080785056, + "learning_rate": 1.6354526572957292e-05, + "loss": 0.4553, + "step": 3817 + }, + { + "epoch": 0.3024757377698554, + "grad_norm": 2.0914354626421225, + "learning_rate": 1.6352545028458206e-05, + "loss": 0.3765, + "step": 3818 + }, + { + "epoch": 0.3025549613784908, + "grad_norm": 1.772903823894528, + "learning_rate": 1.6350563065674596e-05, + "loss": 0.2531, + "step": 3819 + }, + { + "epoch": 0.3026341849871262, + "grad_norm": 1.8396313762919148, + "learning_rate": 1.6348580684736962e-05, + "loss": 0.344, + "step": 3820 + }, + { + "epoch": 0.3027134085957615, + "grad_norm": 1.8627998950990017, + "learning_rate": 1.6346597885775843e-05, + "loss": 0.1968, + "step": 3821 + }, + { + "epoch": 0.3027926322043969, + "grad_norm": 1.8869015693376312, + "learning_rate": 1.6344614668921787e-05, + "loss": 0.3799, + "step": 3822 + }, + { + "epoch": 0.3028718558130323, + "grad_norm": 1.8189324039238206, + "learning_rate": 1.6342631034305386e-05, + "loss": 0.3318, + "step": 3823 + }, + { + "epoch": 0.30295107942166766, + "grad_norm": 1.871169467204066, + "learning_rate": 1.634064698205725e-05, + "loss": 0.2926, + "step": 3824 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.7389531760214991, + "learning_rate": 1.6338662512308013e-05, + "loss": 0.2721, + "step": 3825 + }, + { + "epoch": 0.3031095266389384, + "grad_norm": 1.7984709446641185, + "learning_rate": 1.6336677625188357e-05, + "loss": 0.2766, + "step": 3826 + }, + { + "epoch": 0.3031887502475738, + "grad_norm": 1.8646547821936554, + "learning_rate": 1.6334692320828968e-05, + "loss": 0.2688, + "step": 3827 + }, + { + "epoch": 0.3032679738562091, + "grad_norm": 1.7287855373854593, + "learning_rate": 1.6332706599360568e-05, + "loss": 0.2644, + "step": 3828 + }, + { + "epoch": 0.3033471974648445, + "grad_norm": 1.9694960688593486, + "learning_rate": 1.633072046091391e-05, + "loss": 0.3001, + "step": 3829 + }, + { + "epoch": 0.3034264210734799, + "grad_norm": 1.6664756303722617, + "learning_rate": 1.6328733905619775e-05, + "loss": 0.2419, + "step": 3830 + }, + { + "epoch": 0.30350564468211527, + "grad_norm": 2.274382029681345, + "learning_rate": 1.632674693360896e-05, + "loss": 0.3167, + "step": 3831 + }, + { + "epoch": 0.30358486829075065, + "grad_norm": 1.9274798710578476, + "learning_rate": 1.6324759545012306e-05, + "loss": 0.3752, + "step": 3832 + }, + { + "epoch": 0.30366409189938603, + "grad_norm": 1.897628306665924, + "learning_rate": 1.6322771739960664e-05, + "loss": 0.275, + "step": 3833 + }, + { + "epoch": 0.3037433155080214, + "grad_norm": 1.5999477754897036, + "learning_rate": 1.6320783518584926e-05, + "loss": 0.2875, + "step": 3834 + }, + { + "epoch": 0.30382253911665674, + "grad_norm": 1.8498667124553432, + "learning_rate": 1.631879488101601e-05, + "loss": 0.4053, + "step": 3835 + }, + { + "epoch": 0.3039017627252921, + "grad_norm": 1.995589468587847, + "learning_rate": 1.6316805827384856e-05, + "loss": 0.2893, + "step": 3836 + }, + { + "epoch": 0.3039809863339275, + "grad_norm": 2.803428337819996, + "learning_rate": 1.631481635782243e-05, + "loss": 0.3647, + "step": 3837 + }, + { + "epoch": 0.3040602099425629, + "grad_norm": 1.6973795930868885, + "learning_rate": 1.631282647245973e-05, + "loss": 0.2832, + "step": 3838 + }, + { + "epoch": 0.30413943355119827, + "grad_norm": 2.449413131235091, + "learning_rate": 1.6310836171427788e-05, + "loss": 0.272, + "step": 3839 + }, + { + "epoch": 0.30421865715983365, + "grad_norm": 2.160290289403697, + "learning_rate": 1.6308845454857647e-05, + "loss": 0.3483, + "step": 3840 + }, + { + "epoch": 0.30429788076846903, + "grad_norm": 1.4587392739605265, + "learning_rate": 1.6306854322880386e-05, + "loss": 0.2479, + "step": 3841 + }, + { + "epoch": 0.30437710437710436, + "grad_norm": 1.9545761659315637, + "learning_rate": 1.630486277562712e-05, + "loss": 0.361, + "step": 3842 + }, + { + "epoch": 0.30445632798573974, + "grad_norm": 2.2952174193052786, + "learning_rate": 1.6302870813228974e-05, + "loss": 0.3846, + "step": 3843 + }, + { + "epoch": 0.3045355515943751, + "grad_norm": 1.959285552471317, + "learning_rate": 1.6300878435817115e-05, + "loss": 0.3198, + "step": 3844 + }, + { + "epoch": 0.3046147752030105, + "grad_norm": 2.0954925150260677, + "learning_rate": 1.6298885643522724e-05, + "loss": 0.3323, + "step": 3845 + }, + { + "epoch": 0.3046939988116459, + "grad_norm": 1.5629762722753553, + "learning_rate": 1.6296892436477024e-05, + "loss": 0.2023, + "step": 3846 + }, + { + "epoch": 0.30477322242028126, + "grad_norm": 1.880133908214487, + "learning_rate": 1.6294898814811258e-05, + "loss": 0.2758, + "step": 3847 + }, + { + "epoch": 0.3048524460289166, + "grad_norm": 2.136160045692224, + "learning_rate": 1.629290477865669e-05, + "loss": 0.3238, + "step": 3848 + }, + { + "epoch": 0.30493166963755197, + "grad_norm": 2.160497272247849, + "learning_rate": 1.6290910328144627e-05, + "loss": 0.3461, + "step": 3849 + }, + { + "epoch": 0.30501089324618735, + "grad_norm": 2.421538222789173, + "learning_rate": 1.6288915463406386e-05, + "loss": 0.2588, + "step": 3850 + }, + { + "epoch": 0.30509011685482273, + "grad_norm": 1.9690331361034967, + "learning_rate": 1.6286920184573324e-05, + "loss": 0.3828, + "step": 3851 + }, + { + "epoch": 0.3051693404634581, + "grad_norm": 1.7584469761187513, + "learning_rate": 1.6284924491776815e-05, + "loss": 0.2607, + "step": 3852 + }, + { + "epoch": 0.3052485640720935, + "grad_norm": 2.243330961403839, + "learning_rate": 1.6282928385148273e-05, + "loss": 0.4174, + "step": 3853 + }, + { + "epoch": 0.3053277876807289, + "grad_norm": 1.8479340977378784, + "learning_rate": 1.6280931864819125e-05, + "loss": 0.2729, + "step": 3854 + }, + { + "epoch": 0.3054070112893642, + "grad_norm": 2.1521448731130217, + "learning_rate": 1.6278934930920834e-05, + "loss": 0.4796, + "step": 3855 + }, + { + "epoch": 0.3054862348979996, + "grad_norm": 1.6216592532715903, + "learning_rate": 1.6276937583584895e-05, + "loss": 0.2868, + "step": 3856 + }, + { + "epoch": 0.30556545850663497, + "grad_norm": 1.7803228248602432, + "learning_rate": 1.6274939822942818e-05, + "loss": 0.3352, + "step": 3857 + }, + { + "epoch": 0.30564468211527035, + "grad_norm": 2.167989785038989, + "learning_rate": 1.6272941649126146e-05, + "loss": 0.3192, + "step": 3858 + }, + { + "epoch": 0.30572390572390573, + "grad_norm": 2.2836151215559877, + "learning_rate": 1.627094306226645e-05, + "loss": 0.4233, + "step": 3859 + }, + { + "epoch": 0.3058031293325411, + "grad_norm": 1.7829398001596515, + "learning_rate": 1.6268944062495324e-05, + "loss": 0.4216, + "step": 3860 + }, + { + "epoch": 0.3058823529411765, + "grad_norm": 2.3748966718491284, + "learning_rate": 1.62669446499444e-05, + "loss": 0.3166, + "step": 3861 + }, + { + "epoch": 0.3059615765498118, + "grad_norm": 1.957610300936025, + "learning_rate": 1.6264944824745326e-05, + "loss": 0.355, + "step": 3862 + }, + { + "epoch": 0.3060408001584472, + "grad_norm": 1.9293191358750181, + "learning_rate": 1.6262944587029777e-05, + "loss": 0.3151, + "step": 3863 + }, + { + "epoch": 0.3061200237670826, + "grad_norm": 1.9989718286163354, + "learning_rate": 1.6260943936929462e-05, + "loss": 0.3056, + "step": 3864 + }, + { + "epoch": 0.30619924737571796, + "grad_norm": 2.1499105687490885, + "learning_rate": 1.6258942874576117e-05, + "loss": 0.3503, + "step": 3865 + }, + { + "epoch": 0.30627847098435335, + "grad_norm": 2.8254402425089244, + "learning_rate": 1.62569414001015e-05, + "loss": 0.3933, + "step": 3866 + }, + { + "epoch": 0.3063576945929887, + "grad_norm": 2.1122167124721005, + "learning_rate": 1.6254939513637397e-05, + "loss": 0.2725, + "step": 3867 + }, + { + "epoch": 0.3064369182016241, + "grad_norm": 1.40117053660708, + "learning_rate": 1.6252937215315622e-05, + "loss": 0.2355, + "step": 3868 + }, + { + "epoch": 0.30651614181025943, + "grad_norm": 2.6993801709257004, + "learning_rate": 1.6250934505268025e-05, + "loss": 0.2807, + "step": 3869 + }, + { + "epoch": 0.3065953654188948, + "grad_norm": 2.0841346484136425, + "learning_rate": 1.6248931383626464e-05, + "loss": 0.3336, + "step": 3870 + }, + { + "epoch": 0.3066745890275302, + "grad_norm": 2.2435399692401563, + "learning_rate": 1.6246927850522837e-05, + "loss": 0.3394, + "step": 3871 + }, + { + "epoch": 0.3067538126361656, + "grad_norm": 1.9419838099242361, + "learning_rate": 1.624492390608907e-05, + "loss": 0.2795, + "step": 3872 + }, + { + "epoch": 0.30683303624480096, + "grad_norm": 2.26851489961783, + "learning_rate": 1.6242919550457116e-05, + "loss": 0.2961, + "step": 3873 + }, + { + "epoch": 0.30691225985343634, + "grad_norm": 1.8683617538131634, + "learning_rate": 1.6240914783758946e-05, + "loss": 0.2729, + "step": 3874 + }, + { + "epoch": 0.3069914834620717, + "grad_norm": 2.577863566015914, + "learning_rate": 1.6238909606126568e-05, + "loss": 0.4546, + "step": 3875 + }, + { + "epoch": 0.30707070707070705, + "grad_norm": 1.614941693031965, + "learning_rate": 1.6236904017692016e-05, + "loss": 0.2562, + "step": 3876 + }, + { + "epoch": 0.30714993067934243, + "grad_norm": 1.7471133075748322, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.1715, + "step": 3877 + }, + { + "epoch": 0.3072291542879778, + "grad_norm": 2.0440620824160054, + "learning_rate": 1.6232891608944627e-05, + "loss": 0.4638, + "step": 3878 + }, + { + "epoch": 0.3073083778966132, + "grad_norm": 2.211795620200163, + "learning_rate": 1.6230884788895998e-05, + "loss": 0.324, + "step": 3879 + }, + { + "epoch": 0.3073876015052486, + "grad_norm": 2.123843471148046, + "learning_rate": 1.622887755857358e-05, + "loss": 0.4222, + "step": 3880 + }, + { + "epoch": 0.30746682511388396, + "grad_norm": 2.3514130537858247, + "learning_rate": 1.6226869918109553e-05, + "loss": 0.3576, + "step": 3881 + }, + { + "epoch": 0.30754604872251934, + "grad_norm": 1.9322503985319897, + "learning_rate": 1.62248618676361e-05, + "loss": 0.3345, + "step": 3882 + }, + { + "epoch": 0.30762527233115466, + "grad_norm": 2.412672439022028, + "learning_rate": 1.6222853407285447e-05, + "loss": 0.3246, + "step": 3883 + }, + { + "epoch": 0.30770449593979005, + "grad_norm": 1.8907797274226992, + "learning_rate": 1.622084453718984e-05, + "loss": 0.2687, + "step": 3884 + }, + { + "epoch": 0.3077837195484254, + "grad_norm": 1.9689228976989726, + "learning_rate": 1.621883525748155e-05, + "loss": 0.421, + "step": 3885 + }, + { + "epoch": 0.3078629431570608, + "grad_norm": 1.7495566011511918, + "learning_rate": 1.6216825568292885e-05, + "loss": 0.2069, + "step": 3886 + }, + { + "epoch": 0.3079421667656962, + "grad_norm": 2.6398760013938696, + "learning_rate": 1.6214815469756165e-05, + "loss": 0.3931, + "step": 3887 + }, + { + "epoch": 0.30802139037433157, + "grad_norm": 1.9719950123547463, + "learning_rate": 1.6212804962003757e-05, + "loss": 0.2859, + "step": 3888 + }, + { + "epoch": 0.3081006139829669, + "grad_norm": 1.940671676172438, + "learning_rate": 1.6210794045168033e-05, + "loss": 0.4319, + "step": 3889 + }, + { + "epoch": 0.3081798375916023, + "grad_norm": 2.152809329782499, + "learning_rate": 1.6208782719381403e-05, + "loss": 0.3564, + "step": 3890 + }, + { + "epoch": 0.30825906120023766, + "grad_norm": 1.9770811974953582, + "learning_rate": 1.6206770984776307e-05, + "loss": 0.3088, + "step": 3891 + }, + { + "epoch": 0.30833828480887304, + "grad_norm": 1.9141487175381255, + "learning_rate": 1.620475884148521e-05, + "loss": 0.3333, + "step": 3892 + }, + { + "epoch": 0.3084175084175084, + "grad_norm": 1.7703408511480312, + "learning_rate": 1.6202746289640594e-05, + "loss": 0.2357, + "step": 3893 + }, + { + "epoch": 0.3084967320261438, + "grad_norm": 1.9516593716941046, + "learning_rate": 1.620073332937498e-05, + "loss": 0.3622, + "step": 3894 + }, + { + "epoch": 0.3085759556347792, + "grad_norm": 1.9480693322190266, + "learning_rate": 1.6198719960820917e-05, + "loss": 0.2391, + "step": 3895 + }, + { + "epoch": 0.3086551792434145, + "grad_norm": 1.674640942011194, + "learning_rate": 1.619670618411097e-05, + "loss": 0.3304, + "step": 3896 + }, + { + "epoch": 0.3087344028520499, + "grad_norm": 1.7951846188338794, + "learning_rate": 1.6194691999377736e-05, + "loss": 0.2428, + "step": 3897 + }, + { + "epoch": 0.3088136264606853, + "grad_norm": 2.1378042822837577, + "learning_rate": 1.619267740675384e-05, + "loss": 0.3088, + "step": 3898 + }, + { + "epoch": 0.30889285006932066, + "grad_norm": 1.5547112546564361, + "learning_rate": 1.6190662406371937e-05, + "loss": 0.2087, + "step": 3899 + }, + { + "epoch": 0.30897207367795604, + "grad_norm": 1.9005705215609991, + "learning_rate": 1.6188646998364703e-05, + "loss": 0.3378, + "step": 3900 + }, + { + "epoch": 0.3090512972865914, + "grad_norm": 1.8001390377069098, + "learning_rate": 1.6186631182864835e-05, + "loss": 0.2601, + "step": 3901 + }, + { + "epoch": 0.3091305208952268, + "grad_norm": 2.177327509178617, + "learning_rate": 1.6184614960005078e-05, + "loss": 0.2658, + "step": 3902 + }, + { + "epoch": 0.3092097445038621, + "grad_norm": 2.5743060656250085, + "learning_rate": 1.6182598329918185e-05, + "loss": 0.2972, + "step": 3903 + }, + { + "epoch": 0.3092889681124975, + "grad_norm": 1.9088422755694918, + "learning_rate": 1.6180581292736938e-05, + "loss": 0.2194, + "step": 3904 + }, + { + "epoch": 0.3093681917211329, + "grad_norm": 1.671705802554036, + "learning_rate": 1.617856384859415e-05, + "loss": 0.2853, + "step": 3905 + }, + { + "epoch": 0.30944741532976827, + "grad_norm": 1.7661791742375041, + "learning_rate": 1.6176545997622662e-05, + "loss": 0.1881, + "step": 3906 + }, + { + "epoch": 0.30952663893840365, + "grad_norm": 1.783243911099805, + "learning_rate": 1.6174527739955345e-05, + "loss": 0.3599, + "step": 3907 + }, + { + "epoch": 0.30960586254703903, + "grad_norm": 1.8924379009781485, + "learning_rate": 1.6172509075725084e-05, + "loss": 0.2983, + "step": 3908 + }, + { + "epoch": 0.3096850861556744, + "grad_norm": 1.9509820560545892, + "learning_rate": 1.61704900050648e-05, + "loss": 0.2865, + "step": 3909 + }, + { + "epoch": 0.30976430976430974, + "grad_norm": 2.156195049673379, + "learning_rate": 1.616847052810744e-05, + "loss": 0.4085, + "step": 3910 + }, + { + "epoch": 0.3098435333729451, + "grad_norm": 1.9165157866369644, + "learning_rate": 1.6166450644985975e-05, + "loss": 0.3005, + "step": 3911 + }, + { + "epoch": 0.3099227569815805, + "grad_norm": 1.6751328481165728, + "learning_rate": 1.6164430355833407e-05, + "loss": 0.261, + "step": 3912 + }, + { + "epoch": 0.3100019805902159, + "grad_norm": 1.9158640865268861, + "learning_rate": 1.616240966078276e-05, + "loss": 0.293, + "step": 3913 + }, + { + "epoch": 0.31008120419885127, + "grad_norm": 2.0599838634663152, + "learning_rate": 1.616038855996709e-05, + "loss": 0.2297, + "step": 3914 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 2.4528871772281358, + "learning_rate": 1.6158367053519476e-05, + "loss": 0.2641, + "step": 3915 + }, + { + "epoch": 0.31023965141612203, + "grad_norm": 1.9193754673955372, + "learning_rate": 1.6156345141573022e-05, + "loss": 0.3411, + "step": 3916 + }, + { + "epoch": 0.31031887502475736, + "grad_norm": 2.358475031590979, + "learning_rate": 1.6154322824260865e-05, + "loss": 0.4313, + "step": 3917 + }, + { + "epoch": 0.31039809863339274, + "grad_norm": 2.0051862836262684, + "learning_rate": 1.615230010171616e-05, + "loss": 0.3042, + "step": 3918 + }, + { + "epoch": 0.3104773222420281, + "grad_norm": 1.8257439127886896, + "learning_rate": 1.61502769740721e-05, + "loss": 0.345, + "step": 3919 + }, + { + "epoch": 0.3105565458506635, + "grad_norm": 1.7222303158122967, + "learning_rate": 1.6148253441461887e-05, + "loss": 0.2583, + "step": 3920 + }, + { + "epoch": 0.3106357694592989, + "grad_norm": 1.8354109064706357, + "learning_rate": 1.6146229504018777e-05, + "loss": 0.3348, + "step": 3921 + }, + { + "epoch": 0.31071499306793426, + "grad_norm": 1.9938417547079281, + "learning_rate": 1.6144205161876023e-05, + "loss": 0.3578, + "step": 3922 + }, + { + "epoch": 0.3107942166765696, + "grad_norm": 1.9080262819362557, + "learning_rate": 1.6142180415166926e-05, + "loss": 0.2521, + "step": 3923 + }, + { + "epoch": 0.31087344028520497, + "grad_norm": 2.191058427817248, + "learning_rate": 1.61401552640248e-05, + "loss": 0.3704, + "step": 3924 + }, + { + "epoch": 0.31095266389384035, + "grad_norm": 1.94213503263778, + "learning_rate": 1.6138129708582996e-05, + "loss": 0.2344, + "step": 3925 + }, + { + "epoch": 0.31103188750247573, + "grad_norm": 1.6868327814697108, + "learning_rate": 1.6136103748974885e-05, + "loss": 0.2173, + "step": 3926 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.2202989322696167, + "learning_rate": 1.6134077385333867e-05, + "loss": 0.2989, + "step": 3927 + }, + { + "epoch": 0.3111903347197465, + "grad_norm": 1.6816314287567677, + "learning_rate": 1.613205061779337e-05, + "loss": 0.2768, + "step": 3928 + }, + { + "epoch": 0.3112695583283819, + "grad_norm": 1.9382536324508184, + "learning_rate": 1.6130023446486844e-05, + "loss": 0.3009, + "step": 3929 + }, + { + "epoch": 0.3113487819370172, + "grad_norm": 2.24667938949898, + "learning_rate": 1.612799587154777e-05, + "loss": 0.3958, + "step": 3930 + }, + { + "epoch": 0.3114280055456526, + "grad_norm": 1.7925720803306824, + "learning_rate": 1.6125967893109657e-05, + "loss": 0.2506, + "step": 3931 + }, + { + "epoch": 0.31150722915428797, + "grad_norm": 2.0923067316498587, + "learning_rate": 1.6123939511306028e-05, + "loss": 0.2696, + "step": 3932 + }, + { + "epoch": 0.31158645276292335, + "grad_norm": 1.8367898449378253, + "learning_rate": 1.6121910726270453e-05, + "loss": 0.3175, + "step": 3933 + }, + { + "epoch": 0.31166567637155873, + "grad_norm": 2.1286210925687064, + "learning_rate": 1.6119881538136514e-05, + "loss": 0.3189, + "step": 3934 + }, + { + "epoch": 0.3117448999801941, + "grad_norm": 1.7267476988139265, + "learning_rate": 1.611785194703782e-05, + "loss": 0.287, + "step": 3935 + }, + { + "epoch": 0.3118241235888295, + "grad_norm": 1.7876459912399263, + "learning_rate": 1.6115821953108015e-05, + "loss": 0.2529, + "step": 3936 + }, + { + "epoch": 0.3119033471974648, + "grad_norm": 2.1558750922097656, + "learning_rate": 1.611379155648076e-05, + "loss": 0.3349, + "step": 3937 + }, + { + "epoch": 0.3119825708061002, + "grad_norm": 1.7851203493784387, + "learning_rate": 1.611176075728975e-05, + "loss": 0.257, + "step": 3938 + }, + { + "epoch": 0.3120617944147356, + "grad_norm": 2.283855374530786, + "learning_rate": 1.61097295556687e-05, + "loss": 0.3111, + "step": 3939 + }, + { + "epoch": 0.31214101802337096, + "grad_norm": 1.7939502898337396, + "learning_rate": 1.610769795175136e-05, + "loss": 0.241, + "step": 3940 + }, + { + "epoch": 0.31222024163200635, + "grad_norm": 1.6231241676865509, + "learning_rate": 1.6105665945671497e-05, + "loss": 0.1799, + "step": 3941 + }, + { + "epoch": 0.3122994652406417, + "grad_norm": 2.352700261299547, + "learning_rate": 1.610363353756291e-05, + "loss": 0.3384, + "step": 3942 + }, + { + "epoch": 0.3123786888492771, + "grad_norm": 1.4658020470523758, + "learning_rate": 1.6101600727559423e-05, + "loss": 0.1937, + "step": 3943 + }, + { + "epoch": 0.31245791245791243, + "grad_norm": 2.1586367091409446, + "learning_rate": 1.6099567515794886e-05, + "loss": 0.2896, + "step": 3944 + }, + { + "epoch": 0.3125371360665478, + "grad_norm": 1.837981925044546, + "learning_rate": 1.609753390240318e-05, + "loss": 0.1941, + "step": 3945 + }, + { + "epoch": 0.3126163596751832, + "grad_norm": 1.8579657389698425, + "learning_rate": 1.6095499887518204e-05, + "loss": 0.2319, + "step": 3946 + }, + { + "epoch": 0.3126955832838186, + "grad_norm": 1.8232280004988357, + "learning_rate": 1.6093465471273894e-05, + "loss": 0.374, + "step": 3947 + }, + { + "epoch": 0.31277480689245396, + "grad_norm": 1.8727934165329296, + "learning_rate": 1.60914306538042e-05, + "loss": 0.2962, + "step": 3948 + }, + { + "epoch": 0.31285403050108934, + "grad_norm": 2.5372307999468893, + "learning_rate": 1.6089395435243105e-05, + "loss": 0.4109, + "step": 3949 + }, + { + "epoch": 0.3129332541097247, + "grad_norm": 1.9742757797484873, + "learning_rate": 1.6087359815724623e-05, + "loss": 0.3543, + "step": 3950 + }, + { + "epoch": 0.31301247771836005, + "grad_norm": 1.8996848433813027, + "learning_rate": 1.6085323795382785e-05, + "loss": 0.2593, + "step": 3951 + }, + { + "epoch": 0.31309170132699543, + "grad_norm": 1.8096764407408223, + "learning_rate": 1.608328737435166e-05, + "loss": 0.2937, + "step": 3952 + }, + { + "epoch": 0.3131709249356308, + "grad_norm": 1.9350111810145432, + "learning_rate": 1.608125055276533e-05, + "loss": 0.2628, + "step": 3953 + }, + { + "epoch": 0.3132501485442662, + "grad_norm": 2.1173831719090765, + "learning_rate": 1.607921333075791e-05, + "loss": 0.2997, + "step": 3954 + }, + { + "epoch": 0.3133293721529016, + "grad_norm": 3.1258977917392827, + "learning_rate": 1.607717570846355e-05, + "loss": 0.4694, + "step": 3955 + }, + { + "epoch": 0.31340859576153696, + "grad_norm": 1.9002696716607135, + "learning_rate": 1.6075137686016408e-05, + "loss": 0.2501, + "step": 3956 + }, + { + "epoch": 0.31348781937017234, + "grad_norm": 2.116947415993789, + "learning_rate": 1.6073099263550677e-05, + "loss": 0.2908, + "step": 3957 + }, + { + "epoch": 0.31356704297880766, + "grad_norm": 1.8899645168620067, + "learning_rate": 1.6071060441200587e-05, + "loss": 0.2369, + "step": 3958 + }, + { + "epoch": 0.31364626658744305, + "grad_norm": 2.308064173911316, + "learning_rate": 1.6069021219100375e-05, + "loss": 0.3566, + "step": 3959 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 1.7959416113274134, + "learning_rate": 1.606698159738432e-05, + "loss": 0.2439, + "step": 3960 + }, + { + "epoch": 0.3138047138047138, + "grad_norm": 1.8275297598982878, + "learning_rate": 1.606494157618672e-05, + "loss": 0.3451, + "step": 3961 + }, + { + "epoch": 0.3138839374133492, + "grad_norm": 1.5524191180813574, + "learning_rate": 1.60629011556419e-05, + "loss": 0.2685, + "step": 3962 + }, + { + "epoch": 0.31396316102198457, + "grad_norm": 1.9748588124929398, + "learning_rate": 1.6060860335884208e-05, + "loss": 0.2453, + "step": 3963 + }, + { + "epoch": 0.3140423846306199, + "grad_norm": 2.053769344164576, + "learning_rate": 1.605881911704803e-05, + "loss": 0.3235, + "step": 3964 + }, + { + "epoch": 0.3141216082392553, + "grad_norm": 2.224374890468355, + "learning_rate": 1.6056777499267764e-05, + "loss": 0.3611, + "step": 3965 + }, + { + "epoch": 0.31420083184789066, + "grad_norm": 1.789999684107788, + "learning_rate": 1.6054735482677842e-05, + "loss": 0.3265, + "step": 3966 + }, + { + "epoch": 0.31428005545652604, + "grad_norm": 1.798907961784022, + "learning_rate": 1.6052693067412724e-05, + "loss": 0.2314, + "step": 3967 + }, + { + "epoch": 0.3143592790651614, + "grad_norm": 1.6865712013244745, + "learning_rate": 1.605065025360689e-05, + "loss": 0.2424, + "step": 3968 + }, + { + "epoch": 0.3144385026737968, + "grad_norm": 1.8333734531802057, + "learning_rate": 1.6048607041394856e-05, + "loss": 0.2395, + "step": 3969 + }, + { + "epoch": 0.3145177262824322, + "grad_norm": 1.7631459514094503, + "learning_rate": 1.6046563430911148e-05, + "loss": 0.2464, + "step": 3970 + }, + { + "epoch": 0.3145969498910675, + "grad_norm": 2.3856367608051667, + "learning_rate": 1.6044519422290333e-05, + "loss": 0.3064, + "step": 3971 + }, + { + "epoch": 0.3146761734997029, + "grad_norm": 1.7766065061498073, + "learning_rate": 1.6042475015666995e-05, + "loss": 0.2692, + "step": 3972 + }, + { + "epoch": 0.3147553971083383, + "grad_norm": 1.6779049584215282, + "learning_rate": 1.604043021117575e-05, + "loss": 0.2584, + "step": 3973 + }, + { + "epoch": 0.31483462071697366, + "grad_norm": 1.8893508351771968, + "learning_rate": 1.603838500895125e-05, + "loss": 0.2939, + "step": 3974 + }, + { + "epoch": 0.31491384432560904, + "grad_norm": 2.3084413715650163, + "learning_rate": 1.6036339409128146e-05, + "loss": 0.3975, + "step": 3975 + }, + { + "epoch": 0.3149930679342444, + "grad_norm": 2.1433616849251456, + "learning_rate": 1.603429341184114e-05, + "loss": 0.2674, + "step": 3976 + }, + { + "epoch": 0.3150722915428798, + "grad_norm": 2.230723648052661, + "learning_rate": 1.6032247017224944e-05, + "loss": 0.2977, + "step": 3977 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 1.9179401943762864, + "learning_rate": 1.603020022541431e-05, + "loss": 0.211, + "step": 3978 + }, + { + "epoch": 0.3152307387601505, + "grad_norm": 1.9089046256172308, + "learning_rate": 1.6028153036544005e-05, + "loss": 0.34, + "step": 3979 + }, + { + "epoch": 0.3153099623687859, + "grad_norm": 1.9960431333738415, + "learning_rate": 1.6026105450748826e-05, + "loss": 0.3704, + "step": 3980 + }, + { + "epoch": 0.31538918597742127, + "grad_norm": 2.4076080649003684, + "learning_rate": 1.6024057468163604e-05, + "loss": 0.1812, + "step": 3981 + }, + { + "epoch": 0.31546840958605665, + "grad_norm": 1.8427481180961933, + "learning_rate": 1.602200908892318e-05, + "loss": 0.248, + "step": 3982 + }, + { + "epoch": 0.31554763319469203, + "grad_norm": 2.1798891995769956, + "learning_rate": 1.6019960313162436e-05, + "loss": 0.2334, + "step": 3983 + }, + { + "epoch": 0.3156268568033274, + "grad_norm": 1.974442811407452, + "learning_rate": 1.601791114101627e-05, + "loss": 0.2711, + "step": 3984 + }, + { + "epoch": 0.31570608041196274, + "grad_norm": 2.175938830014821, + "learning_rate": 1.6015861572619612e-05, + "loss": 0.3338, + "step": 3985 + }, + { + "epoch": 0.3157853040205981, + "grad_norm": 2.6222181513096694, + "learning_rate": 1.6013811608107415e-05, + "loss": 0.3487, + "step": 3986 + }, + { + "epoch": 0.3158645276292335, + "grad_norm": 2.20789515760567, + "learning_rate": 1.6011761247614664e-05, + "loss": 0.2957, + "step": 3987 + }, + { + "epoch": 0.3159437512378689, + "grad_norm": 1.8943652597393255, + "learning_rate": 1.600971049127636e-05, + "loss": 0.2472, + "step": 3988 + }, + { + "epoch": 0.31602297484650427, + "grad_norm": 2.024566237756117, + "learning_rate": 1.6007659339227534e-05, + "loss": 0.2642, + "step": 3989 + }, + { + "epoch": 0.31610219845513965, + "grad_norm": 1.944313898948442, + "learning_rate": 1.6005607791603247e-05, + "loss": 0.2307, + "step": 3990 + }, + { + "epoch": 0.31618142206377503, + "grad_norm": 1.9427822102582424, + "learning_rate": 1.6003555848538586e-05, + "loss": 0.3385, + "step": 3991 + }, + { + "epoch": 0.31626064567241036, + "grad_norm": 4.066136665150959, + "learning_rate": 1.600150351016866e-05, + "loss": 0.2736, + "step": 3992 + }, + { + "epoch": 0.31633986928104574, + "grad_norm": 1.92154477044616, + "learning_rate": 1.5999450776628607e-05, + "loss": 0.3779, + "step": 3993 + }, + { + "epoch": 0.3164190928896811, + "grad_norm": 1.9335814823070883, + "learning_rate": 1.5997397648053587e-05, + "loss": 0.3789, + "step": 3994 + }, + { + "epoch": 0.3164983164983165, + "grad_norm": 1.728191793993343, + "learning_rate": 1.599534412457879e-05, + "loss": 0.2616, + "step": 3995 + }, + { + "epoch": 0.3165775401069519, + "grad_norm": 2.81413102449859, + "learning_rate": 1.5993290206339426e-05, + "loss": 0.3215, + "step": 3996 + }, + { + "epoch": 0.31665676371558726, + "grad_norm": 2.3641176582933587, + "learning_rate": 1.5991235893470745e-05, + "loss": 0.3937, + "step": 3997 + }, + { + "epoch": 0.31673598732422265, + "grad_norm": 1.9101103998982618, + "learning_rate": 1.5989181186108003e-05, + "loss": 0.3124, + "step": 3998 + }, + { + "epoch": 0.31681521093285797, + "grad_norm": 1.7561473454114287, + "learning_rate": 1.59871260843865e-05, + "loss": 0.2864, + "step": 3999 + }, + { + "epoch": 0.31689443454149335, + "grad_norm": 1.8793343555809265, + "learning_rate": 1.5985070588441556e-05, + "loss": 0.3111, + "step": 4000 + }, + { + "epoch": 0.31697365815012873, + "grad_norm": 1.7717228214469674, + "learning_rate": 1.598301469840851e-05, + "loss": 0.3065, + "step": 4001 + }, + { + "epoch": 0.3170528817587641, + "grad_norm": 2.151955626117451, + "learning_rate": 1.598095841442273e-05, + "loss": 0.2503, + "step": 4002 + }, + { + "epoch": 0.3171321053673995, + "grad_norm": 1.9747632321716406, + "learning_rate": 1.5978901736619624e-05, + "loss": 0.2794, + "step": 4003 + }, + { + "epoch": 0.3172113289760349, + "grad_norm": 1.90056694755564, + "learning_rate": 1.5976844665134607e-05, + "loss": 0.3375, + "step": 4004 + }, + { + "epoch": 0.3172905525846702, + "grad_norm": 1.8978748209876632, + "learning_rate": 1.5974787200103124e-05, + "loss": 0.3145, + "step": 4005 + }, + { + "epoch": 0.3173697761933056, + "grad_norm": 2.1567993648396797, + "learning_rate": 1.5972729341660653e-05, + "loss": 0.396, + "step": 4006 + }, + { + "epoch": 0.31744899980194097, + "grad_norm": 1.9267914672126427, + "learning_rate": 1.597067108994269e-05, + "loss": 0.2758, + "step": 4007 + }, + { + "epoch": 0.31752822341057635, + "grad_norm": 1.5611573646467702, + "learning_rate": 1.5968612445084773e-05, + "loss": 0.2299, + "step": 4008 + }, + { + "epoch": 0.31760744701921173, + "grad_norm": 2.361271594463766, + "learning_rate": 1.596655340722244e-05, + "loss": 0.3738, + "step": 4009 + }, + { + "epoch": 0.3176866706278471, + "grad_norm": 2.068419813892609, + "learning_rate": 1.5964493976491278e-05, + "loss": 0.2954, + "step": 4010 + }, + { + "epoch": 0.3177658942364825, + "grad_norm": 1.870546204783706, + "learning_rate": 1.5962434153026884e-05, + "loss": 0.2908, + "step": 4011 + }, + { + "epoch": 0.3178451178451178, + "grad_norm": 1.9475016898043347, + "learning_rate": 1.596037393696489e-05, + "loss": 0.3117, + "step": 4012 + }, + { + "epoch": 0.3179243414537532, + "grad_norm": 2.2225880283071118, + "learning_rate": 1.5958313328440954e-05, + "loss": 0.2465, + "step": 4013 + }, + { + "epoch": 0.3180035650623886, + "grad_norm": 2.4470109336711148, + "learning_rate": 1.595625232759076e-05, + "loss": 0.5933, + "step": 4014 + }, + { + "epoch": 0.31808278867102396, + "grad_norm": 1.3762857094268033, + "learning_rate": 1.595419093455e-05, + "loss": 0.2015, + "step": 4015 + }, + { + "epoch": 0.31816201227965935, + "grad_norm": 2.3033690119405095, + "learning_rate": 1.5952129149454422e-05, + "loss": 0.4467, + "step": 4016 + }, + { + "epoch": 0.3182412358882947, + "grad_norm": 1.7167589765461668, + "learning_rate": 1.595006697243978e-05, + "loss": 0.2681, + "step": 4017 + }, + { + "epoch": 0.3183204594969301, + "grad_norm": 1.9150054326315642, + "learning_rate": 1.5948004403641853e-05, + "loss": 0.298, + "step": 4018 + }, + { + "epoch": 0.31839968310556543, + "grad_norm": 2.3981098079412804, + "learning_rate": 1.594594144319646e-05, + "loss": 0.3773, + "step": 4019 + }, + { + "epoch": 0.3184789067142008, + "grad_norm": 2.0250177617565144, + "learning_rate": 1.594387809123943e-05, + "loss": 0.2536, + "step": 4020 + }, + { + "epoch": 0.3185581303228362, + "grad_norm": 1.9640868143463204, + "learning_rate": 1.594181434790663e-05, + "loss": 0.2815, + "step": 4021 + }, + { + "epoch": 0.3186373539314716, + "grad_norm": 2.139947631377795, + "learning_rate": 1.5939750213333948e-05, + "loss": 0.3134, + "step": 4022 + }, + { + "epoch": 0.31871657754010696, + "grad_norm": 1.801002495716628, + "learning_rate": 1.593768568765729e-05, + "loss": 0.2418, + "step": 4023 + }, + { + "epoch": 0.31879580114874234, + "grad_norm": 1.8599436405834358, + "learning_rate": 1.5935620771012603e-05, + "loss": 0.3237, + "step": 4024 + }, + { + "epoch": 0.3188750247573777, + "grad_norm": 2.0256314102735753, + "learning_rate": 1.5933555463535846e-05, + "loss": 0.2543, + "step": 4025 + }, + { + "epoch": 0.31895424836601305, + "grad_norm": 2.052342543353026, + "learning_rate": 1.5931489765363014e-05, + "loss": 0.2433, + "step": 4026 + }, + { + "epoch": 0.31903347197464843, + "grad_norm": 2.5196943939498806, + "learning_rate": 1.592942367663012e-05, + "loss": 0.3964, + "step": 4027 + }, + { + "epoch": 0.3191126955832838, + "grad_norm": 2.8422374135487796, + "learning_rate": 1.5927357197473207e-05, + "loss": 0.3734, + "step": 4028 + }, + { + "epoch": 0.3191919191919192, + "grad_norm": 2.126178226411484, + "learning_rate": 1.5925290328028346e-05, + "loss": 0.2419, + "step": 4029 + }, + { + "epoch": 0.3192711428005546, + "grad_norm": 2.110679173449746, + "learning_rate": 1.5923223068431626e-05, + "loss": 0.3075, + "step": 4030 + }, + { + "epoch": 0.31935036640918996, + "grad_norm": 2.0639564225578164, + "learning_rate": 1.592115541881917e-05, + "loss": 0.3479, + "step": 4031 + }, + { + "epoch": 0.31942959001782534, + "grad_norm": 2.371948417427405, + "learning_rate": 1.5919087379327116e-05, + "loss": 0.3406, + "step": 4032 + }, + { + "epoch": 0.31950881362646066, + "grad_norm": 1.6725567352261204, + "learning_rate": 1.5917018950091642e-05, + "loss": 0.2522, + "step": 4033 + }, + { + "epoch": 0.31958803723509605, + "grad_norm": 1.8444118252975636, + "learning_rate": 1.591495013124894e-05, + "loss": 0.3204, + "step": 4034 + }, + { + "epoch": 0.3196672608437314, + "grad_norm": 2.122041391801781, + "learning_rate": 1.591288092293523e-05, + "loss": 0.3741, + "step": 4035 + }, + { + "epoch": 0.3197464844523668, + "grad_norm": 2.1362290050951946, + "learning_rate": 1.5910811325286768e-05, + "loss": 0.4184, + "step": 4036 + }, + { + "epoch": 0.3198257080610022, + "grad_norm": 1.6414471857008746, + "learning_rate": 1.5908741338439818e-05, + "loss": 0.237, + "step": 4037 + }, + { + "epoch": 0.31990493166963757, + "grad_norm": 2.49435095596472, + "learning_rate": 1.5906670962530683e-05, + "loss": 0.3213, + "step": 4038 + }, + { + "epoch": 0.31998415527827295, + "grad_norm": 1.6601537654362195, + "learning_rate": 1.5904600197695684e-05, + "loss": 0.2623, + "step": 4039 + }, + { + "epoch": 0.3200633788869083, + "grad_norm": 1.944171958200773, + "learning_rate": 1.5902529044071173e-05, + "loss": 0.3613, + "step": 4040 + }, + { + "epoch": 0.32014260249554366, + "grad_norm": 2.0236805023828808, + "learning_rate": 1.590045750179353e-05, + "loss": 0.3689, + "step": 4041 + }, + { + "epoch": 0.32022182610417904, + "grad_norm": 1.6275474590223904, + "learning_rate": 1.5898385570999146e-05, + "loss": 0.2395, + "step": 4042 + }, + { + "epoch": 0.3203010497128144, + "grad_norm": 2.2014302463912436, + "learning_rate": 1.589631325182446e-05, + "loss": 0.3119, + "step": 4043 + }, + { + "epoch": 0.3203802733214498, + "grad_norm": 2.43230973569851, + "learning_rate": 1.589424054440591e-05, + "loss": 0.2708, + "step": 4044 + }, + { + "epoch": 0.3204594969300852, + "grad_norm": 1.681017091662089, + "learning_rate": 1.5892167448879984e-05, + "loss": 0.2336, + "step": 4045 + }, + { + "epoch": 0.3205387205387205, + "grad_norm": 2.0078855698082174, + "learning_rate": 1.5890093965383186e-05, + "loss": 0.3314, + "step": 4046 + }, + { + "epoch": 0.3206179441473559, + "grad_norm": 2.049858088648156, + "learning_rate": 1.588802009405204e-05, + "loss": 0.3397, + "step": 4047 + }, + { + "epoch": 0.3206971677559913, + "grad_norm": 2.1167523309896317, + "learning_rate": 1.5885945835023104e-05, + "loss": 0.4135, + "step": 4048 + }, + { + "epoch": 0.32077639136462666, + "grad_norm": 2.0031193597246815, + "learning_rate": 1.5883871188432955e-05, + "loss": 0.3587, + "step": 4049 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 1.762456007263147, + "learning_rate": 1.5881796154418196e-05, + "loss": 0.2377, + "step": 4050 + }, + { + "epoch": 0.3209348385818974, + "grad_norm": 2.2895347703732685, + "learning_rate": 1.5879720733115464e-05, + "loss": 0.2599, + "step": 4051 + }, + { + "epoch": 0.3210140621905328, + "grad_norm": 1.8329954948344045, + "learning_rate": 1.5877644924661412e-05, + "loss": 0.248, + "step": 4052 + }, + { + "epoch": 0.3210932857991681, + "grad_norm": 1.9270899962526025, + "learning_rate": 1.5875568729192728e-05, + "loss": 0.227, + "step": 4053 + }, + { + "epoch": 0.3211725094078035, + "grad_norm": 1.6463588484945015, + "learning_rate": 1.587349214684611e-05, + "loss": 0.244, + "step": 4054 + }, + { + "epoch": 0.3212517330164389, + "grad_norm": 1.943624732892277, + "learning_rate": 1.5871415177758297e-05, + "loss": 0.2808, + "step": 4055 + }, + { + "epoch": 0.32133095662507427, + "grad_norm": 2.0175144306246535, + "learning_rate": 1.5869337822066043e-05, + "loss": 0.3496, + "step": 4056 + }, + { + "epoch": 0.32141018023370965, + "grad_norm": 2.0645349013474847, + "learning_rate": 1.586726007990614e-05, + "loss": 0.277, + "step": 4057 + }, + { + "epoch": 0.32148940384234503, + "grad_norm": 1.9668490925635078, + "learning_rate": 1.586518195141539e-05, + "loss": 0.246, + "step": 4058 + }, + { + "epoch": 0.3215686274509804, + "grad_norm": 2.076758760248237, + "learning_rate": 1.5863103436730627e-05, + "loss": 0.2806, + "step": 4059 + }, + { + "epoch": 0.32164785105961574, + "grad_norm": 1.8178453858181338, + "learning_rate": 1.586102453598872e-05, + "loss": 0.2752, + "step": 4060 + }, + { + "epoch": 0.3217270746682511, + "grad_norm": 1.8722637282137147, + "learning_rate": 1.5858945249326545e-05, + "loss": 0.3395, + "step": 4061 + }, + { + "epoch": 0.3218062982768865, + "grad_norm": 2.526002572404788, + "learning_rate": 1.5856865576881016e-05, + "loss": 0.342, + "step": 4062 + }, + { + "epoch": 0.3218855218855219, + "grad_norm": 2.6755086579777267, + "learning_rate": 1.5854785518789074e-05, + "loss": 0.3573, + "step": 4063 + }, + { + "epoch": 0.32196474549415727, + "grad_norm": 1.5681079518616239, + "learning_rate": 1.5852705075187674e-05, + "loss": 0.2815, + "step": 4064 + }, + { + "epoch": 0.32204396910279265, + "grad_norm": 1.7370720057240607, + "learning_rate": 1.5850624246213805e-05, + "loss": 0.3012, + "step": 4065 + }, + { + "epoch": 0.32212319271142803, + "grad_norm": 1.8082971477500807, + "learning_rate": 1.5848543032004483e-05, + "loss": 0.3721, + "step": 4066 + }, + { + "epoch": 0.32220241632006336, + "grad_norm": 2.1193927527194285, + "learning_rate": 1.5846461432696744e-05, + "loss": 0.2599, + "step": 4067 + }, + { + "epoch": 0.32228163992869874, + "grad_norm": 2.2325038799031134, + "learning_rate": 1.5844379448427648e-05, + "loss": 0.4903, + "step": 4068 + }, + { + "epoch": 0.3223608635373341, + "grad_norm": 1.392150212968567, + "learning_rate": 1.5842297079334293e-05, + "loss": 0.1691, + "step": 4069 + }, + { + "epoch": 0.3224400871459695, + "grad_norm": 1.8015553694689401, + "learning_rate": 1.5840214325553782e-05, + "loss": 0.2973, + "step": 4070 + }, + { + "epoch": 0.3225193107546049, + "grad_norm": 2.1733442007276733, + "learning_rate": 1.583813118722326e-05, + "loss": 0.3981, + "step": 4071 + }, + { + "epoch": 0.32259853436324026, + "grad_norm": 2.614894611398543, + "learning_rate": 1.583604766447989e-05, + "loss": 0.3827, + "step": 4072 + }, + { + "epoch": 0.32267775797187565, + "grad_norm": 2.119499121308234, + "learning_rate": 1.5833963757460863e-05, + "loss": 0.2996, + "step": 4073 + }, + { + "epoch": 0.32275698158051097, + "grad_norm": 2.037985274175979, + "learning_rate": 1.5831879466303393e-05, + "loss": 0.276, + "step": 4074 + }, + { + "epoch": 0.32283620518914635, + "grad_norm": 2.723679634710538, + "learning_rate": 1.5829794791144723e-05, + "loss": 0.3793, + "step": 4075 + }, + { + "epoch": 0.32291542879778173, + "grad_norm": 1.9970506313199945, + "learning_rate": 1.5827709732122115e-05, + "loss": 0.3441, + "step": 4076 + }, + { + "epoch": 0.3229946524064171, + "grad_norm": 1.9622586147111902, + "learning_rate": 1.5825624289372864e-05, + "loss": 0.3718, + "step": 4077 + }, + { + "epoch": 0.3230738760150525, + "grad_norm": 2.143431287772388, + "learning_rate": 1.5823538463034283e-05, + "loss": 0.2206, + "step": 4078 + }, + { + "epoch": 0.3231530996236879, + "grad_norm": 1.903792384129505, + "learning_rate": 1.5821452253243718e-05, + "loss": 0.3107, + "step": 4079 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 1.9027351850903347, + "learning_rate": 1.581936566013853e-05, + "loss": 0.4254, + "step": 4080 + }, + { + "epoch": 0.3233115468409586, + "grad_norm": 1.740506514681122, + "learning_rate": 1.5817278683856117e-05, + "loss": 0.2343, + "step": 4081 + }, + { + "epoch": 0.32339077044959397, + "grad_norm": 2.0469800874352138, + "learning_rate": 1.5815191324533893e-05, + "loss": 0.2973, + "step": 4082 + }, + { + "epoch": 0.32346999405822935, + "grad_norm": 2.2171681165223487, + "learning_rate": 1.58131035823093e-05, + "loss": 0.3047, + "step": 4083 + }, + { + "epoch": 0.32354921766686473, + "grad_norm": 1.689756820326841, + "learning_rate": 1.581101545731981e-05, + "loss": 0.2921, + "step": 4084 + }, + { + "epoch": 0.3236284412755001, + "grad_norm": 2.3031657463495305, + "learning_rate": 1.580892694970291e-05, + "loss": 0.3372, + "step": 4085 + }, + { + "epoch": 0.3237076648841355, + "grad_norm": 1.7919402536648075, + "learning_rate": 1.580683805959612e-05, + "loss": 0.2199, + "step": 4086 + }, + { + "epoch": 0.3237868884927708, + "grad_norm": 1.5639764339074862, + "learning_rate": 1.5804748787136987e-05, + "loss": 0.2603, + "step": 4087 + }, + { + "epoch": 0.3238661121014062, + "grad_norm": 1.6868211992951307, + "learning_rate": 1.5802659132463076e-05, + "loss": 0.2501, + "step": 4088 + }, + { + "epoch": 0.3239453357100416, + "grad_norm": 1.769418634947462, + "learning_rate": 1.5800569095711983e-05, + "loss": 0.1865, + "step": 4089 + }, + { + "epoch": 0.32402455931867696, + "grad_norm": 2.0179952751308132, + "learning_rate": 1.5798478677021327e-05, + "loss": 0.3892, + "step": 4090 + }, + { + "epoch": 0.32410378292731234, + "grad_norm": 2.078649795694238, + "learning_rate": 1.5796387876528746e-05, + "loss": 0.368, + "step": 4091 + }, + { + "epoch": 0.3241830065359477, + "grad_norm": 2.6619235752115444, + "learning_rate": 1.579429669437192e-05, + "loss": 0.2971, + "step": 4092 + }, + { + "epoch": 0.3242622301445831, + "grad_norm": 2.2926714265516663, + "learning_rate": 1.579220513068853e-05, + "loss": 0.332, + "step": 4093 + }, + { + "epoch": 0.32434145375321843, + "grad_norm": 1.8536661196517776, + "learning_rate": 1.5790113185616305e-05, + "loss": 0.2657, + "step": 4094 + }, + { + "epoch": 0.3244206773618538, + "grad_norm": 1.9288459995228031, + "learning_rate": 1.5788020859292987e-05, + "loss": 0.2645, + "step": 4095 + }, + { + "epoch": 0.3244999009704892, + "grad_norm": 1.8808167657942876, + "learning_rate": 1.5785928151856345e-05, + "loss": 0.2979, + "step": 4096 + }, + { + "epoch": 0.3245791245791246, + "grad_norm": 1.412144293332204, + "learning_rate": 1.5783835063444176e-05, + "loss": 0.2129, + "step": 4097 + }, + { + "epoch": 0.32465834818775996, + "grad_norm": 1.9839658100546034, + "learning_rate": 1.57817415941943e-05, + "loss": 0.3353, + "step": 4098 + }, + { + "epoch": 0.32473757179639534, + "grad_norm": 1.8842000444305722, + "learning_rate": 1.5779647744244556e-05, + "loss": 0.3133, + "step": 4099 + }, + { + "epoch": 0.3248167954050307, + "grad_norm": 1.9080653898707225, + "learning_rate": 1.577755351373282e-05, + "loss": 0.3901, + "step": 4100 + }, + { + "epoch": 0.32489601901366605, + "grad_norm": 1.7806153644427043, + "learning_rate": 1.5775458902796982e-05, + "loss": 0.2714, + "step": 4101 + }, + { + "epoch": 0.32497524262230143, + "grad_norm": 1.7365063142102544, + "learning_rate": 1.577336391157497e-05, + "loss": 0.357, + "step": 4102 + }, + { + "epoch": 0.3250544662309368, + "grad_norm": 2.1679652293564127, + "learning_rate": 1.5771268540204724e-05, + "loss": 0.3341, + "step": 4103 + }, + { + "epoch": 0.3251336898395722, + "grad_norm": 1.5722481058956101, + "learning_rate": 1.576917278882421e-05, + "loss": 0.229, + "step": 4104 + }, + { + "epoch": 0.3252129134482076, + "grad_norm": 1.844090865812453, + "learning_rate": 1.576707665757143e-05, + "loss": 0.3012, + "step": 4105 + }, + { + "epoch": 0.32529213705684296, + "grad_norm": 1.9913564865988536, + "learning_rate": 1.5764980146584402e-05, + "loss": 0.2775, + "step": 4106 + }, + { + "epoch": 0.32537136066547834, + "grad_norm": 1.8636738078139399, + "learning_rate": 1.5762883256001168e-05, + "loss": 0.273, + "step": 4107 + }, + { + "epoch": 0.32545058427411366, + "grad_norm": 1.630847762775362, + "learning_rate": 1.57607859859598e-05, + "loss": 0.2966, + "step": 4108 + }, + { + "epoch": 0.32552980788274904, + "grad_norm": 2.253427638357135, + "learning_rate": 1.5758688336598397e-05, + "loss": 0.3349, + "step": 4109 + }, + { + "epoch": 0.3256090314913844, + "grad_norm": 1.744316884058336, + "learning_rate": 1.5756590308055075e-05, + "loss": 0.2538, + "step": 4110 + }, + { + "epoch": 0.3256882551000198, + "grad_norm": 2.0056713052688036, + "learning_rate": 1.5754491900467982e-05, + "loss": 0.3635, + "step": 4111 + }, + { + "epoch": 0.3257674787086552, + "grad_norm": 2.0928956574864035, + "learning_rate": 1.5752393113975282e-05, + "loss": 0.3495, + "step": 4112 + }, + { + "epoch": 0.32584670231729057, + "grad_norm": 1.9696345383364964, + "learning_rate": 1.5750293948715178e-05, + "loss": 0.2408, + "step": 4113 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 1.7039144209183519, + "learning_rate": 1.5748194404825885e-05, + "loss": 0.3456, + "step": 4114 + }, + { + "epoch": 0.3260051495345613, + "grad_norm": 1.750849112988832, + "learning_rate": 1.574609448244565e-05, + "loss": 0.267, + "step": 4115 + }, + { + "epoch": 0.32608437314319666, + "grad_norm": 1.5537283801467563, + "learning_rate": 1.574399418171274e-05, + "loss": 0.2862, + "step": 4116 + }, + { + "epoch": 0.32616359675183204, + "grad_norm": 2.4039450094435617, + "learning_rate": 1.5741893502765452e-05, + "loss": 0.369, + "step": 4117 + }, + { + "epoch": 0.3262428203604674, + "grad_norm": 1.7095700085066374, + "learning_rate": 1.5739792445742103e-05, + "loss": 0.2441, + "step": 4118 + }, + { + "epoch": 0.3263220439691028, + "grad_norm": 1.914552416246257, + "learning_rate": 1.573769101078104e-05, + "loss": 0.2993, + "step": 4119 + }, + { + "epoch": 0.3264012675777382, + "grad_norm": 2.075327053752495, + "learning_rate": 1.573558919802064e-05, + "loss": 0.2773, + "step": 4120 + }, + { + "epoch": 0.32648049118637357, + "grad_norm": 1.8089891261128201, + "learning_rate": 1.573348700759928e-05, + "loss": 0.2803, + "step": 4121 + }, + { + "epoch": 0.3265597147950089, + "grad_norm": 2.7520214383524437, + "learning_rate": 1.573138443965539e-05, + "loss": 0.3152, + "step": 4122 + }, + { + "epoch": 0.3266389384036443, + "grad_norm": 2.0706597457899774, + "learning_rate": 1.572928149432741e-05, + "loss": 0.3849, + "step": 4123 + }, + { + "epoch": 0.32671816201227966, + "grad_norm": 1.7301278473448791, + "learning_rate": 1.5727178171753817e-05, + "loss": 0.2783, + "step": 4124 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 2.1039699424191656, + "learning_rate": 1.57250744720731e-05, + "loss": 0.2097, + "step": 4125 + }, + { + "epoch": 0.3268766092295504, + "grad_norm": 2.2591565211240154, + "learning_rate": 1.572297039542377e-05, + "loss": 0.3362, + "step": 4126 + }, + { + "epoch": 0.3269558328381858, + "grad_norm": 1.926279140824297, + "learning_rate": 1.572086594194438e-05, + "loss": 0.3088, + "step": 4127 + }, + { + "epoch": 0.3270350564468211, + "grad_norm": 1.8434933979819819, + "learning_rate": 1.571876111177349e-05, + "loss": 0.3095, + "step": 4128 + }, + { + "epoch": 0.3271142800554565, + "grad_norm": 1.6554772085052598, + "learning_rate": 1.571665590504971e-05, + "loss": 0.2262, + "step": 4129 + }, + { + "epoch": 0.3271935036640919, + "grad_norm": 1.7829938502914777, + "learning_rate": 1.5714550321911636e-05, + "loss": 0.2941, + "step": 4130 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 1.8118777365841372, + "learning_rate": 1.5712444362497917e-05, + "loss": 0.2147, + "step": 4131 + }, + { + "epoch": 0.32735195088136265, + "grad_norm": 2.1045371624305473, + "learning_rate": 1.5710338026947227e-05, + "loss": 0.3798, + "step": 4132 + }, + { + "epoch": 0.32743117448999803, + "grad_norm": 1.6913427320247267, + "learning_rate": 1.5708231315398255e-05, + "loss": 0.261, + "step": 4133 + }, + { + "epoch": 0.3275103980986334, + "grad_norm": 1.755451253773995, + "learning_rate": 1.570612422798972e-05, + "loss": 0.2415, + "step": 4134 + }, + { + "epoch": 0.32758962170726874, + "grad_norm": 1.9250741575553094, + "learning_rate": 1.5704016764860358e-05, + "loss": 0.2959, + "step": 4135 + }, + { + "epoch": 0.3276688453159041, + "grad_norm": 1.6107620157410916, + "learning_rate": 1.5701908926148933e-05, + "loss": 0.2067, + "step": 4136 + }, + { + "epoch": 0.3277480689245395, + "grad_norm": 1.9702381507118036, + "learning_rate": 1.5699800711994247e-05, + "loss": 0.2861, + "step": 4137 + }, + { + "epoch": 0.3278272925331749, + "grad_norm": 2.1229368030089724, + "learning_rate": 1.569769212253511e-05, + "loss": 0.2939, + "step": 4138 + }, + { + "epoch": 0.32790651614181027, + "grad_norm": 1.893017523021293, + "learning_rate": 1.569558315791036e-05, + "loss": 0.3275, + "step": 4139 + }, + { + "epoch": 0.32798573975044565, + "grad_norm": 1.6485804663534345, + "learning_rate": 1.5693473818258866e-05, + "loss": 0.2983, + "step": 4140 + }, + { + "epoch": 0.32806496335908103, + "grad_norm": 2.0349170447246565, + "learning_rate": 1.5691364103719515e-05, + "loss": 0.3413, + "step": 4141 + }, + { + "epoch": 0.32814418696771636, + "grad_norm": 2.176856751640671, + "learning_rate": 1.5689254014431225e-05, + "loss": 0.3351, + "step": 4142 + }, + { + "epoch": 0.32822341057635174, + "grad_norm": 1.906142663821326, + "learning_rate": 1.5687143550532932e-05, + "loss": 0.2591, + "step": 4143 + }, + { + "epoch": 0.3283026341849871, + "grad_norm": 2.5868170438887073, + "learning_rate": 1.56850327121636e-05, + "loss": 0.2514, + "step": 4144 + }, + { + "epoch": 0.3283818577936225, + "grad_norm": 2.058362372247087, + "learning_rate": 1.568292149946222e-05, + "loss": 0.3288, + "step": 4145 + }, + { + "epoch": 0.3284610814022579, + "grad_norm": 2.034689932872934, + "learning_rate": 1.56808099125678e-05, + "loss": 0.2298, + "step": 4146 + }, + { + "epoch": 0.32854030501089326, + "grad_norm": 1.893909594325405, + "learning_rate": 1.5678697951619386e-05, + "loss": 0.2916, + "step": 4147 + }, + { + "epoch": 0.32861952861952864, + "grad_norm": 2.1359174516830683, + "learning_rate": 1.5676585616756037e-05, + "loss": 0.3771, + "step": 4148 + }, + { + "epoch": 0.32869875222816397, + "grad_norm": 2.2077493460693307, + "learning_rate": 1.5674472908116834e-05, + "loss": 0.3048, + "step": 4149 + }, + { + "epoch": 0.32877797583679935, + "grad_norm": 1.8999387749329557, + "learning_rate": 1.5672359825840895e-05, + "loss": 0.2521, + "step": 4150 + }, + { + "epoch": 0.32885719944543473, + "grad_norm": 1.9518996024157405, + "learning_rate": 1.567024637006736e-05, + "loss": 0.2551, + "step": 4151 + }, + { + "epoch": 0.3289364230540701, + "grad_norm": 1.9548221225523328, + "learning_rate": 1.566813254093538e-05, + "loss": 0.3271, + "step": 4152 + }, + { + "epoch": 0.3290156466627055, + "grad_norm": 1.7712907745007795, + "learning_rate": 1.566601833858415e-05, + "loss": 0.283, + "step": 4153 + }, + { + "epoch": 0.3290948702713409, + "grad_norm": 1.9249634518316, + "learning_rate": 1.566390376315287e-05, + "loss": 0.2552, + "step": 4154 + }, + { + "epoch": 0.32917409387997626, + "grad_norm": 2.1857893391887737, + "learning_rate": 1.5661788814780782e-05, + "loss": 0.3248, + "step": 4155 + }, + { + "epoch": 0.3292533174886116, + "grad_norm": 1.7952313859059046, + "learning_rate": 1.5659673493607144e-05, + "loss": 0.2421, + "step": 4156 + }, + { + "epoch": 0.32933254109724697, + "grad_norm": 2.284554172395247, + "learning_rate": 1.565755779977124e-05, + "loss": 0.3253, + "step": 4157 + }, + { + "epoch": 0.32941176470588235, + "grad_norm": 1.7305090606284783, + "learning_rate": 1.5655441733412376e-05, + "loss": 0.2471, + "step": 4158 + }, + { + "epoch": 0.32949098831451773, + "grad_norm": 1.4725869928264486, + "learning_rate": 1.5653325294669884e-05, + "loss": 0.2099, + "step": 4159 + }, + { + "epoch": 0.3295702119231531, + "grad_norm": 1.726508259815418, + "learning_rate": 1.565120848368313e-05, + "loss": 0.2285, + "step": 4160 + }, + { + "epoch": 0.3296494355317885, + "grad_norm": 1.6955974922527628, + "learning_rate": 1.5649091300591482e-05, + "loss": 0.2198, + "step": 4161 + }, + { + "epoch": 0.3297286591404238, + "grad_norm": 1.485382215649945, + "learning_rate": 1.564697374553436e-05, + "loss": 0.1918, + "step": 4162 + }, + { + "epoch": 0.3298078827490592, + "grad_norm": 1.9590472179203442, + "learning_rate": 1.5644855818651184e-05, + "loss": 0.3356, + "step": 4163 + }, + { + "epoch": 0.3298871063576946, + "grad_norm": 2.0848972358316553, + "learning_rate": 1.564273752008141e-05, + "loss": 0.491, + "step": 4164 + }, + { + "epoch": 0.32996632996632996, + "grad_norm": 1.5195430850276355, + "learning_rate": 1.5640618849964528e-05, + "loss": 0.2071, + "step": 4165 + }, + { + "epoch": 0.33004555357496534, + "grad_norm": 2.8200981666878717, + "learning_rate": 1.5638499808440036e-05, + "loss": 0.2827, + "step": 4166 + }, + { + "epoch": 0.3301247771836007, + "grad_norm": 1.4756598003004047, + "learning_rate": 1.563638039564746e-05, + "loss": 0.1947, + "step": 4167 + }, + { + "epoch": 0.3302040007922361, + "grad_norm": 1.7680420416192881, + "learning_rate": 1.5634260611726355e-05, + "loss": 0.2517, + "step": 4168 + }, + { + "epoch": 0.33028322440087143, + "grad_norm": 2.7309634235464935, + "learning_rate": 1.5632140456816302e-05, + "loss": 0.3589, + "step": 4169 + }, + { + "epoch": 0.3303624480095068, + "grad_norm": 1.8039379695711888, + "learning_rate": 1.5630019931056894e-05, + "loss": 0.3873, + "step": 4170 + }, + { + "epoch": 0.3304416716181422, + "grad_norm": 2.0306410386562277, + "learning_rate": 1.5627899034587768e-05, + "loss": 0.248, + "step": 4171 + }, + { + "epoch": 0.3305208952267776, + "grad_norm": 1.8613563957747847, + "learning_rate": 1.562577776754857e-05, + "loss": 0.3148, + "step": 4172 + }, + { + "epoch": 0.33060011883541296, + "grad_norm": 2.0268976858785335, + "learning_rate": 1.5623656130078976e-05, + "loss": 0.3096, + "step": 4173 + }, + { + "epoch": 0.33067934244404834, + "grad_norm": 1.9642267780551177, + "learning_rate": 1.5621534122318682e-05, + "loss": 0.4011, + "step": 4174 + }, + { + "epoch": 0.3307585660526837, + "grad_norm": 2.2275136002737694, + "learning_rate": 1.5619411744407416e-05, + "loss": 0.2974, + "step": 4175 + }, + { + "epoch": 0.33083778966131905, + "grad_norm": 2.0573585547048574, + "learning_rate": 1.561728899648493e-05, + "loss": 0.4184, + "step": 4176 + }, + { + "epoch": 0.33091701326995443, + "grad_norm": 1.784065174703472, + "learning_rate": 1.561516587869099e-05, + "loss": 0.2368, + "step": 4177 + }, + { + "epoch": 0.3309962368785898, + "grad_norm": 2.345085993296679, + "learning_rate": 1.5613042391165395e-05, + "loss": 0.5117, + "step": 4178 + }, + { + "epoch": 0.3310754604872252, + "grad_norm": 1.8843346096104885, + "learning_rate": 1.5610918534047964e-05, + "loss": 0.3124, + "step": 4179 + }, + { + "epoch": 0.3311546840958606, + "grad_norm": 2.102157227950788, + "learning_rate": 1.5608794307478546e-05, + "loss": 0.3268, + "step": 4180 + }, + { + "epoch": 0.33123390770449596, + "grad_norm": 2.086338685799572, + "learning_rate": 1.5606669711597017e-05, + "loss": 0.2938, + "step": 4181 + }, + { + "epoch": 0.33131313131313134, + "grad_norm": 1.7652348594152758, + "learning_rate": 1.560454474654326e-05, + "loss": 0.2812, + "step": 4182 + }, + { + "epoch": 0.33139235492176666, + "grad_norm": 1.8252978613925877, + "learning_rate": 1.56024194124572e-05, + "loss": 0.263, + "step": 4183 + }, + { + "epoch": 0.33147157853040204, + "grad_norm": 1.864031163813262, + "learning_rate": 1.5600293709478776e-05, + "loss": 0.2456, + "step": 4184 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 1.8790595816595552, + "learning_rate": 1.559816763774796e-05, + "loss": 0.2499, + "step": 4185 + }, + { + "epoch": 0.3316300257476728, + "grad_norm": 1.3063733944967173, + "learning_rate": 1.559604119740474e-05, + "loss": 0.1497, + "step": 4186 + }, + { + "epoch": 0.3317092493563082, + "grad_norm": 2.011374018149606, + "learning_rate": 1.5593914388589136e-05, + "loss": 0.2942, + "step": 4187 + }, + { + "epoch": 0.33178847296494357, + "grad_norm": 1.9226572666156472, + "learning_rate": 1.559178721144119e-05, + "loss": 0.3166, + "step": 4188 + }, + { + "epoch": 0.33186769657357895, + "grad_norm": 2.7010134848217198, + "learning_rate": 1.5589659666100952e-05, + "loss": 0.3039, + "step": 4189 + }, + { + "epoch": 0.3319469201822143, + "grad_norm": 1.6929861645498105, + "learning_rate": 1.5587531752708528e-05, + "loss": 0.2536, + "step": 4190 + }, + { + "epoch": 0.33202614379084966, + "grad_norm": 2.0686873432262693, + "learning_rate": 1.558540347140402e-05, + "loss": 0.3263, + "step": 4191 + }, + { + "epoch": 0.33210536739948504, + "grad_norm": 2.151177216151281, + "learning_rate": 1.558327482232757e-05, + "loss": 0.2321, + "step": 4192 + }, + { + "epoch": 0.3321845910081204, + "grad_norm": 2.021348978810011, + "learning_rate": 1.558114580561934e-05, + "loss": 0.2866, + "step": 4193 + }, + { + "epoch": 0.3322638146167558, + "grad_norm": 1.9616187055423595, + "learning_rate": 1.557901642141951e-05, + "loss": 0.2853, + "step": 4194 + }, + { + "epoch": 0.3323430382253912, + "grad_norm": 1.82481141229278, + "learning_rate": 1.5576886669868297e-05, + "loss": 0.3359, + "step": 4195 + }, + { + "epoch": 0.33242226183402657, + "grad_norm": 1.9340850627275143, + "learning_rate": 1.5574756551105926e-05, + "loss": 0.3128, + "step": 4196 + }, + { + "epoch": 0.3325014854426619, + "grad_norm": 1.6995045651117409, + "learning_rate": 1.5572626065272666e-05, + "loss": 0.2094, + "step": 4197 + }, + { + "epoch": 0.3325807090512973, + "grad_norm": 1.5882197868249708, + "learning_rate": 1.557049521250879e-05, + "loss": 0.2524, + "step": 4198 + }, + { + "epoch": 0.33265993265993266, + "grad_norm": 2.107135541455425, + "learning_rate": 1.5568363992954607e-05, + "loss": 0.3428, + "step": 4199 + }, + { + "epoch": 0.33273915626856804, + "grad_norm": 2.1211769764202546, + "learning_rate": 1.556623240675045e-05, + "loss": 0.2182, + "step": 4200 + }, + { + "epoch": 0.3328183798772034, + "grad_norm": 1.8211730408897344, + "learning_rate": 1.556410045403667e-05, + "loss": 0.28, + "step": 4201 + }, + { + "epoch": 0.3328976034858388, + "grad_norm": 2.0872544797002566, + "learning_rate": 1.556196813495365e-05, + "loss": 0.3396, + "step": 4202 + }, + { + "epoch": 0.3329768270944741, + "grad_norm": 1.6635062661309072, + "learning_rate": 1.555983544964179e-05, + "loss": 0.2766, + "step": 4203 + }, + { + "epoch": 0.3330560507031095, + "grad_norm": 1.694082658103534, + "learning_rate": 1.555770239824152e-05, + "loss": 0.2121, + "step": 4204 + }, + { + "epoch": 0.3331352743117449, + "grad_norm": 2.605382853380774, + "learning_rate": 1.5555568980893284e-05, + "loss": 0.3387, + "step": 4205 + }, + { + "epoch": 0.33321449792038027, + "grad_norm": 1.800015559639178, + "learning_rate": 1.5553435197737566e-05, + "loss": 0.2996, + "step": 4206 + }, + { + "epoch": 0.33329372152901565, + "grad_norm": 1.9240469006806957, + "learning_rate": 1.5551301048914863e-05, + "loss": 0.3536, + "step": 4207 + }, + { + "epoch": 0.33337294513765103, + "grad_norm": 2.361148170414222, + "learning_rate": 1.5549166534565695e-05, + "loss": 0.3056, + "step": 4208 + }, + { + "epoch": 0.3334521687462864, + "grad_norm": 1.5466109690125924, + "learning_rate": 1.554703165483061e-05, + "loss": 0.2417, + "step": 4209 + }, + { + "epoch": 0.33353139235492174, + "grad_norm": 2.296150315066984, + "learning_rate": 1.5544896409850183e-05, + "loss": 0.2576, + "step": 4210 + }, + { + "epoch": 0.3336106159635571, + "grad_norm": 2.0513319074031404, + "learning_rate": 1.554276079976501e-05, + "loss": 0.2914, + "step": 4211 + }, + { + "epoch": 0.3336898395721925, + "grad_norm": 1.9963253522224222, + "learning_rate": 1.5540624824715703e-05, + "loss": 0.2474, + "step": 4212 + }, + { + "epoch": 0.3337690631808279, + "grad_norm": 1.509464447307471, + "learning_rate": 1.5538488484842914e-05, + "loss": 0.2196, + "step": 4213 + }, + { + "epoch": 0.33384828678946327, + "grad_norm": 1.6846127024733197, + "learning_rate": 1.553635178028731e-05, + "loss": 0.2665, + "step": 4214 + }, + { + "epoch": 0.33392751039809865, + "grad_norm": 1.988712110831499, + "learning_rate": 1.5534214711189574e-05, + "loss": 0.3297, + "step": 4215 + }, + { + "epoch": 0.33400673400673403, + "grad_norm": 2.27134904472091, + "learning_rate": 1.5532077277690435e-05, + "loss": 0.3467, + "step": 4216 + }, + { + "epoch": 0.33408595761536936, + "grad_norm": 1.7186316802985433, + "learning_rate": 1.552993947993062e-05, + "loss": 0.2703, + "step": 4217 + }, + { + "epoch": 0.33416518122400474, + "grad_norm": 2.039824739504798, + "learning_rate": 1.5527801318050904e-05, + "loss": 0.2278, + "step": 4218 + }, + { + "epoch": 0.3342444048326401, + "grad_norm": 1.5954402937053078, + "learning_rate": 1.5525662792192066e-05, + "loss": 0.1945, + "step": 4219 + }, + { + "epoch": 0.3343236284412755, + "grad_norm": 1.6366701666823844, + "learning_rate": 1.5523523902494927e-05, + "loss": 0.327, + "step": 4220 + }, + { + "epoch": 0.3344028520499109, + "grad_norm": 2.081910836589598, + "learning_rate": 1.552138464910031e-05, + "loss": 0.2706, + "step": 4221 + }, + { + "epoch": 0.33448207565854626, + "grad_norm": 2.349504732929474, + "learning_rate": 1.5519245032149083e-05, + "loss": 0.3777, + "step": 4222 + }, + { + "epoch": 0.33456129926718164, + "grad_norm": 1.6512342749360909, + "learning_rate": 1.5517105051782127e-05, + "loss": 0.1573, + "step": 4223 + }, + { + "epoch": 0.33464052287581697, + "grad_norm": 1.7863680575581886, + "learning_rate": 1.551496470814035e-05, + "loss": 0.2866, + "step": 4224 + }, + { + "epoch": 0.33471974648445235, + "grad_norm": 1.5758976090552037, + "learning_rate": 1.5512824001364686e-05, + "loss": 0.2174, + "step": 4225 + }, + { + "epoch": 0.33479897009308773, + "grad_norm": 1.9475637167806332, + "learning_rate": 1.5510682931596083e-05, + "loss": 0.2782, + "step": 4226 + }, + { + "epoch": 0.3348781937017231, + "grad_norm": 2.159770847930568, + "learning_rate": 1.550854149897553e-05, + "loss": 0.389, + "step": 4227 + }, + { + "epoch": 0.3349574173103585, + "grad_norm": 1.8354856213760793, + "learning_rate": 1.5506399703644017e-05, + "loss": 0.2698, + "step": 4228 + }, + { + "epoch": 0.3350366409189939, + "grad_norm": 1.9009134784812198, + "learning_rate": 1.5504257545742585e-05, + "loss": 0.3655, + "step": 4229 + }, + { + "epoch": 0.33511586452762926, + "grad_norm": 1.8220364471119024, + "learning_rate": 1.5502115025412275e-05, + "loss": 0.3391, + "step": 4230 + }, + { + "epoch": 0.3351950881362646, + "grad_norm": 2.025915788875568, + "learning_rate": 1.5499972142794167e-05, + "loss": 0.3257, + "step": 4231 + }, + { + "epoch": 0.33527431174489997, + "grad_norm": 1.6023690089589089, + "learning_rate": 1.5497828898029358e-05, + "loss": 0.2379, + "step": 4232 + }, + { + "epoch": 0.33535353535353535, + "grad_norm": 1.5530329381540535, + "learning_rate": 1.5495685291258967e-05, + "loss": 0.2621, + "step": 4233 + }, + { + "epoch": 0.33543275896217073, + "grad_norm": 1.6338481623741345, + "learning_rate": 1.5493541322624145e-05, + "loss": 0.2335, + "step": 4234 + }, + { + "epoch": 0.3355119825708061, + "grad_norm": 1.6827318394040827, + "learning_rate": 1.5491396992266065e-05, + "loss": 0.2813, + "step": 4235 + }, + { + "epoch": 0.3355912061794415, + "grad_norm": 1.979935126907643, + "learning_rate": 1.548925230032591e-05, + "loss": 0.3657, + "step": 4236 + }, + { + "epoch": 0.3356704297880769, + "grad_norm": 1.6773241521139335, + "learning_rate": 1.5487107246944902e-05, + "loss": 0.2955, + "step": 4237 + }, + { + "epoch": 0.3357496533967122, + "grad_norm": 2.100257671973178, + "learning_rate": 1.548496183226429e-05, + "loss": 0.3872, + "step": 4238 + }, + { + "epoch": 0.3358288770053476, + "grad_norm": 1.6312452651279, + "learning_rate": 1.548281605642533e-05, + "loss": 0.2521, + "step": 4239 + }, + { + "epoch": 0.33590810061398296, + "grad_norm": 1.9946315754549027, + "learning_rate": 1.5480669919569313e-05, + "loss": 0.2326, + "step": 4240 + }, + { + "epoch": 0.33598732422261834, + "grad_norm": 1.8239993388651272, + "learning_rate": 1.5478523421837553e-05, + "loss": 0.2659, + "step": 4241 + }, + { + "epoch": 0.3360665478312537, + "grad_norm": 1.9536843460951268, + "learning_rate": 1.5476376563371392e-05, + "loss": 0.2384, + "step": 4242 + }, + { + "epoch": 0.3361457714398891, + "grad_norm": 1.51697535987197, + "learning_rate": 1.547422934431218e-05, + "loss": 0.2301, + "step": 4243 + }, + { + "epoch": 0.33622499504852443, + "grad_norm": 2.1046197056263956, + "learning_rate": 1.5472081764801307e-05, + "loss": 0.4041, + "step": 4244 + }, + { + "epoch": 0.3363042186571598, + "grad_norm": 1.2886288483487907, + "learning_rate": 1.546993382498018e-05, + "loss": 0.2088, + "step": 4245 + }, + { + "epoch": 0.3363834422657952, + "grad_norm": 1.8452170013003963, + "learning_rate": 1.546778552499023e-05, + "loss": 0.2787, + "step": 4246 + }, + { + "epoch": 0.3364626658744306, + "grad_norm": 2.0496775864564563, + "learning_rate": 1.5465636864972914e-05, + "loss": 0.3261, + "step": 4247 + }, + { + "epoch": 0.33654188948306596, + "grad_norm": 2.0760124324458773, + "learning_rate": 1.5463487845069708e-05, + "loss": 0.4144, + "step": 4248 + }, + { + "epoch": 0.33662111309170134, + "grad_norm": 1.8105885126090164, + "learning_rate": 1.546133846542212e-05, + "loss": 0.3485, + "step": 4249 + }, + { + "epoch": 0.3367003367003367, + "grad_norm": 2.1672221876893065, + "learning_rate": 1.5459188726171666e-05, + "loss": 0.1811, + "step": 4250 + }, + { + "epoch": 0.33677956030897205, + "grad_norm": 2.3069163413501994, + "learning_rate": 1.5457038627459905e-05, + "loss": 0.3412, + "step": 4251 + }, + { + "epoch": 0.33685878391760743, + "grad_norm": 1.7577940423762994, + "learning_rate": 1.545488816942841e-05, + "loss": 0.1951, + "step": 4252 + }, + { + "epoch": 0.3369380075262428, + "grad_norm": 2.123441585135997, + "learning_rate": 1.5452737352218773e-05, + "loss": 0.355, + "step": 4253 + }, + { + "epoch": 0.3370172311348782, + "grad_norm": 2.183493113262432, + "learning_rate": 1.545058617597262e-05, + "loss": 0.3443, + "step": 4254 + }, + { + "epoch": 0.3370964547435136, + "grad_norm": 2.064295495283359, + "learning_rate": 1.544843464083159e-05, + "loss": 0.2968, + "step": 4255 + }, + { + "epoch": 0.33717567835214896, + "grad_norm": 2.1302273046599756, + "learning_rate": 1.544628274693736e-05, + "loss": 0.3033, + "step": 4256 + }, + { + "epoch": 0.33725490196078434, + "grad_norm": 1.5397902364478384, + "learning_rate": 1.5444130494431612e-05, + "loss": 0.1992, + "step": 4257 + }, + { + "epoch": 0.33733412556941966, + "grad_norm": 1.9645034232813676, + "learning_rate": 1.544197788345607e-05, + "loss": 0.2881, + "step": 4258 + }, + { + "epoch": 0.33741334917805504, + "grad_norm": 2.039604624926172, + "learning_rate": 1.543982491415247e-05, + "loss": 0.3182, + "step": 4259 + }, + { + "epoch": 0.3374925727866904, + "grad_norm": 1.8970238967569129, + "learning_rate": 1.5437671586662575e-05, + "loss": 0.2663, + "step": 4260 + }, + { + "epoch": 0.3375717963953258, + "grad_norm": 1.9660679800310723, + "learning_rate": 1.543551790112817e-05, + "loss": 0.3307, + "step": 4261 + }, + { + "epoch": 0.3376510200039612, + "grad_norm": 1.5030433319851695, + "learning_rate": 1.5433363857691067e-05, + "loss": 0.174, + "step": 4262 + }, + { + "epoch": 0.33773024361259657, + "grad_norm": 1.807588601157472, + "learning_rate": 1.5431209456493093e-05, + "loss": 0.2646, + "step": 4263 + }, + { + "epoch": 0.33780946722123195, + "grad_norm": 1.8680580415839958, + "learning_rate": 1.542905469767611e-05, + "loss": 0.2956, + "step": 4264 + }, + { + "epoch": 0.3378886908298673, + "grad_norm": 1.523110883624214, + "learning_rate": 1.5426899581382e-05, + "loss": 0.1813, + "step": 4265 + }, + { + "epoch": 0.33796791443850266, + "grad_norm": 2.245064697129695, + "learning_rate": 1.5424744107752666e-05, + "loss": 0.2906, + "step": 4266 + }, + { + "epoch": 0.33804713804713804, + "grad_norm": 2.1559301863917124, + "learning_rate": 1.542258827693003e-05, + "loss": 0.2919, + "step": 4267 + }, + { + "epoch": 0.3381263616557734, + "grad_norm": 1.742575540923704, + "learning_rate": 1.542043208905605e-05, + "loss": 0.1956, + "step": 4268 + }, + { + "epoch": 0.3382055852644088, + "grad_norm": 1.616308647585813, + "learning_rate": 1.5418275544272702e-05, + "loss": 0.263, + "step": 4269 + }, + { + "epoch": 0.3382848088730442, + "grad_norm": 2.527817253462671, + "learning_rate": 1.541611864272198e-05, + "loss": 0.358, + "step": 4270 + }, + { + "epoch": 0.33836403248167957, + "grad_norm": 1.8537353435793598, + "learning_rate": 1.5413961384545902e-05, + "loss": 0.3152, + "step": 4271 + }, + { + "epoch": 0.3384432560903149, + "grad_norm": 2.2402571345985494, + "learning_rate": 1.541180376988652e-05, + "loss": 0.3091, + "step": 4272 + }, + { + "epoch": 0.3385224796989503, + "grad_norm": 2.2787702443716054, + "learning_rate": 1.54096457988859e-05, + "loss": 0.4223, + "step": 4273 + }, + { + "epoch": 0.33860170330758566, + "grad_norm": 1.7161948501520208, + "learning_rate": 1.540748747168613e-05, + "loss": 0.2491, + "step": 4274 + }, + { + "epoch": 0.33868092691622104, + "grad_norm": 1.8799437774761005, + "learning_rate": 1.5405328788429333e-05, + "loss": 0.2264, + "step": 4275 + }, + { + "epoch": 0.3387601505248564, + "grad_norm": 2.258720491672299, + "learning_rate": 1.5403169749257644e-05, + "loss": 0.4304, + "step": 4276 + }, + { + "epoch": 0.3388393741334918, + "grad_norm": 1.854586281164233, + "learning_rate": 1.5401010354313222e-05, + "loss": 0.2844, + "step": 4277 + }, + { + "epoch": 0.3389185977421272, + "grad_norm": 2.0277851794350585, + "learning_rate": 1.539885060373826e-05, + "loss": 0.3632, + "step": 4278 + }, + { + "epoch": 0.3389978213507625, + "grad_norm": 1.683993755207749, + "learning_rate": 1.539669049767496e-05, + "loss": 0.2834, + "step": 4279 + }, + { + "epoch": 0.3390770449593979, + "grad_norm": 2.2606856110645066, + "learning_rate": 1.539453003626556e-05, + "loss": 0.2712, + "step": 4280 + }, + { + "epoch": 0.33915626856803327, + "grad_norm": 1.7078151134332757, + "learning_rate": 1.5392369219652313e-05, + "loss": 0.3186, + "step": 4281 + }, + { + "epoch": 0.33923549217666865, + "grad_norm": 1.6712324007049093, + "learning_rate": 1.53902080479775e-05, + "loss": 0.2478, + "step": 4282 + }, + { + "epoch": 0.33931471578530403, + "grad_norm": 2.1278755825112388, + "learning_rate": 1.5388046521383424e-05, + "loss": 0.3337, + "step": 4283 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 1.6972863548899855, + "learning_rate": 1.538588464001241e-05, + "loss": 0.2332, + "step": 4284 + }, + { + "epoch": 0.33947316300257474, + "grad_norm": 2.2271294790647103, + "learning_rate": 1.5383722404006808e-05, + "loss": 0.4383, + "step": 4285 + }, + { + "epoch": 0.3395523866112101, + "grad_norm": 2.237778411149077, + "learning_rate": 1.5381559813508986e-05, + "loss": 0.4502, + "step": 4286 + }, + { + "epoch": 0.3396316102198455, + "grad_norm": 1.5086851347808645, + "learning_rate": 1.537939686866135e-05, + "loss": 0.2157, + "step": 4287 + }, + { + "epoch": 0.3397108338284809, + "grad_norm": 1.8727890550763813, + "learning_rate": 1.5377233569606312e-05, + "loss": 0.2876, + "step": 4288 + }, + { + "epoch": 0.33979005743711627, + "grad_norm": 1.7398716979858997, + "learning_rate": 1.5375069916486318e-05, + "loss": 0.3142, + "step": 4289 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 1.7613656628020053, + "learning_rate": 1.5372905909443833e-05, + "loss": 0.2358, + "step": 4290 + }, + { + "epoch": 0.33994850465438703, + "grad_norm": 1.601557636102539, + "learning_rate": 1.5370741548621343e-05, + "loss": 0.3144, + "step": 4291 + }, + { + "epoch": 0.34002772826302236, + "grad_norm": 1.9591043075721617, + "learning_rate": 1.5368576834161372e-05, + "loss": 0.2454, + "step": 4292 + }, + { + "epoch": 0.34010695187165774, + "grad_norm": 1.8706199646952146, + "learning_rate": 1.536641176620644e-05, + "loss": 0.342, + "step": 4293 + }, + { + "epoch": 0.3401861754802931, + "grad_norm": 1.9926457981504575, + "learning_rate": 1.536424634489912e-05, + "loss": 0.2907, + "step": 4294 + }, + { + "epoch": 0.3402653990889285, + "grad_norm": 2.328294810366131, + "learning_rate": 1.536208057038199e-05, + "loss": 0.4167, + "step": 4295 + }, + { + "epoch": 0.3403446226975639, + "grad_norm": 1.502990605492509, + "learning_rate": 1.535991444279765e-05, + "loss": 0.2031, + "step": 4296 + }, + { + "epoch": 0.34042384630619926, + "grad_norm": 1.651332549206242, + "learning_rate": 1.535774796228874e-05, + "loss": 0.2222, + "step": 4297 + }, + { + "epoch": 0.34050306991483464, + "grad_norm": 1.7861513747887263, + "learning_rate": 1.5355581128997904e-05, + "loss": 0.2726, + "step": 4298 + }, + { + "epoch": 0.34058229352346997, + "grad_norm": 2.226154747167074, + "learning_rate": 1.5353413943067818e-05, + "loss": 0.3108, + "step": 4299 + }, + { + "epoch": 0.34066151713210535, + "grad_norm": 1.6325333448809107, + "learning_rate": 1.5351246404641183e-05, + "loss": 0.1681, + "step": 4300 + }, + { + "epoch": 0.34074074074074073, + "grad_norm": 2.1551814409247325, + "learning_rate": 1.5349078513860728e-05, + "loss": 0.2381, + "step": 4301 + }, + { + "epoch": 0.3408199643493761, + "grad_norm": 2.0248073267510542, + "learning_rate": 1.534691027086918e-05, + "loss": 0.3461, + "step": 4302 + }, + { + "epoch": 0.3408991879580115, + "grad_norm": 1.7560626074293617, + "learning_rate": 1.5344741675809328e-05, + "loss": 0.2805, + "step": 4303 + }, + { + "epoch": 0.3409784115666469, + "grad_norm": 1.8682179154893879, + "learning_rate": 1.534257272882395e-05, + "loss": 0.2586, + "step": 4304 + }, + { + "epoch": 0.34105763517528226, + "grad_norm": 1.7019759340309528, + "learning_rate": 1.5340403430055864e-05, + "loss": 0.2146, + "step": 4305 + }, + { + "epoch": 0.3411368587839176, + "grad_norm": 1.7510238402428289, + "learning_rate": 1.533823377964791e-05, + "loss": 0.314, + "step": 4306 + }, + { + "epoch": 0.34121608239255297, + "grad_norm": 1.840822756088021, + "learning_rate": 1.5336063777742944e-05, + "loss": 0.2424, + "step": 4307 + }, + { + "epoch": 0.34129530600118835, + "grad_norm": 1.9454782748104735, + "learning_rate": 1.5333893424483856e-05, + "loss": 0.3007, + "step": 4308 + }, + { + "epoch": 0.34137452960982373, + "grad_norm": 1.8657968760586143, + "learning_rate": 1.5331722720013555e-05, + "loss": 0.2632, + "step": 4309 + }, + { + "epoch": 0.3414537532184591, + "grad_norm": 1.7602105159920918, + "learning_rate": 1.532955166447496e-05, + "loss": 0.3302, + "step": 4310 + }, + { + "epoch": 0.3415329768270945, + "grad_norm": 2.4262218483881566, + "learning_rate": 1.5327380258011037e-05, + "loss": 0.3591, + "step": 4311 + }, + { + "epoch": 0.3416122004357299, + "grad_norm": 1.8887095956710382, + "learning_rate": 1.5325208500764756e-05, + "loss": 0.2521, + "step": 4312 + }, + { + "epoch": 0.3416914240443652, + "grad_norm": 1.976389398706188, + "learning_rate": 1.532303639287912e-05, + "loss": 0.2855, + "step": 4313 + }, + { + "epoch": 0.3417706476530006, + "grad_norm": 1.9927260923008052, + "learning_rate": 1.532086393449715e-05, + "loss": 0.3559, + "step": 4314 + }, + { + "epoch": 0.34184987126163596, + "grad_norm": 2.208702772151743, + "learning_rate": 1.531869112576189e-05, + "loss": 0.2779, + "step": 4315 + }, + { + "epoch": 0.34192909487027134, + "grad_norm": 2.1544575997026607, + "learning_rate": 1.5316517966816414e-05, + "loss": 0.2869, + "step": 4316 + }, + { + "epoch": 0.3420083184789067, + "grad_norm": 1.7013860031784205, + "learning_rate": 1.5314344457803812e-05, + "loss": 0.2025, + "step": 4317 + }, + { + "epoch": 0.3420875420875421, + "grad_norm": 1.644489043748052, + "learning_rate": 1.5312170598867195e-05, + "loss": 0.2501, + "step": 4318 + }, + { + "epoch": 0.3421667656961775, + "grad_norm": 1.843467781325001, + "learning_rate": 1.5309996390149708e-05, + "loss": 0.2452, + "step": 4319 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 2.1051344368196356, + "learning_rate": 1.5307821831794506e-05, + "loss": 0.3663, + "step": 4320 + }, + { + "epoch": 0.3423252129134482, + "grad_norm": 1.9273496119563407, + "learning_rate": 1.5305646923944776e-05, + "loss": 0.2697, + "step": 4321 + }, + { + "epoch": 0.3424044365220836, + "grad_norm": 1.884762212074546, + "learning_rate": 1.5303471666743727e-05, + "loss": 0.277, + "step": 4322 + }, + { + "epoch": 0.34248366013071896, + "grad_norm": 1.5493395238928411, + "learning_rate": 1.5301296060334588e-05, + "loss": 0.2337, + "step": 4323 + }, + { + "epoch": 0.34256288373935434, + "grad_norm": 2.08668779338647, + "learning_rate": 1.529912010486061e-05, + "loss": 0.2796, + "step": 4324 + }, + { + "epoch": 0.3426421073479897, + "grad_norm": 2.1830172210967707, + "learning_rate": 1.5296943800465068e-05, + "loss": 0.3285, + "step": 4325 + }, + { + "epoch": 0.34272133095662505, + "grad_norm": 1.8116074492912733, + "learning_rate": 1.529476714729127e-05, + "loss": 0.32, + "step": 4326 + }, + { + "epoch": 0.34280055456526043, + "grad_norm": 1.6807150398042805, + "learning_rate": 1.529259014548253e-05, + "loss": 0.2676, + "step": 4327 + }, + { + "epoch": 0.3428797781738958, + "grad_norm": 1.7394546432822156, + "learning_rate": 1.5290412795182193e-05, + "loss": 0.2546, + "step": 4328 + }, + { + "epoch": 0.3429590017825312, + "grad_norm": 2.1086588191264313, + "learning_rate": 1.528823509653363e-05, + "loss": 0.4513, + "step": 4329 + }, + { + "epoch": 0.3430382253911666, + "grad_norm": 1.5969371528609895, + "learning_rate": 1.5286057049680236e-05, + "loss": 0.3001, + "step": 4330 + }, + { + "epoch": 0.34311744899980196, + "grad_norm": 2.27881170248216, + "learning_rate": 1.5283878654765414e-05, + "loss": 0.254, + "step": 4331 + }, + { + "epoch": 0.34319667260843734, + "grad_norm": 1.4007672243605134, + "learning_rate": 1.5281699911932612e-05, + "loss": 0.173, + "step": 4332 + }, + { + "epoch": 0.34327589621707266, + "grad_norm": 1.55334928524067, + "learning_rate": 1.527952082132528e-05, + "loss": 0.3496, + "step": 4333 + }, + { + "epoch": 0.34335511982570804, + "grad_norm": 1.8427773871932651, + "learning_rate": 1.5277341383086906e-05, + "loss": 0.3233, + "step": 4334 + }, + { + "epoch": 0.3434343434343434, + "grad_norm": 1.5660673113715498, + "learning_rate": 1.5275161597360996e-05, + "loss": 0.2783, + "step": 4335 + }, + { + "epoch": 0.3435135670429788, + "grad_norm": 2.289739566600019, + "learning_rate": 1.5272981464291077e-05, + "loss": 0.3417, + "step": 4336 + }, + { + "epoch": 0.3435927906516142, + "grad_norm": 1.957723131340051, + "learning_rate": 1.5270800984020705e-05, + "loss": 0.2761, + "step": 4337 + }, + { + "epoch": 0.34367201426024957, + "grad_norm": 2.054671639153292, + "learning_rate": 1.5268620156693444e-05, + "loss": 0.3202, + "step": 4338 + }, + { + "epoch": 0.34375123786888495, + "grad_norm": 1.7408085922341083, + "learning_rate": 1.52664389824529e-05, + "loss": 0.2798, + "step": 4339 + }, + { + "epoch": 0.3438304614775203, + "grad_norm": 1.6112369798064616, + "learning_rate": 1.5264257461442687e-05, + "loss": 0.1713, + "step": 4340 + }, + { + "epoch": 0.34390968508615566, + "grad_norm": 2.2249323887993793, + "learning_rate": 1.526207559380645e-05, + "loss": 0.3886, + "step": 4341 + }, + { + "epoch": 0.34398890869479104, + "grad_norm": 1.9314707057397802, + "learning_rate": 1.5259893379687855e-05, + "loss": 0.2596, + "step": 4342 + }, + { + "epoch": 0.3440681323034264, + "grad_norm": 2.040388022390409, + "learning_rate": 1.525771081923059e-05, + "loss": 0.3465, + "step": 4343 + }, + { + "epoch": 0.3441473559120618, + "grad_norm": 2.2032934823475507, + "learning_rate": 1.525552791257837e-05, + "loss": 0.4216, + "step": 4344 + }, + { + "epoch": 0.3442265795206972, + "grad_norm": 1.738034897592833, + "learning_rate": 1.525334465987492e-05, + "loss": 0.2187, + "step": 4345 + }, + { + "epoch": 0.34430580312933257, + "grad_norm": 1.8480308926002667, + "learning_rate": 1.5251161061264003e-05, + "loss": 0.2848, + "step": 4346 + }, + { + "epoch": 0.3443850267379679, + "grad_norm": 1.7362872484399905, + "learning_rate": 1.5248977116889396e-05, + "loss": 0.2908, + "step": 4347 + }, + { + "epoch": 0.3444642503466033, + "grad_norm": 1.975588266543425, + "learning_rate": 1.5246792826894906e-05, + "loss": 0.3602, + "step": 4348 + }, + { + "epoch": 0.34454347395523865, + "grad_norm": 1.9725950094402869, + "learning_rate": 1.5244608191424352e-05, + "loss": 0.3373, + "step": 4349 + }, + { + "epoch": 0.34462269756387404, + "grad_norm": 1.5802875107101795, + "learning_rate": 1.5242423210621584e-05, + "loss": 0.1813, + "step": 4350 + }, + { + "epoch": 0.3447019211725094, + "grad_norm": 1.7529626213171856, + "learning_rate": 1.5240237884630471e-05, + "loss": 0.4385, + "step": 4351 + }, + { + "epoch": 0.3447811447811448, + "grad_norm": 1.8639229728251556, + "learning_rate": 1.5238052213594912e-05, + "loss": 0.3419, + "step": 4352 + }, + { + "epoch": 0.3448603683897802, + "grad_norm": 2.056434018432276, + "learning_rate": 1.5235866197658812e-05, + "loss": 0.3015, + "step": 4353 + }, + { + "epoch": 0.3449395919984155, + "grad_norm": 1.764657805949268, + "learning_rate": 1.5233679836966122e-05, + "loss": 0.3147, + "step": 4354 + }, + { + "epoch": 0.3450188156070509, + "grad_norm": 2.0679386147054832, + "learning_rate": 1.5231493131660794e-05, + "loss": 0.2619, + "step": 4355 + }, + { + "epoch": 0.34509803921568627, + "grad_norm": 1.888886802634934, + "learning_rate": 1.5229306081886818e-05, + "loss": 0.2476, + "step": 4356 + }, + { + "epoch": 0.34517726282432165, + "grad_norm": 2.045020138735299, + "learning_rate": 1.5227118687788198e-05, + "loss": 0.2444, + "step": 4357 + }, + { + "epoch": 0.34525648643295703, + "grad_norm": 2.1061354082554176, + "learning_rate": 1.5224930949508964e-05, + "loss": 0.2349, + "step": 4358 + }, + { + "epoch": 0.3453357100415924, + "grad_norm": 1.6833212123636772, + "learning_rate": 1.5222742867193167e-05, + "loss": 0.2171, + "step": 4359 + }, + { + "epoch": 0.3454149336502278, + "grad_norm": 2.2059804935763085, + "learning_rate": 1.5220554440984882e-05, + "loss": 0.2891, + "step": 4360 + }, + { + "epoch": 0.3454941572588631, + "grad_norm": 1.9399908906967271, + "learning_rate": 1.5218365671028207e-05, + "loss": 0.2718, + "step": 4361 + }, + { + "epoch": 0.3455733808674985, + "grad_norm": 2.064133781808747, + "learning_rate": 1.5216176557467265e-05, + "loss": 0.4346, + "step": 4362 + }, + { + "epoch": 0.3456526044761339, + "grad_norm": 2.341288935705729, + "learning_rate": 1.521398710044619e-05, + "loss": 0.2904, + "step": 4363 + }, + { + "epoch": 0.34573182808476927, + "grad_norm": 1.480486934605892, + "learning_rate": 1.5211797300109154e-05, + "loss": 0.1788, + "step": 4364 + }, + { + "epoch": 0.34581105169340465, + "grad_norm": 1.8980110514113189, + "learning_rate": 1.5209607156600346e-05, + "loss": 0.2404, + "step": 4365 + }, + { + "epoch": 0.34589027530204003, + "grad_norm": 1.918180580012612, + "learning_rate": 1.520741667006397e-05, + "loss": 0.2985, + "step": 4366 + }, + { + "epoch": 0.34596949891067535, + "grad_norm": 1.7157258831670843, + "learning_rate": 1.5205225840644264e-05, + "loss": 0.2572, + "step": 4367 + }, + { + "epoch": 0.34604872251931074, + "grad_norm": 1.4616233109176535, + "learning_rate": 1.5203034668485486e-05, + "loss": 0.2215, + "step": 4368 + }, + { + "epoch": 0.3461279461279461, + "grad_norm": 1.7973744840780148, + "learning_rate": 1.5200843153731905e-05, + "loss": 0.2964, + "step": 4369 + }, + { + "epoch": 0.3462071697365815, + "grad_norm": 1.8689465298530563, + "learning_rate": 1.519865129652783e-05, + "loss": 0.2368, + "step": 4370 + }, + { + "epoch": 0.3462863933452169, + "grad_norm": 2.146848217514277, + "learning_rate": 1.5196459097017582e-05, + "loss": 0.2507, + "step": 4371 + }, + { + "epoch": 0.34636561695385226, + "grad_norm": 2.4399665904302963, + "learning_rate": 1.5194266555345505e-05, + "loss": 0.2897, + "step": 4372 + }, + { + "epoch": 0.34644484056248764, + "grad_norm": 1.970614575243572, + "learning_rate": 1.5192073671655969e-05, + "loss": 0.3587, + "step": 4373 + }, + { + "epoch": 0.34652406417112297, + "grad_norm": 2.0283610101737573, + "learning_rate": 1.5189880446093366e-05, + "loss": 0.2925, + "step": 4374 + }, + { + "epoch": 0.34660328777975835, + "grad_norm": 1.6896989147708572, + "learning_rate": 1.5187686878802108e-05, + "loss": 0.2192, + "step": 4375 + }, + { + "epoch": 0.34668251138839373, + "grad_norm": 2.3203107076858167, + "learning_rate": 1.5185492969926627e-05, + "loss": 0.3606, + "step": 4376 + }, + { + "epoch": 0.3467617349970291, + "grad_norm": 1.7853082041339545, + "learning_rate": 1.5183298719611388e-05, + "loss": 0.3665, + "step": 4377 + }, + { + "epoch": 0.3468409586056645, + "grad_norm": 1.6594019539959624, + "learning_rate": 1.5181104128000868e-05, + "loss": 0.2264, + "step": 4378 + }, + { + "epoch": 0.3469201822142999, + "grad_norm": 1.9559896403703443, + "learning_rate": 1.517890919523957e-05, + "loss": 0.2923, + "step": 4379 + }, + { + "epoch": 0.34699940582293526, + "grad_norm": 1.880034336558596, + "learning_rate": 1.517671392147202e-05, + "loss": 0.3446, + "step": 4380 + }, + { + "epoch": 0.3470786294315706, + "grad_norm": 2.689484204475275, + "learning_rate": 1.517451830684277e-05, + "loss": 0.4122, + "step": 4381 + }, + { + "epoch": 0.34715785304020597, + "grad_norm": 2.182067699308231, + "learning_rate": 1.5172322351496385e-05, + "loss": 0.3123, + "step": 4382 + }, + { + "epoch": 0.34723707664884135, + "grad_norm": 2.1286155319351714, + "learning_rate": 1.517012605557746e-05, + "loss": 0.3076, + "step": 4383 + }, + { + "epoch": 0.34731630025747673, + "grad_norm": 1.861668082962604, + "learning_rate": 1.5167929419230616e-05, + "loss": 0.3237, + "step": 4384 + }, + { + "epoch": 0.3473955238661121, + "grad_norm": 1.66810975360935, + "learning_rate": 1.516573244260048e-05, + "loss": 0.2895, + "step": 4385 + }, + { + "epoch": 0.3474747474747475, + "grad_norm": 1.8512312432650195, + "learning_rate": 1.5163535125831724e-05, + "loss": 0.3088, + "step": 4386 + }, + { + "epoch": 0.3475539710833829, + "grad_norm": 2.0341052801180446, + "learning_rate": 1.5161337469069024e-05, + "loss": 0.3371, + "step": 4387 + }, + { + "epoch": 0.3476331946920182, + "grad_norm": 1.976628083975725, + "learning_rate": 1.5159139472457086e-05, + "loss": 0.3395, + "step": 4388 + }, + { + "epoch": 0.3477124183006536, + "grad_norm": 1.696058103457667, + "learning_rate": 1.5156941136140637e-05, + "loss": 0.2882, + "step": 4389 + }, + { + "epoch": 0.34779164190928896, + "grad_norm": 2.107799749146764, + "learning_rate": 1.5154742460264426e-05, + "loss": 0.297, + "step": 4390 + }, + { + "epoch": 0.34787086551792434, + "grad_norm": 1.4747042291311936, + "learning_rate": 1.515254344497323e-05, + "loss": 0.2135, + "step": 4391 + }, + { + "epoch": 0.3479500891265597, + "grad_norm": 2.88685922740951, + "learning_rate": 1.5150344090411841e-05, + "loss": 0.3602, + "step": 4392 + }, + { + "epoch": 0.3480293127351951, + "grad_norm": 1.706895664844948, + "learning_rate": 1.5148144396725072e-05, + "loss": 0.1824, + "step": 4393 + }, + { + "epoch": 0.3481085363438305, + "grad_norm": 1.915496284971201, + "learning_rate": 1.514594436405777e-05, + "loss": 0.226, + "step": 4394 + }, + { + "epoch": 0.3481877599524658, + "grad_norm": 2.150580405982988, + "learning_rate": 1.5143743992554791e-05, + "loss": 0.3376, + "step": 4395 + }, + { + "epoch": 0.3482669835611012, + "grad_norm": 1.5297964883482145, + "learning_rate": 1.514154328236102e-05, + "loss": 0.2836, + "step": 4396 + }, + { + "epoch": 0.3483462071697366, + "grad_norm": 1.7733065070222984, + "learning_rate": 1.5139342233621364e-05, + "loss": 0.2042, + "step": 4397 + }, + { + "epoch": 0.34842543077837196, + "grad_norm": 1.7394350376309586, + "learning_rate": 1.5137140846480752e-05, + "loss": 0.1713, + "step": 4398 + }, + { + "epoch": 0.34850465438700734, + "grad_norm": 1.9405996273018045, + "learning_rate": 1.5134939121084129e-05, + "loss": 0.2823, + "step": 4399 + }, + { + "epoch": 0.3485838779956427, + "grad_norm": 2.338470253130088, + "learning_rate": 1.5132737057576476e-05, + "loss": 0.4525, + "step": 4400 + }, + { + "epoch": 0.34866310160427805, + "grad_norm": 1.6704759171154522, + "learning_rate": 1.5130534656102783e-05, + "loss": 0.2541, + "step": 4401 + }, + { + "epoch": 0.34874232521291343, + "grad_norm": 1.8266411948811352, + "learning_rate": 1.512833191680807e-05, + "loss": 0.3236, + "step": 4402 + }, + { + "epoch": 0.3488215488215488, + "grad_norm": 2.061409850804772, + "learning_rate": 1.5126128839837378e-05, + "loss": 0.2578, + "step": 4403 + }, + { + "epoch": 0.3489007724301842, + "grad_norm": 1.5343577896229241, + "learning_rate": 1.5123925425335766e-05, + "loss": 0.2321, + "step": 4404 + }, + { + "epoch": 0.3489799960388196, + "grad_norm": 2.005253722607864, + "learning_rate": 1.5121721673448319e-05, + "loss": 0.2829, + "step": 4405 + }, + { + "epoch": 0.34905921964745495, + "grad_norm": 2.089500827857083, + "learning_rate": 1.5119517584320146e-05, + "loss": 0.3455, + "step": 4406 + }, + { + "epoch": 0.34913844325609034, + "grad_norm": 1.8841284410038224, + "learning_rate": 1.5117313158096371e-05, + "loss": 0.259, + "step": 4407 + }, + { + "epoch": 0.34921766686472566, + "grad_norm": 1.699858850561031, + "learning_rate": 1.511510839492215e-05, + "loss": 0.3421, + "step": 4408 + }, + { + "epoch": 0.34929689047336104, + "grad_norm": 1.7271775514665164, + "learning_rate": 1.5112903294942651e-05, + "loss": 0.1925, + "step": 4409 + }, + { + "epoch": 0.3493761140819964, + "grad_norm": 2.2917828207714432, + "learning_rate": 1.5110697858303072e-05, + "loss": 0.3069, + "step": 4410 + }, + { + "epoch": 0.3494553376906318, + "grad_norm": 2.0199844096252155, + "learning_rate": 1.5108492085148632e-05, + "loss": 0.3332, + "step": 4411 + }, + { + "epoch": 0.3495345612992672, + "grad_norm": 2.9004469049775934, + "learning_rate": 1.5106285975624568e-05, + "loss": 0.3829, + "step": 4412 + }, + { + "epoch": 0.34961378490790257, + "grad_norm": 1.6794746118887558, + "learning_rate": 1.5104079529876143e-05, + "loss": 0.1862, + "step": 4413 + }, + { + "epoch": 0.34969300851653795, + "grad_norm": 2.670433659965118, + "learning_rate": 1.510187274804864e-05, + "loss": 0.2534, + "step": 4414 + }, + { + "epoch": 0.3497722321251733, + "grad_norm": 1.9237606531490077, + "learning_rate": 1.5099665630287365e-05, + "loss": 0.3361, + "step": 4415 + }, + { + "epoch": 0.34985145573380866, + "grad_norm": 2.3536509688690233, + "learning_rate": 1.5097458176737647e-05, + "loss": 0.28, + "step": 4416 + }, + { + "epoch": 0.34993067934244404, + "grad_norm": 1.9985007451799834, + "learning_rate": 1.5095250387544833e-05, + "loss": 0.2482, + "step": 4417 + }, + { + "epoch": 0.3500099029510794, + "grad_norm": 1.4664203288167474, + "learning_rate": 1.5093042262854297e-05, + "loss": 0.1871, + "step": 4418 + }, + { + "epoch": 0.3500891265597148, + "grad_norm": 2.071109023299152, + "learning_rate": 1.509083380281144e-05, + "loss": 0.2448, + "step": 4419 + }, + { + "epoch": 0.3501683501683502, + "grad_norm": 1.6111722903949248, + "learning_rate": 1.5088625007561668e-05, + "loss": 0.3055, + "step": 4420 + }, + { + "epoch": 0.35024757377698557, + "grad_norm": 2.402300155827407, + "learning_rate": 1.5086415877250424e-05, + "loss": 0.3575, + "step": 4421 + }, + { + "epoch": 0.3503267973856209, + "grad_norm": 2.1828640785355224, + "learning_rate": 1.5084206412023172e-05, + "loss": 0.2519, + "step": 4422 + }, + { + "epoch": 0.3504060209942563, + "grad_norm": 1.87727170332503, + "learning_rate": 1.5081996612025387e-05, + "loss": 0.3044, + "step": 4423 + }, + { + "epoch": 0.35048524460289165, + "grad_norm": 1.8376061913158677, + "learning_rate": 1.5079786477402581e-05, + "loss": 0.2773, + "step": 4424 + }, + { + "epoch": 0.35056446821152704, + "grad_norm": 1.9599922868668826, + "learning_rate": 1.5077576008300278e-05, + "loss": 0.4042, + "step": 4425 + }, + { + "epoch": 0.3506436918201624, + "grad_norm": 1.886436161981459, + "learning_rate": 1.5075365204864025e-05, + "loss": 0.3172, + "step": 4426 + }, + { + "epoch": 0.3507229154287978, + "grad_norm": 2.158573459895322, + "learning_rate": 1.5073154067239396e-05, + "loss": 0.4641, + "step": 4427 + }, + { + "epoch": 0.3508021390374332, + "grad_norm": 2.163469986816108, + "learning_rate": 1.507094259557198e-05, + "loss": 0.281, + "step": 4428 + }, + { + "epoch": 0.3508813626460685, + "grad_norm": 2.196740423828265, + "learning_rate": 1.5068730790007395e-05, + "loss": 0.3008, + "step": 4429 + }, + { + "epoch": 0.3509605862547039, + "grad_norm": 1.799638147572534, + "learning_rate": 1.5066518650691277e-05, + "loss": 0.2728, + "step": 4430 + }, + { + "epoch": 0.35103980986333927, + "grad_norm": 2.1098419869514276, + "learning_rate": 1.5064306177769284e-05, + "loss": 0.3632, + "step": 4431 + }, + { + "epoch": 0.35111903347197465, + "grad_norm": 1.801827280728131, + "learning_rate": 1.5062093371387097e-05, + "loss": 0.3705, + "step": 4432 + }, + { + "epoch": 0.35119825708061003, + "grad_norm": 1.752504445294334, + "learning_rate": 1.5059880231690418e-05, + "loss": 0.1788, + "step": 4433 + }, + { + "epoch": 0.3512774806892454, + "grad_norm": 1.83183042013958, + "learning_rate": 1.5057666758824974e-05, + "loss": 0.3134, + "step": 4434 + }, + { + "epoch": 0.3513567042978808, + "grad_norm": 2.5327598128970923, + "learning_rate": 1.5055452952936512e-05, + "loss": 0.2955, + "step": 4435 + }, + { + "epoch": 0.3514359279065161, + "grad_norm": 2.4047093866945946, + "learning_rate": 1.5053238814170792e-05, + "loss": 0.2791, + "step": 4436 + }, + { + "epoch": 0.3515151515151515, + "grad_norm": 1.728766487939823, + "learning_rate": 1.5051024342673614e-05, + "loss": 0.2045, + "step": 4437 + }, + { + "epoch": 0.3515943751237869, + "grad_norm": 2.4855068979562174, + "learning_rate": 1.5048809538590789e-05, + "loss": 0.3207, + "step": 4438 + }, + { + "epoch": 0.35167359873242227, + "grad_norm": 2.158786644866198, + "learning_rate": 1.5046594402068147e-05, + "loss": 0.3248, + "step": 4439 + }, + { + "epoch": 0.35175282234105765, + "grad_norm": 2.3381376925169883, + "learning_rate": 1.5044378933251546e-05, + "loss": 0.323, + "step": 4440 + }, + { + "epoch": 0.35183204594969303, + "grad_norm": 1.7281068557815027, + "learning_rate": 1.5042163132286867e-05, + "loss": 0.2607, + "step": 4441 + }, + { + "epoch": 0.35191126955832835, + "grad_norm": 1.7203874216493409, + "learning_rate": 1.5039946999320004e-05, + "loss": 0.2534, + "step": 4442 + }, + { + "epoch": 0.35199049316696374, + "grad_norm": 1.5575732626889371, + "learning_rate": 1.5037730534496882e-05, + "loss": 0.2317, + "step": 4443 + }, + { + "epoch": 0.3520697167755991, + "grad_norm": 1.8546706015814542, + "learning_rate": 1.5035513737963445e-05, + "loss": 0.2794, + "step": 4444 + }, + { + "epoch": 0.3521489403842345, + "grad_norm": 1.7444183718993096, + "learning_rate": 1.5033296609865658e-05, + "loss": 0.1767, + "step": 4445 + }, + { + "epoch": 0.3522281639928699, + "grad_norm": 1.5804079466245837, + "learning_rate": 1.503107915034951e-05, + "loss": 0.2115, + "step": 4446 + }, + { + "epoch": 0.35230738760150526, + "grad_norm": 1.6798556614448925, + "learning_rate": 1.5028861359561005e-05, + "loss": 0.3022, + "step": 4447 + }, + { + "epoch": 0.35238661121014064, + "grad_norm": 1.643809781537248, + "learning_rate": 1.5026643237646176e-05, + "loss": 0.2498, + "step": 4448 + }, + { + "epoch": 0.35246583481877597, + "grad_norm": 1.8262600883655196, + "learning_rate": 1.5024424784751079e-05, + "loss": 0.3141, + "step": 4449 + }, + { + "epoch": 0.35254505842741135, + "grad_norm": 1.9265408123435968, + "learning_rate": 1.5022206001021784e-05, + "loss": 0.2125, + "step": 4450 + }, + { + "epoch": 0.35262428203604673, + "grad_norm": 1.8989136232334514, + "learning_rate": 1.501998688660439e-05, + "loss": 0.2919, + "step": 4451 + }, + { + "epoch": 0.3527035056446821, + "grad_norm": 1.6806567197407747, + "learning_rate": 1.5017767441645015e-05, + "loss": 0.261, + "step": 4452 + }, + { + "epoch": 0.3527827292533175, + "grad_norm": 2.37236928160262, + "learning_rate": 1.5015547666289798e-05, + "loss": 0.2752, + "step": 4453 + }, + { + "epoch": 0.3528619528619529, + "grad_norm": 2.543361733152592, + "learning_rate": 1.50133275606849e-05, + "loss": 0.4694, + "step": 4454 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 2.6183208684242345, + "learning_rate": 1.5011107124976505e-05, + "loss": 0.2857, + "step": 4455 + }, + { + "epoch": 0.3530204000792236, + "grad_norm": 1.8982486209875373, + "learning_rate": 1.5008886359310815e-05, + "loss": 0.2949, + "step": 4456 + }, + { + "epoch": 0.35309962368785897, + "grad_norm": 1.6758397899792723, + "learning_rate": 1.5006665263834062e-05, + "loss": 0.1777, + "step": 4457 + }, + { + "epoch": 0.35317884729649435, + "grad_norm": 1.9026406711685655, + "learning_rate": 1.5004443838692492e-05, + "loss": 0.3559, + "step": 4458 + }, + { + "epoch": 0.35325807090512973, + "grad_norm": 1.6327646019744073, + "learning_rate": 1.5002222084032374e-05, + "loss": 0.2262, + "step": 4459 + }, + { + "epoch": 0.3533372945137651, + "grad_norm": 2.0556888262304147, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.2983, + "step": 4460 + }, + { + "epoch": 0.3534165181224005, + "grad_norm": 2.0223199864331547, + "learning_rate": 1.4997777586741689e-05, + "loss": 0.306, + "step": 4461 + }, + { + "epoch": 0.3534957417310359, + "grad_norm": 1.9646526168339682, + "learning_rate": 1.4995554844403767e-05, + "loss": 0.4299, + "step": 4462 + }, + { + "epoch": 0.3535749653396712, + "grad_norm": 2.0637272688595463, + "learning_rate": 1.4993331773132598e-05, + "loss": 0.3072, + "step": 4463 + }, + { + "epoch": 0.3536541889483066, + "grad_norm": 1.8858414970509891, + "learning_rate": 1.4991108373074557e-05, + "loss": 0.2434, + "step": 4464 + }, + { + "epoch": 0.35373341255694196, + "grad_norm": 1.7065698382424745, + "learning_rate": 1.4988884644376045e-05, + "loss": 0.2395, + "step": 4465 + }, + { + "epoch": 0.35381263616557734, + "grad_norm": 1.5618804363379024, + "learning_rate": 1.4986660587183485e-05, + "loss": 0.2366, + "step": 4466 + }, + { + "epoch": 0.3538918597742127, + "grad_norm": 1.6103705110655664, + "learning_rate": 1.498443620164332e-05, + "loss": 0.3091, + "step": 4467 + }, + { + "epoch": 0.3539710833828481, + "grad_norm": 2.3187816422481795, + "learning_rate": 1.4982211487902015e-05, + "loss": 0.4032, + "step": 4468 + }, + { + "epoch": 0.3540503069914835, + "grad_norm": 1.8366383171331269, + "learning_rate": 1.4979986446106054e-05, + "loss": 0.3065, + "step": 4469 + }, + { + "epoch": 0.3541295306001188, + "grad_norm": 2.730340448726131, + "learning_rate": 1.4977761076401949e-05, + "loss": 0.315, + "step": 4470 + }, + { + "epoch": 0.3542087542087542, + "grad_norm": 1.861506882875236, + "learning_rate": 1.4975535378936228e-05, + "loss": 0.2781, + "step": 4471 + }, + { + "epoch": 0.3542879778173896, + "grad_norm": 1.5370290927428905, + "learning_rate": 1.4973309353855443e-05, + "loss": 0.2145, + "step": 4472 + }, + { + "epoch": 0.35436720142602496, + "grad_norm": 1.5470037733901187, + "learning_rate": 1.497108300130617e-05, + "loss": 0.2517, + "step": 4473 + }, + { + "epoch": 0.35444642503466034, + "grad_norm": 2.1706644129758903, + "learning_rate": 1.4968856321434997e-05, + "loss": 0.3411, + "step": 4474 + }, + { + "epoch": 0.3545256486432957, + "grad_norm": 1.452155532639976, + "learning_rate": 1.4966629314388548e-05, + "loss": 0.136, + "step": 4475 + }, + { + "epoch": 0.3546048722519311, + "grad_norm": 1.8777592765536582, + "learning_rate": 1.4964401980313452e-05, + "loss": 0.3524, + "step": 4476 + }, + { + "epoch": 0.35468409586056643, + "grad_norm": 2.187966976579896, + "learning_rate": 1.4962174319356372e-05, + "loss": 0.3678, + "step": 4477 + }, + { + "epoch": 0.3547633194692018, + "grad_norm": 1.8574633617091538, + "learning_rate": 1.4959946331663995e-05, + "loss": 0.3337, + "step": 4478 + }, + { + "epoch": 0.3548425430778372, + "grad_norm": 1.8259251177057856, + "learning_rate": 1.4957718017383013e-05, + "loss": 0.2727, + "step": 4479 + }, + { + "epoch": 0.3549217666864726, + "grad_norm": 1.7524440633439209, + "learning_rate": 1.4955489376660157e-05, + "loss": 0.3102, + "step": 4480 + }, + { + "epoch": 0.35500099029510795, + "grad_norm": 2.063226226559272, + "learning_rate": 1.4953260409642172e-05, + "loss": 0.3671, + "step": 4481 + }, + { + "epoch": 0.35508021390374334, + "grad_norm": 2.1118865278626946, + "learning_rate": 1.4951031116475819e-05, + "loss": 0.2145, + "step": 4482 + }, + { + "epoch": 0.35515943751237866, + "grad_norm": 1.5880068378403644, + "learning_rate": 1.4948801497307893e-05, + "loss": 0.2357, + "step": 4483 + }, + { + "epoch": 0.35523866112101404, + "grad_norm": 1.7385274370948502, + "learning_rate": 1.4946571552285196e-05, + "loss": 0.3444, + "step": 4484 + }, + { + "epoch": 0.3553178847296494, + "grad_norm": 1.8917244831295035, + "learning_rate": 1.4944341281554566e-05, + "loss": 0.2726, + "step": 4485 + }, + { + "epoch": 0.3553971083382848, + "grad_norm": 2.1572495752829277, + "learning_rate": 1.4942110685262854e-05, + "loss": 0.3252, + "step": 4486 + }, + { + "epoch": 0.3554763319469202, + "grad_norm": 1.9851630007469503, + "learning_rate": 1.493987976355693e-05, + "loss": 0.3795, + "step": 4487 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 1.4817547742601904, + "learning_rate": 1.4937648516583696e-05, + "loss": 0.2436, + "step": 4488 + }, + { + "epoch": 0.35563477916419095, + "grad_norm": 1.9726909153526209, + "learning_rate": 1.4935416944490066e-05, + "loss": 0.4018, + "step": 4489 + }, + { + "epoch": 0.3557140027728263, + "grad_norm": 1.8912975937246552, + "learning_rate": 1.4933185047422976e-05, + "loss": 0.2751, + "step": 4490 + }, + { + "epoch": 0.35579322638146166, + "grad_norm": 2.2215615622354146, + "learning_rate": 1.493095282552939e-05, + "loss": 0.4122, + "step": 4491 + }, + { + "epoch": 0.35587244999009704, + "grad_norm": 1.756047117177278, + "learning_rate": 1.4928720278956284e-05, + "loss": 0.2804, + "step": 4492 + }, + { + "epoch": 0.3559516735987324, + "grad_norm": 1.939696498642266, + "learning_rate": 1.4926487407850667e-05, + "loss": 0.2614, + "step": 4493 + }, + { + "epoch": 0.3560308972073678, + "grad_norm": 1.6043403065342479, + "learning_rate": 1.4924254212359557e-05, + "loss": 0.2459, + "step": 4494 + }, + { + "epoch": 0.3561101208160032, + "grad_norm": 1.847794508376427, + "learning_rate": 1.492202069263e-05, + "loss": 0.1965, + "step": 4495 + }, + { + "epoch": 0.35618934442463857, + "grad_norm": 1.866015536038721, + "learning_rate": 1.4919786848809061e-05, + "loss": 0.2716, + "step": 4496 + }, + { + "epoch": 0.3562685680332739, + "grad_norm": 1.9062728149950978, + "learning_rate": 1.4917552681043837e-05, + "loss": 0.2454, + "step": 4497 + }, + { + "epoch": 0.3563477916419093, + "grad_norm": 2.076902047483076, + "learning_rate": 1.4915318189481425e-05, + "loss": 0.3539, + "step": 4498 + }, + { + "epoch": 0.35642701525054465, + "grad_norm": 2.1394462677614765, + "learning_rate": 1.4913083374268965e-05, + "loss": 0.28, + "step": 4499 + }, + { + "epoch": 0.35650623885918004, + "grad_norm": 1.9708424852547899, + "learning_rate": 1.4910848235553604e-05, + "loss": 0.3289, + "step": 4500 + }, + { + "epoch": 0.3565854624678154, + "grad_norm": 1.5964853174329938, + "learning_rate": 1.4908612773482514e-05, + "loss": 0.2311, + "step": 4501 + }, + { + "epoch": 0.3566646860764508, + "grad_norm": 1.8500186435390105, + "learning_rate": 1.4906376988202893e-05, + "loss": 0.2334, + "step": 4502 + }, + { + "epoch": 0.3567439096850862, + "grad_norm": 1.9059740606972333, + "learning_rate": 1.4904140879861957e-05, + "loss": 0.2515, + "step": 4503 + }, + { + "epoch": 0.3568231332937215, + "grad_norm": 1.8348869138817459, + "learning_rate": 1.490190444860694e-05, + "loss": 0.2107, + "step": 4504 + }, + { + "epoch": 0.3569023569023569, + "grad_norm": 1.707738179716157, + "learning_rate": 1.48996676945851e-05, + "loss": 0.2349, + "step": 4505 + }, + { + "epoch": 0.35698158051099227, + "grad_norm": 2.2254491853794054, + "learning_rate": 1.4897430617943718e-05, + "loss": 0.3567, + "step": 4506 + }, + { + "epoch": 0.35706080411962765, + "grad_norm": 1.248338011244278, + "learning_rate": 1.4895193218830098e-05, + "loss": 0.1435, + "step": 4507 + }, + { + "epoch": 0.35714002772826303, + "grad_norm": 2.243778706100023, + "learning_rate": 1.4892955497391556e-05, + "loss": 0.2788, + "step": 4508 + }, + { + "epoch": 0.3572192513368984, + "grad_norm": 2.0744338656622316, + "learning_rate": 1.4890717453775438e-05, + "loss": 0.2514, + "step": 4509 + }, + { + "epoch": 0.3572984749455338, + "grad_norm": 1.4472308399580844, + "learning_rate": 1.488847908812911e-05, + "loss": 0.1623, + "step": 4510 + }, + { + "epoch": 0.3573776985541691, + "grad_norm": 1.7644779971883402, + "learning_rate": 1.4886240400599954e-05, + "loss": 0.3246, + "step": 4511 + }, + { + "epoch": 0.3574569221628045, + "grad_norm": 1.7329541023791304, + "learning_rate": 1.488400139133538e-05, + "loss": 0.2265, + "step": 4512 + }, + { + "epoch": 0.3575361457714399, + "grad_norm": 1.7129722573748578, + "learning_rate": 1.4881762060482814e-05, + "loss": 0.2959, + "step": 4513 + }, + { + "epoch": 0.35761536938007527, + "grad_norm": 1.8951230646421218, + "learning_rate": 1.4879522408189706e-05, + "loss": 0.2604, + "step": 4514 + }, + { + "epoch": 0.35769459298871065, + "grad_norm": 1.7288392584592716, + "learning_rate": 1.4877282434603527e-05, + "loss": 0.2367, + "step": 4515 + }, + { + "epoch": 0.35777381659734603, + "grad_norm": 1.992538903721756, + "learning_rate": 1.4875042139871768e-05, + "loss": 0.3523, + "step": 4516 + }, + { + "epoch": 0.3578530402059814, + "grad_norm": 2.02740501299765, + "learning_rate": 1.487280152414194e-05, + "loss": 0.3351, + "step": 4517 + }, + { + "epoch": 0.35793226381461674, + "grad_norm": 1.8306707156593898, + "learning_rate": 1.4870560587561578e-05, + "loss": 0.2539, + "step": 4518 + }, + { + "epoch": 0.3580114874232521, + "grad_norm": 2.317142037857916, + "learning_rate": 1.4868319330278236e-05, + "loss": 0.3641, + "step": 4519 + }, + { + "epoch": 0.3580907110318875, + "grad_norm": 1.4492466031203324, + "learning_rate": 1.4866077752439495e-05, + "loss": 0.1789, + "step": 4520 + }, + { + "epoch": 0.3581699346405229, + "grad_norm": 2.0823960158601955, + "learning_rate": 1.4863835854192945e-05, + "loss": 0.2656, + "step": 4521 + }, + { + "epoch": 0.35824915824915826, + "grad_norm": 1.9778597739014039, + "learning_rate": 1.4861593635686207e-05, + "loss": 0.3041, + "step": 4522 + }, + { + "epoch": 0.35832838185779364, + "grad_norm": 1.4878587377277102, + "learning_rate": 1.485935109706692e-05, + "loss": 0.2841, + "step": 4523 + }, + { + "epoch": 0.35840760546642897, + "grad_norm": 1.898570776013274, + "learning_rate": 1.4857108238482747e-05, + "loss": 0.2735, + "step": 4524 + }, + { + "epoch": 0.35848682907506435, + "grad_norm": 1.9923333177254, + "learning_rate": 1.4854865060081367e-05, + "loss": 0.3702, + "step": 4525 + }, + { + "epoch": 0.35856605268369973, + "grad_norm": 1.9271745699492897, + "learning_rate": 1.4852621562010484e-05, + "loss": 0.3176, + "step": 4526 + }, + { + "epoch": 0.3586452762923351, + "grad_norm": 1.6935219452465744, + "learning_rate": 1.4850377744417816e-05, + "loss": 0.2658, + "step": 4527 + }, + { + "epoch": 0.3587244999009705, + "grad_norm": 1.6334071754581398, + "learning_rate": 1.4848133607451116e-05, + "loss": 0.2764, + "step": 4528 + }, + { + "epoch": 0.3588037235096059, + "grad_norm": 1.770940147150993, + "learning_rate": 1.4845889151258144e-05, + "loss": 0.2752, + "step": 4529 + }, + { + "epoch": 0.35888294711824126, + "grad_norm": 1.7400020480243055, + "learning_rate": 1.484364437598669e-05, + "loss": 0.265, + "step": 4530 + }, + { + "epoch": 0.3589621707268766, + "grad_norm": 1.8936786294272283, + "learning_rate": 1.4841399281784558e-05, + "loss": 0.2692, + "step": 4531 + }, + { + "epoch": 0.35904139433551197, + "grad_norm": 1.8265624261751408, + "learning_rate": 1.4839153868799583e-05, + "loss": 0.2403, + "step": 4532 + }, + { + "epoch": 0.35912061794414735, + "grad_norm": 1.8339537583849919, + "learning_rate": 1.4836908137179607e-05, + "loss": 0.2834, + "step": 4533 + }, + { + "epoch": 0.35919984155278273, + "grad_norm": 2.1927511822249226, + "learning_rate": 1.4834662087072502e-05, + "loss": 0.2485, + "step": 4534 + }, + { + "epoch": 0.3592790651614181, + "grad_norm": 2.6198828901448343, + "learning_rate": 1.4832415718626166e-05, + "loss": 0.2249, + "step": 4535 + }, + { + "epoch": 0.3593582887700535, + "grad_norm": 1.8791975996335242, + "learning_rate": 1.4830169031988502e-05, + "loss": 0.3104, + "step": 4536 + }, + { + "epoch": 0.3594375123786889, + "grad_norm": 1.7408388895555706, + "learning_rate": 1.482792202730745e-05, + "loss": 0.2941, + "step": 4537 + }, + { + "epoch": 0.3595167359873242, + "grad_norm": 2.131712923392794, + "learning_rate": 1.4825674704730966e-05, + "loss": 0.3036, + "step": 4538 + }, + { + "epoch": 0.3595959595959596, + "grad_norm": 2.2786987947911403, + "learning_rate": 1.4823427064407018e-05, + "loss": 0.2108, + "step": 4539 + }, + { + "epoch": 0.35967518320459496, + "grad_norm": 1.5969629787024902, + "learning_rate": 1.4821179106483609e-05, + "loss": 0.2157, + "step": 4540 + }, + { + "epoch": 0.35975440681323034, + "grad_norm": 1.8511205429928075, + "learning_rate": 1.4818930831108755e-05, + "loss": 0.2415, + "step": 4541 + }, + { + "epoch": 0.3598336304218657, + "grad_norm": 2.1315218058671355, + "learning_rate": 1.481668223843049e-05, + "loss": 0.3733, + "step": 4542 + }, + { + "epoch": 0.3599128540305011, + "grad_norm": 1.847512443968618, + "learning_rate": 1.481443332859688e-05, + "loss": 0.2486, + "step": 4543 + }, + { + "epoch": 0.3599920776391365, + "grad_norm": 1.9910027128388537, + "learning_rate": 1.4812184101755997e-05, + "loss": 0.3474, + "step": 4544 + }, + { + "epoch": 0.3600713012477718, + "grad_norm": 3.2582475850998707, + "learning_rate": 1.480993455805595e-05, + "loss": 0.3684, + "step": 4545 + }, + { + "epoch": 0.3601505248564072, + "grad_norm": 1.785677876391657, + "learning_rate": 1.480768469764485e-05, + "loss": 0.3508, + "step": 4546 + }, + { + "epoch": 0.3602297484650426, + "grad_norm": 2.1111082556414735, + "learning_rate": 1.480543452067085e-05, + "loss": 0.4027, + "step": 4547 + }, + { + "epoch": 0.36030897207367796, + "grad_norm": 1.6313097464088209, + "learning_rate": 1.480318402728211e-05, + "loss": 0.2332, + "step": 4548 + }, + { + "epoch": 0.36038819568231334, + "grad_norm": 1.8559683155422102, + "learning_rate": 1.480093321762681e-05, + "loss": 0.35, + "step": 4549 + }, + { + "epoch": 0.3604674192909487, + "grad_norm": 1.7853318963964082, + "learning_rate": 1.4798682091853161e-05, + "loss": 0.1915, + "step": 4550 + }, + { + "epoch": 0.3605466428995841, + "grad_norm": 2.071621887291487, + "learning_rate": 1.4796430650109383e-05, + "loss": 0.339, + "step": 4551 + }, + { + "epoch": 0.36062586650821943, + "grad_norm": 1.7452459123553987, + "learning_rate": 1.4794178892543727e-05, + "loss": 0.2623, + "step": 4552 + }, + { + "epoch": 0.3607050901168548, + "grad_norm": 1.9857757949360042, + "learning_rate": 1.4791926819304462e-05, + "loss": 0.2993, + "step": 4553 + }, + { + "epoch": 0.3607843137254902, + "grad_norm": 1.9043763234461848, + "learning_rate": 1.4789674430539868e-05, + "loss": 0.2841, + "step": 4554 + }, + { + "epoch": 0.3608635373341256, + "grad_norm": 1.9947000003906217, + "learning_rate": 1.4787421726398263e-05, + "loss": 0.3286, + "step": 4555 + }, + { + "epoch": 0.36094276094276095, + "grad_norm": 2.3353144966346284, + "learning_rate": 1.4785168707027972e-05, + "loss": 0.4169, + "step": 4556 + }, + { + "epoch": 0.36102198455139634, + "grad_norm": 2.1079237478369803, + "learning_rate": 1.4782915372577347e-05, + "loss": 0.2736, + "step": 4557 + }, + { + "epoch": 0.3611012081600317, + "grad_norm": 1.8403067529174804, + "learning_rate": 1.4780661723194757e-05, + "loss": 0.2704, + "step": 4558 + }, + { + "epoch": 0.36118043176866704, + "grad_norm": 1.9144111954723666, + "learning_rate": 1.4778407759028599e-05, + "loss": 0.2462, + "step": 4559 + }, + { + "epoch": 0.3612596553773024, + "grad_norm": 2.0896006232969815, + "learning_rate": 1.4776153480227278e-05, + "loss": 0.2951, + "step": 4560 + }, + { + "epoch": 0.3613388789859378, + "grad_norm": 2.175656361755795, + "learning_rate": 1.4773898886939235e-05, + "loss": 0.2725, + "step": 4561 + }, + { + "epoch": 0.3614181025945732, + "grad_norm": 2.1968586535941044, + "learning_rate": 1.4771643979312917e-05, + "loss": 0.329, + "step": 4562 + }, + { + "epoch": 0.36149732620320857, + "grad_norm": 1.893851266459502, + "learning_rate": 1.4769388757496806e-05, + "loss": 0.1702, + "step": 4563 + }, + { + "epoch": 0.36157654981184395, + "grad_norm": 1.8709173094430427, + "learning_rate": 1.4767133221639394e-05, + "loss": 0.325, + "step": 4564 + }, + { + "epoch": 0.3616557734204793, + "grad_norm": 2.0964495692159018, + "learning_rate": 1.4764877371889194e-05, + "loss": 0.3104, + "step": 4565 + }, + { + "epoch": 0.36173499702911466, + "grad_norm": 1.5181118840211347, + "learning_rate": 1.476262120839475e-05, + "loss": 0.2408, + "step": 4566 + }, + { + "epoch": 0.36181422063775004, + "grad_norm": 1.9308876705992082, + "learning_rate": 1.4760364731304614e-05, + "loss": 0.2622, + "step": 4567 + }, + { + "epoch": 0.3618934442463854, + "grad_norm": 1.6309956987385346, + "learning_rate": 1.4758107940767368e-05, + "loss": 0.2814, + "step": 4568 + }, + { + "epoch": 0.3619726678550208, + "grad_norm": 1.8227012989468465, + "learning_rate": 1.4755850836931607e-05, + "loss": 0.283, + "step": 4569 + }, + { + "epoch": 0.3620518914636562, + "grad_norm": 1.8517746035684717, + "learning_rate": 1.475359341994595e-05, + "loss": 0.345, + "step": 4570 + }, + { + "epoch": 0.36213111507229157, + "grad_norm": 1.829998598390889, + "learning_rate": 1.4751335689959044e-05, + "loss": 0.2462, + "step": 4571 + }, + { + "epoch": 0.3622103386809269, + "grad_norm": 2.0421939088794057, + "learning_rate": 1.4749077647119542e-05, + "loss": 0.2188, + "step": 4572 + }, + { + "epoch": 0.3622895622895623, + "grad_norm": 2.0200479575612413, + "learning_rate": 1.474681929157613e-05, + "loss": 0.3439, + "step": 4573 + }, + { + "epoch": 0.36236878589819765, + "grad_norm": 1.8543730087865458, + "learning_rate": 1.4744560623477502e-05, + "loss": 0.2511, + "step": 4574 + }, + { + "epoch": 0.36244800950683304, + "grad_norm": 1.9421722383662212, + "learning_rate": 1.4742301642972392e-05, + "loss": 0.3163, + "step": 4575 + }, + { + "epoch": 0.3625272331154684, + "grad_norm": 2.0974607886007433, + "learning_rate": 1.4740042350209536e-05, + "loss": 0.2961, + "step": 4576 + }, + { + "epoch": 0.3626064567241038, + "grad_norm": 2.1036012226703718, + "learning_rate": 1.4737782745337696e-05, + "loss": 0.3351, + "step": 4577 + }, + { + "epoch": 0.3626856803327392, + "grad_norm": 2.3285039526764435, + "learning_rate": 1.4735522828505663e-05, + "loss": 0.3722, + "step": 4578 + }, + { + "epoch": 0.3627649039413745, + "grad_norm": 1.851927494391605, + "learning_rate": 1.4733262599862234e-05, + "loss": 0.2979, + "step": 4579 + }, + { + "epoch": 0.3628441275500099, + "grad_norm": 1.6369910505642769, + "learning_rate": 1.4731002059556242e-05, + "loss": 0.2057, + "step": 4580 + }, + { + "epoch": 0.36292335115864527, + "grad_norm": 1.9433350948670887, + "learning_rate": 1.4728741207736525e-05, + "loss": 0.3834, + "step": 4581 + }, + { + "epoch": 0.36300257476728065, + "grad_norm": 1.6151885165582778, + "learning_rate": 1.4726480044551953e-05, + "loss": 0.2589, + "step": 4582 + }, + { + "epoch": 0.36308179837591603, + "grad_norm": 2.027856639065916, + "learning_rate": 1.4724218570151415e-05, + "loss": 0.3408, + "step": 4583 + }, + { + "epoch": 0.3631610219845514, + "grad_norm": 1.9773296362166783, + "learning_rate": 1.4721956784683813e-05, + "loss": 0.4183, + "step": 4584 + }, + { + "epoch": 0.3632402455931868, + "grad_norm": 1.9726032512494887, + "learning_rate": 1.4719694688298078e-05, + "loss": 0.3214, + "step": 4585 + }, + { + "epoch": 0.3633194692018221, + "grad_norm": 2.0096875940382386, + "learning_rate": 1.4717432281143161e-05, + "loss": 0.2344, + "step": 4586 + }, + { + "epoch": 0.3633986928104575, + "grad_norm": 1.8675973209031467, + "learning_rate": 1.4715169563368021e-05, + "loss": 0.3268, + "step": 4587 + }, + { + "epoch": 0.3634779164190929, + "grad_norm": 1.8619405637117874, + "learning_rate": 1.4712906535121658e-05, + "loss": 0.1834, + "step": 4588 + }, + { + "epoch": 0.36355714002772826, + "grad_norm": 1.9622734340502865, + "learning_rate": 1.4710643196553074e-05, + "loss": 0.3811, + "step": 4589 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.2464322813911541, + "learning_rate": 1.4708379547811302e-05, + "loss": 0.1781, + "step": 4590 + }, + { + "epoch": 0.36371558724499903, + "grad_norm": 1.97120190389002, + "learning_rate": 1.4706115589045396e-05, + "loss": 0.2922, + "step": 4591 + }, + { + "epoch": 0.3637948108536344, + "grad_norm": 1.8130444421048555, + "learning_rate": 1.4703851320404416e-05, + "loss": 0.3521, + "step": 4592 + }, + { + "epoch": 0.36387403446226974, + "grad_norm": 1.7903272120508689, + "learning_rate": 1.4701586742037464e-05, + "loss": 0.2122, + "step": 4593 + }, + { + "epoch": 0.3639532580709051, + "grad_norm": 2.0312416747431756, + "learning_rate": 1.4699321854093649e-05, + "loss": 0.3273, + "step": 4594 + }, + { + "epoch": 0.3640324816795405, + "grad_norm": 1.8251285075242487, + "learning_rate": 1.46970566567221e-05, + "loss": 0.278, + "step": 4595 + }, + { + "epoch": 0.3641117052881759, + "grad_norm": 1.5354544779957986, + "learning_rate": 1.469479115007197e-05, + "loss": 0.3274, + "step": 4596 + }, + { + "epoch": 0.36419092889681126, + "grad_norm": 2.280352908558222, + "learning_rate": 1.4692525334292434e-05, + "loss": 0.1861, + "step": 4597 + }, + { + "epoch": 0.36427015250544664, + "grad_norm": 1.7668855525305438, + "learning_rate": 1.4690259209532682e-05, + "loss": 0.2326, + "step": 4598 + }, + { + "epoch": 0.364349376114082, + "grad_norm": 2.2111396426990777, + "learning_rate": 1.468799277594193e-05, + "loss": 0.3252, + "step": 4599 + }, + { + "epoch": 0.36442859972271735, + "grad_norm": 1.8981381327900542, + "learning_rate": 1.4685726033669412e-05, + "loss": 0.3434, + "step": 4600 + }, + { + "epoch": 0.36450782333135273, + "grad_norm": 1.943164954252058, + "learning_rate": 1.468345898286438e-05, + "loss": 0.2665, + "step": 4601 + }, + { + "epoch": 0.3645870469399881, + "grad_norm": 1.5919574835797072, + "learning_rate": 1.468119162367611e-05, + "loss": 0.2047, + "step": 4602 + }, + { + "epoch": 0.3646662705486235, + "grad_norm": 1.7437721486295135, + "learning_rate": 1.4678923956253894e-05, + "loss": 0.3533, + "step": 4603 + }, + { + "epoch": 0.3647454941572589, + "grad_norm": 2.0121970439790986, + "learning_rate": 1.4676655980747052e-05, + "loss": 0.3026, + "step": 4604 + }, + { + "epoch": 0.36482471776589426, + "grad_norm": 1.4807536834732176, + "learning_rate": 1.4674387697304914e-05, + "loss": 0.2085, + "step": 4605 + }, + { + "epoch": 0.3649039413745296, + "grad_norm": 1.6945669425668628, + "learning_rate": 1.4672119106076838e-05, + "loss": 0.419, + "step": 4606 + }, + { + "epoch": 0.36498316498316496, + "grad_norm": 2.556056779793669, + "learning_rate": 1.4669850207212202e-05, + "loss": 0.3179, + "step": 4607 + }, + { + "epoch": 0.36506238859180035, + "grad_norm": 1.8481477313822434, + "learning_rate": 1.4667581000860395e-05, + "loss": 0.2396, + "step": 4608 + }, + { + "epoch": 0.36514161220043573, + "grad_norm": 2.8868924312330466, + "learning_rate": 1.4665311487170844e-05, + "loss": 0.5238, + "step": 4609 + }, + { + "epoch": 0.3652208358090711, + "grad_norm": 2.107044608719778, + "learning_rate": 1.4663041666292978e-05, + "loss": 0.3859, + "step": 4610 + }, + { + "epoch": 0.3653000594177065, + "grad_norm": 1.9488982503648078, + "learning_rate": 1.4660771538376253e-05, + "loss": 0.3047, + "step": 4611 + }, + { + "epoch": 0.3653792830263419, + "grad_norm": 1.5307612171191352, + "learning_rate": 1.4658501103570149e-05, + "loss": 0.2338, + "step": 4612 + }, + { + "epoch": 0.3654585066349772, + "grad_norm": 1.5640540786814416, + "learning_rate": 1.4656230362024166e-05, + "loss": 0.2163, + "step": 4613 + }, + { + "epoch": 0.3655377302436126, + "grad_norm": 1.6627371105261337, + "learning_rate": 1.4653959313887813e-05, + "loss": 0.307, + "step": 4614 + }, + { + "epoch": 0.36561695385224796, + "grad_norm": 1.918676718822198, + "learning_rate": 1.4651687959310636e-05, + "loss": 0.2164, + "step": 4615 + }, + { + "epoch": 0.36569617746088334, + "grad_norm": 2.0657209258516387, + "learning_rate": 1.4649416298442187e-05, + "loss": 0.3128, + "step": 4616 + }, + { + "epoch": 0.3657754010695187, + "grad_norm": 1.8261691355512197, + "learning_rate": 1.4647144331432049e-05, + "loss": 0.3541, + "step": 4617 + }, + { + "epoch": 0.3658546246781541, + "grad_norm": 1.83896566157039, + "learning_rate": 1.4644872058429816e-05, + "loss": 0.2391, + "step": 4618 + }, + { + "epoch": 0.3659338482867895, + "grad_norm": 2.0076747941817863, + "learning_rate": 1.4642599479585106e-05, + "loss": 0.3385, + "step": 4619 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 2.3144193022940023, + "learning_rate": 1.4640326595047561e-05, + "loss": 0.3623, + "step": 4620 + }, + { + "epoch": 0.3660922955040602, + "grad_norm": 2.0255817411378847, + "learning_rate": 1.4638053404966836e-05, + "loss": 0.3224, + "step": 4621 + }, + { + "epoch": 0.3661715191126956, + "grad_norm": 1.748214080107897, + "learning_rate": 1.4635779909492614e-05, + "loss": 0.2633, + "step": 4622 + }, + { + "epoch": 0.36625074272133096, + "grad_norm": 1.762716927418055, + "learning_rate": 1.4633506108774588e-05, + "loss": 0.2537, + "step": 4623 + }, + { + "epoch": 0.36632996632996634, + "grad_norm": 2.134580736957998, + "learning_rate": 1.4631232002962481e-05, + "loss": 0.2609, + "step": 4624 + }, + { + "epoch": 0.3664091899386017, + "grad_norm": 1.4654683207780617, + "learning_rate": 1.462895759220603e-05, + "loss": 0.2321, + "step": 4625 + }, + { + "epoch": 0.3664884135472371, + "grad_norm": 1.8979989375459447, + "learning_rate": 1.4626682876654998e-05, + "loss": 0.2743, + "step": 4626 + }, + { + "epoch": 0.36656763715587243, + "grad_norm": 1.936896031552917, + "learning_rate": 1.4624407856459154e-05, + "loss": 0.3495, + "step": 4627 + }, + { + "epoch": 0.3666468607645078, + "grad_norm": 2.05955466442178, + "learning_rate": 1.4622132531768309e-05, + "loss": 0.3189, + "step": 4628 + }, + { + "epoch": 0.3667260843731432, + "grad_norm": 1.8753091597874505, + "learning_rate": 1.4619856902732279e-05, + "loss": 0.2919, + "step": 4629 + }, + { + "epoch": 0.36680530798177857, + "grad_norm": 1.892511585392787, + "learning_rate": 1.4617580969500895e-05, + "loss": 0.2447, + "step": 4630 + }, + { + "epoch": 0.36688453159041395, + "grad_norm": 1.8837761972181513, + "learning_rate": 1.461530473222403e-05, + "loss": 0.2432, + "step": 4631 + }, + { + "epoch": 0.36696375519904934, + "grad_norm": 1.4786091567383612, + "learning_rate": 1.4613028191051548e-05, + "loss": 0.234, + "step": 4632 + }, + { + "epoch": 0.3670429788076847, + "grad_norm": 2.2305164677214977, + "learning_rate": 1.4610751346133361e-05, + "loss": 0.3248, + "step": 4633 + }, + { + "epoch": 0.36712220241632004, + "grad_norm": 1.9363515121712283, + "learning_rate": 1.4608474197619383e-05, + "loss": 0.314, + "step": 4634 + }, + { + "epoch": 0.3672014260249554, + "grad_norm": 1.654846397734255, + "learning_rate": 1.4606196745659551e-05, + "loss": 0.2172, + "step": 4635 + }, + { + "epoch": 0.3672806496335908, + "grad_norm": 1.471941295936231, + "learning_rate": 1.460391899040383e-05, + "loss": 0.2592, + "step": 4636 + }, + { + "epoch": 0.3673598732422262, + "grad_norm": 1.9640710056443629, + "learning_rate": 1.4601640932002194e-05, + "loss": 0.2894, + "step": 4637 + }, + { + "epoch": 0.36743909685086157, + "grad_norm": 1.4775331718198763, + "learning_rate": 1.4599362570604645e-05, + "loss": 0.2402, + "step": 4638 + }, + { + "epoch": 0.36751832045949695, + "grad_norm": 1.7897415267168406, + "learning_rate": 1.4597083906361203e-05, + "loss": 0.3734, + "step": 4639 + }, + { + "epoch": 0.3675975440681323, + "grad_norm": 2.074932247402187, + "learning_rate": 1.4594804939421903e-05, + "loss": 0.3158, + "step": 4640 + }, + { + "epoch": 0.36767676767676766, + "grad_norm": 1.737471564022473, + "learning_rate": 1.4592525669936808e-05, + "loss": 0.2409, + "step": 4641 + }, + { + "epoch": 0.36775599128540304, + "grad_norm": 2.195945092172361, + "learning_rate": 1.4590246098055995e-05, + "loss": 0.4206, + "step": 4642 + }, + { + "epoch": 0.3678352148940384, + "grad_norm": 1.7421794391496206, + "learning_rate": 1.4587966223929562e-05, + "loss": 0.2932, + "step": 4643 + }, + { + "epoch": 0.3679144385026738, + "grad_norm": 1.6049634386404827, + "learning_rate": 1.458568604770763e-05, + "loss": 0.2902, + "step": 4644 + }, + { + "epoch": 0.3679936621113092, + "grad_norm": 1.5299809026000064, + "learning_rate": 1.458340556954034e-05, + "loss": 0.279, + "step": 4645 + }, + { + "epoch": 0.36807288571994456, + "grad_norm": 1.8258500233407735, + "learning_rate": 1.4581124789577841e-05, + "loss": 0.287, + "step": 4646 + }, + { + "epoch": 0.3681521093285799, + "grad_norm": 1.6579619781481048, + "learning_rate": 1.4578843707970323e-05, + "loss": 0.297, + "step": 4647 + }, + { + "epoch": 0.36823133293721527, + "grad_norm": 2.4866456878273557, + "learning_rate": 1.4576562324867975e-05, + "loss": 0.2564, + "step": 4648 + }, + { + "epoch": 0.36831055654585065, + "grad_norm": 1.7970297668660742, + "learning_rate": 1.457428064042102e-05, + "loss": 0.335, + "step": 4649 + }, + { + "epoch": 0.36838978015448604, + "grad_norm": 1.4325466255727297, + "learning_rate": 1.45719986547797e-05, + "loss": 0.1693, + "step": 4650 + }, + { + "epoch": 0.3684690037631214, + "grad_norm": 1.705408056581669, + "learning_rate": 1.4569716368094262e-05, + "loss": 0.2615, + "step": 4651 + }, + { + "epoch": 0.3685482273717568, + "grad_norm": 1.8508482085674984, + "learning_rate": 1.456743378051499e-05, + "loss": 0.2815, + "step": 4652 + }, + { + "epoch": 0.3686274509803922, + "grad_norm": 1.8742340972489975, + "learning_rate": 1.456515089219218e-05, + "loss": 0.2094, + "step": 4653 + }, + { + "epoch": 0.3687066745890275, + "grad_norm": 2.2385941481780076, + "learning_rate": 1.456286770327615e-05, + "loss": 0.417, + "step": 4654 + }, + { + "epoch": 0.3687858981976629, + "grad_norm": 1.4993760016931719, + "learning_rate": 1.456058421391724e-05, + "loss": 0.1901, + "step": 4655 + }, + { + "epoch": 0.36886512180629827, + "grad_norm": 1.8644129594122525, + "learning_rate": 1.45583004242658e-05, + "loss": 0.3158, + "step": 4656 + }, + { + "epoch": 0.36894434541493365, + "grad_norm": 1.9542568327216117, + "learning_rate": 1.4556016334472211e-05, + "loss": 0.2579, + "step": 4657 + }, + { + "epoch": 0.36902356902356903, + "grad_norm": 1.7588025672766292, + "learning_rate": 1.455373194468687e-05, + "loss": 0.2339, + "step": 4658 + }, + { + "epoch": 0.3691027926322044, + "grad_norm": 1.7140319811474112, + "learning_rate": 1.4551447255060192e-05, + "loss": 0.2637, + "step": 4659 + }, + { + "epoch": 0.3691820162408398, + "grad_norm": 1.3164749056508211, + "learning_rate": 1.4549162265742608e-05, + "loss": 0.2048, + "step": 4660 + }, + { + "epoch": 0.3692612398494751, + "grad_norm": 1.8639998795903583, + "learning_rate": 1.4546876976884583e-05, + "loss": 0.2791, + "step": 4661 + }, + { + "epoch": 0.3693404634581105, + "grad_norm": 1.9381892678910597, + "learning_rate": 1.4544591388636584e-05, + "loss": 0.255, + "step": 4662 + }, + { + "epoch": 0.3694196870667459, + "grad_norm": 1.3953220826390549, + "learning_rate": 1.454230550114911e-05, + "loss": 0.1501, + "step": 4663 + }, + { + "epoch": 0.36949891067538126, + "grad_norm": 2.0610398191583843, + "learning_rate": 1.4540019314572678e-05, + "loss": 0.2809, + "step": 4664 + }, + { + "epoch": 0.36957813428401665, + "grad_norm": 1.7742172622260766, + "learning_rate": 1.4537732829057816e-05, + "loss": 0.3152, + "step": 4665 + }, + { + "epoch": 0.369657357892652, + "grad_norm": 2.125656945669275, + "learning_rate": 1.4535446044755082e-05, + "loss": 0.3574, + "step": 4666 + }, + { + "epoch": 0.3697365815012874, + "grad_norm": 2.198456694493325, + "learning_rate": 1.4533158961815048e-05, + "loss": 0.3725, + "step": 4667 + }, + { + "epoch": 0.36981580510992274, + "grad_norm": 1.4773839870182743, + "learning_rate": 1.4530871580388311e-05, + "loss": 0.1353, + "step": 4668 + }, + { + "epoch": 0.3698950287185581, + "grad_norm": 1.6611877169714782, + "learning_rate": 1.4528583900625481e-05, + "loss": 0.2634, + "step": 4669 + }, + { + "epoch": 0.3699742523271935, + "grad_norm": 1.7018209933075727, + "learning_rate": 1.4526295922677189e-05, + "loss": 0.3748, + "step": 4670 + }, + { + "epoch": 0.3700534759358289, + "grad_norm": 2.4185178534746403, + "learning_rate": 1.4524007646694091e-05, + "loss": 0.3315, + "step": 4671 + }, + { + "epoch": 0.37013269954446426, + "grad_norm": 1.7903678767947944, + "learning_rate": 1.4521719072826858e-05, + "loss": 0.1874, + "step": 4672 + }, + { + "epoch": 0.37021192315309964, + "grad_norm": 1.4288781545608105, + "learning_rate": 1.451943020122618e-05, + "loss": 0.1809, + "step": 4673 + }, + { + "epoch": 0.370291146761735, + "grad_norm": 1.6422288313024853, + "learning_rate": 1.4517141032042773e-05, + "loss": 0.2245, + "step": 4674 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 1.8735284688445157, + "learning_rate": 1.4514851565427362e-05, + "loss": 0.2704, + "step": 4675 + }, + { + "epoch": 0.37044959397900573, + "grad_norm": 2.2479902989191114, + "learning_rate": 1.4512561801530699e-05, + "loss": 0.2887, + "step": 4676 + }, + { + "epoch": 0.3705288175876411, + "grad_norm": 2.3373645804027436, + "learning_rate": 1.4510271740503555e-05, + "loss": 0.3139, + "step": 4677 + }, + { + "epoch": 0.3706080411962765, + "grad_norm": 1.5650395432174686, + "learning_rate": 1.4507981382496716e-05, + "loss": 0.2086, + "step": 4678 + }, + { + "epoch": 0.3706872648049119, + "grad_norm": 2.3625039464800826, + "learning_rate": 1.4505690727660997e-05, + "loss": 0.3312, + "step": 4679 + }, + { + "epoch": 0.37076648841354726, + "grad_norm": 1.8036702932799606, + "learning_rate": 1.4503399776147223e-05, + "loss": 0.265, + "step": 4680 + }, + { + "epoch": 0.3708457120221826, + "grad_norm": 1.9720782826214083, + "learning_rate": 1.4501108528106243e-05, + "loss": 0.3586, + "step": 4681 + }, + { + "epoch": 0.37092493563081796, + "grad_norm": 1.8281158547092875, + "learning_rate": 1.4498816983688926e-05, + "loss": 0.2726, + "step": 4682 + }, + { + "epoch": 0.37100415923945335, + "grad_norm": 2.111812174999127, + "learning_rate": 1.4496525143046154e-05, + "loss": 0.3488, + "step": 4683 + }, + { + "epoch": 0.3710833828480887, + "grad_norm": 1.8729364466817486, + "learning_rate": 1.4494233006328837e-05, + "loss": 0.286, + "step": 4684 + }, + { + "epoch": 0.3711626064567241, + "grad_norm": 1.7379818319196876, + "learning_rate": 1.4491940573687906e-05, + "loss": 0.2436, + "step": 4685 + }, + { + "epoch": 0.3712418300653595, + "grad_norm": 1.6843416581137713, + "learning_rate": 1.44896478452743e-05, + "loss": 0.2432, + "step": 4686 + }, + { + "epoch": 0.37132105367399487, + "grad_norm": 2.0286483827739548, + "learning_rate": 1.4487354821238983e-05, + "loss": 0.2842, + "step": 4687 + }, + { + "epoch": 0.3714002772826302, + "grad_norm": 1.911045750236119, + "learning_rate": 1.4485061501732949e-05, + "loss": 0.2678, + "step": 4688 + }, + { + "epoch": 0.3714795008912656, + "grad_norm": 1.7492314580876391, + "learning_rate": 1.448276788690719e-05, + "loss": 0.2592, + "step": 4689 + }, + { + "epoch": 0.37155872449990096, + "grad_norm": 1.7136385436125254, + "learning_rate": 1.4480473976912737e-05, + "loss": 0.2438, + "step": 4690 + }, + { + "epoch": 0.37163794810853634, + "grad_norm": 1.7401190917926839, + "learning_rate": 1.4478179771900634e-05, + "loss": 0.2423, + "step": 4691 + }, + { + "epoch": 0.3717171717171717, + "grad_norm": 1.8271774715508877, + "learning_rate": 1.4475885272021936e-05, + "loss": 0.346, + "step": 4692 + }, + { + "epoch": 0.3717963953258071, + "grad_norm": 1.781601678505233, + "learning_rate": 1.4473590477427735e-05, + "loss": 0.2917, + "step": 4693 + }, + { + "epoch": 0.3718756189344425, + "grad_norm": 1.9430399152505764, + "learning_rate": 1.4471295388269121e-05, + "loss": 0.3292, + "step": 4694 + }, + { + "epoch": 0.3719548425430778, + "grad_norm": 1.9534291727129052, + "learning_rate": 1.4469000004697224e-05, + "loss": 0.2956, + "step": 4695 + }, + { + "epoch": 0.3720340661517132, + "grad_norm": 2.183933025080302, + "learning_rate": 1.446670432686318e-05, + "loss": 0.3236, + "step": 4696 + }, + { + "epoch": 0.3721132897603486, + "grad_norm": 2.1682657162645893, + "learning_rate": 1.4464408354918145e-05, + "loss": 0.2877, + "step": 4697 + }, + { + "epoch": 0.37219251336898396, + "grad_norm": 1.4817786600749772, + "learning_rate": 1.4462112089013304e-05, + "loss": 0.2326, + "step": 4698 + }, + { + "epoch": 0.37227173697761934, + "grad_norm": 1.8533209906568295, + "learning_rate": 1.4459815529299851e-05, + "loss": 0.2531, + "step": 4699 + }, + { + "epoch": 0.3723509605862547, + "grad_norm": 1.8799966813142532, + "learning_rate": 1.4457518675929008e-05, + "loss": 0.2968, + "step": 4700 + }, + { + "epoch": 0.3724301841948901, + "grad_norm": 1.8644717360882417, + "learning_rate": 1.4455221529052006e-05, + "loss": 0.2205, + "step": 4701 + }, + { + "epoch": 0.3725094078035254, + "grad_norm": 2.0868630337187097, + "learning_rate": 1.4452924088820101e-05, + "loss": 0.3263, + "step": 4702 + }, + { + "epoch": 0.3725886314121608, + "grad_norm": 2.0433028399364175, + "learning_rate": 1.4450626355384573e-05, + "loss": 0.2889, + "step": 4703 + }, + { + "epoch": 0.3726678550207962, + "grad_norm": 1.7147792563781357, + "learning_rate": 1.4448328328896717e-05, + "loss": 0.2512, + "step": 4704 + }, + { + "epoch": 0.37274707862943157, + "grad_norm": 2.100717531941935, + "learning_rate": 1.444603000950784e-05, + "loss": 0.2273, + "step": 4705 + }, + { + "epoch": 0.37282630223806695, + "grad_norm": 1.9873458870634582, + "learning_rate": 1.4443731397369283e-05, + "loss": 0.2479, + "step": 4706 + }, + { + "epoch": 0.37290552584670233, + "grad_norm": 2.0897216126976477, + "learning_rate": 1.4441432492632395e-05, + "loss": 0.3453, + "step": 4707 + }, + { + "epoch": 0.3729847494553377, + "grad_norm": 1.85767304640378, + "learning_rate": 1.4439133295448547e-05, + "loss": 0.2637, + "step": 4708 + }, + { + "epoch": 0.37306397306397304, + "grad_norm": 1.8669765375254737, + "learning_rate": 1.4436833805969133e-05, + "loss": 0.2248, + "step": 4709 + }, + { + "epoch": 0.3731431966726084, + "grad_norm": 1.5468480808168354, + "learning_rate": 1.4434534024345558e-05, + "loss": 0.2213, + "step": 4710 + }, + { + "epoch": 0.3732224202812438, + "grad_norm": 2.1651278677154173, + "learning_rate": 1.4432233950729257e-05, + "loss": 0.2346, + "step": 4711 + }, + { + "epoch": 0.3733016438898792, + "grad_norm": 1.8622206801954768, + "learning_rate": 1.442993358527168e-05, + "loss": 0.2661, + "step": 4712 + }, + { + "epoch": 0.37338086749851457, + "grad_norm": 1.860985314496505, + "learning_rate": 1.4427632928124288e-05, + "loss": 0.2264, + "step": 4713 + }, + { + "epoch": 0.37346009110714995, + "grad_norm": 1.738303261273715, + "learning_rate": 1.4425331979438573e-05, + "loss": 0.2249, + "step": 4714 + }, + { + "epoch": 0.37353931471578533, + "grad_norm": 1.3493321298988545, + "learning_rate": 1.4423030739366042e-05, + "loss": 0.1953, + "step": 4715 + }, + { + "epoch": 0.37361853832442066, + "grad_norm": 1.757152477188081, + "learning_rate": 1.4420729208058217e-05, + "loss": 0.2797, + "step": 4716 + }, + { + "epoch": 0.37369776193305604, + "grad_norm": 2.4867732454480835, + "learning_rate": 1.4418427385666647e-05, + "loss": 0.3569, + "step": 4717 + }, + { + "epoch": 0.3737769855416914, + "grad_norm": 1.7485682772478601, + "learning_rate": 1.4416125272342891e-05, + "loss": 0.3015, + "step": 4718 + }, + { + "epoch": 0.3738562091503268, + "grad_norm": 1.6998442982286588, + "learning_rate": 1.4413822868238537e-05, + "loss": 0.2688, + "step": 4719 + }, + { + "epoch": 0.3739354327589622, + "grad_norm": 2.422891857454065, + "learning_rate": 1.4411520173505184e-05, + "loss": 0.2982, + "step": 4720 + }, + { + "epoch": 0.37401465636759756, + "grad_norm": 2.020050060082948, + "learning_rate": 1.4409217188294456e-05, + "loss": 0.2554, + "step": 4721 + }, + { + "epoch": 0.3740938799762329, + "grad_norm": 1.8270816579010787, + "learning_rate": 1.440691391275799e-05, + "loss": 0.2659, + "step": 4722 + }, + { + "epoch": 0.37417310358486827, + "grad_norm": 2.0408968489461228, + "learning_rate": 1.440461034704745e-05, + "loss": 0.2785, + "step": 4723 + }, + { + "epoch": 0.37425232719350365, + "grad_norm": 2.276670163471484, + "learning_rate": 1.4402306491314508e-05, + "loss": 0.382, + "step": 4724 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 1.7914286178273469, + "learning_rate": 1.4400002345710871e-05, + "loss": 0.1882, + "step": 4725 + }, + { + "epoch": 0.3744107744107744, + "grad_norm": 1.8696138522642984, + "learning_rate": 1.4397697910388248e-05, + "loss": 0.2113, + "step": 4726 + }, + { + "epoch": 0.3744899980194098, + "grad_norm": 1.9916567042877682, + "learning_rate": 1.4395393185498381e-05, + "loss": 0.3261, + "step": 4727 + }, + { + "epoch": 0.3745692216280452, + "grad_norm": 2.1764238125591273, + "learning_rate": 1.4393088171193021e-05, + "loss": 0.3606, + "step": 4728 + }, + { + "epoch": 0.3746484452366805, + "grad_norm": 2.287074179969357, + "learning_rate": 1.439078286762394e-05, + "loss": 0.296, + "step": 4729 + }, + { + "epoch": 0.3747276688453159, + "grad_norm": 2.069702512234331, + "learning_rate": 1.4388477274942936e-05, + "loss": 0.3394, + "step": 4730 + }, + { + "epoch": 0.37480689245395127, + "grad_norm": 1.7618470036155942, + "learning_rate": 1.438617139330182e-05, + "loss": 0.2939, + "step": 4731 + }, + { + "epoch": 0.37488611606258665, + "grad_norm": 1.927425830780056, + "learning_rate": 1.4383865222852423e-05, + "loss": 0.3757, + "step": 4732 + }, + { + "epoch": 0.37496533967122203, + "grad_norm": 1.6818755923497462, + "learning_rate": 1.4381558763746593e-05, + "loss": 0.2663, + "step": 4733 + }, + { + "epoch": 0.3750445632798574, + "grad_norm": 2.0275449152454845, + "learning_rate": 1.4379252016136203e-05, + "loss": 0.2412, + "step": 4734 + }, + { + "epoch": 0.3751237868884928, + "grad_norm": 2.365124968042268, + "learning_rate": 1.4376944980173138e-05, + "loss": 0.3016, + "step": 4735 + }, + { + "epoch": 0.3752030104971281, + "grad_norm": 1.726110119950647, + "learning_rate": 1.4374637656009309e-05, + "loss": 0.2693, + "step": 4736 + }, + { + "epoch": 0.3752822341057635, + "grad_norm": 1.825486942827154, + "learning_rate": 1.4372330043796636e-05, + "loss": 0.2709, + "step": 4737 + }, + { + "epoch": 0.3753614577143989, + "grad_norm": 1.8866113495498562, + "learning_rate": 1.437002214368707e-05, + "loss": 0.3569, + "step": 4738 + }, + { + "epoch": 0.37544068132303426, + "grad_norm": 1.8885572190682287, + "learning_rate": 1.4367713955832575e-05, + "loss": 0.2985, + "step": 4739 + }, + { + "epoch": 0.37551990493166965, + "grad_norm": 1.8054682288349528, + "learning_rate": 1.4365405480385129e-05, + "loss": 0.2478, + "step": 4740 + }, + { + "epoch": 0.375599128540305, + "grad_norm": 1.706464839265661, + "learning_rate": 1.4363096717496738e-05, + "loss": 0.3153, + "step": 4741 + }, + { + "epoch": 0.3756783521489404, + "grad_norm": 1.9111858773258044, + "learning_rate": 1.4360787667319423e-05, + "loss": 0.2685, + "step": 4742 + }, + { + "epoch": 0.37575757575757573, + "grad_norm": 1.8988461695719265, + "learning_rate": 1.4358478330005222e-05, + "loss": 0.25, + "step": 4743 + }, + { + "epoch": 0.3758367993662111, + "grad_norm": 2.05609388435826, + "learning_rate": 1.4356168705706195e-05, + "loss": 0.3652, + "step": 4744 + }, + { + "epoch": 0.3759160229748465, + "grad_norm": 1.418560946975717, + "learning_rate": 1.4353858794574418e-05, + "loss": 0.1918, + "step": 4745 + }, + { + "epoch": 0.3759952465834819, + "grad_norm": 1.8745809635321844, + "learning_rate": 1.435154859676199e-05, + "loss": 0.1911, + "step": 4746 + }, + { + "epoch": 0.37607447019211726, + "grad_norm": 1.8668818527695277, + "learning_rate": 1.4349238112421025e-05, + "loss": 0.3081, + "step": 4747 + }, + { + "epoch": 0.37615369380075264, + "grad_norm": 2.0402156683640653, + "learning_rate": 1.4346927341703659e-05, + "loss": 0.1871, + "step": 4748 + }, + { + "epoch": 0.376232917409388, + "grad_norm": 2.024151905844245, + "learning_rate": 1.4344616284762038e-05, + "loss": 0.2528, + "step": 4749 + }, + { + "epoch": 0.37631214101802335, + "grad_norm": 1.8115906364725716, + "learning_rate": 1.4342304941748347e-05, + "loss": 0.2524, + "step": 4750 + }, + { + "epoch": 0.37639136462665873, + "grad_norm": 1.730835837178477, + "learning_rate": 1.4339993312814765e-05, + "loss": 0.2416, + "step": 4751 + }, + { + "epoch": 0.3764705882352941, + "grad_norm": 2.4746287356539263, + "learning_rate": 1.4337681398113508e-05, + "loss": 0.3894, + "step": 4752 + }, + { + "epoch": 0.3765498118439295, + "grad_norm": 1.6848954803166556, + "learning_rate": 1.4335369197796803e-05, + "loss": 0.3042, + "step": 4753 + }, + { + "epoch": 0.3766290354525649, + "grad_norm": 1.6223749855599328, + "learning_rate": 1.4333056712016893e-05, + "loss": 0.2643, + "step": 4754 + }, + { + "epoch": 0.37670825906120026, + "grad_norm": 1.5619388055588996, + "learning_rate": 1.4330743940926052e-05, + "loss": 0.321, + "step": 4755 + }, + { + "epoch": 0.37678748266983564, + "grad_norm": 1.8907211588109332, + "learning_rate": 1.4328430884676559e-05, + "loss": 0.3613, + "step": 4756 + }, + { + "epoch": 0.37686670627847096, + "grad_norm": 1.8064758379779098, + "learning_rate": 1.432611754342072e-05, + "loss": 0.2049, + "step": 4757 + }, + { + "epoch": 0.37694592988710635, + "grad_norm": 2.2558741911370808, + "learning_rate": 1.4323803917310857e-05, + "loss": 0.2703, + "step": 4758 + }, + { + "epoch": 0.3770251534957417, + "grad_norm": 1.6493248676518513, + "learning_rate": 1.4321490006499309e-05, + "loss": 0.3129, + "step": 4759 + }, + { + "epoch": 0.3771043771043771, + "grad_norm": 2.023673583331004, + "learning_rate": 1.4319175811138439e-05, + "loss": 0.3841, + "step": 4760 + }, + { + "epoch": 0.3771836007130125, + "grad_norm": 1.5846736652531113, + "learning_rate": 1.4316861331380624e-05, + "loss": 0.2293, + "step": 4761 + }, + { + "epoch": 0.37726282432164787, + "grad_norm": 1.737152960625183, + "learning_rate": 1.431454656737826e-05, + "loss": 0.2553, + "step": 4762 + }, + { + "epoch": 0.3773420479302832, + "grad_norm": 1.847626636025447, + "learning_rate": 1.4312231519283768e-05, + "loss": 0.3948, + "step": 4763 + }, + { + "epoch": 0.3774212715389186, + "grad_norm": 1.9922604383254103, + "learning_rate": 1.4309916187249578e-05, + "loss": 0.2632, + "step": 4764 + }, + { + "epoch": 0.37750049514755396, + "grad_norm": 1.6642609860673572, + "learning_rate": 1.4307600571428143e-05, + "loss": 0.2273, + "step": 4765 + }, + { + "epoch": 0.37757971875618934, + "grad_norm": 2.3994548871589045, + "learning_rate": 1.4305284671971943e-05, + "loss": 0.2685, + "step": 4766 + }, + { + "epoch": 0.3776589423648247, + "grad_norm": 1.9601416463291454, + "learning_rate": 1.4302968489033462e-05, + "loss": 0.2729, + "step": 4767 + }, + { + "epoch": 0.3777381659734601, + "grad_norm": 1.5918009311669763, + "learning_rate": 1.4300652022765207e-05, + "loss": 0.1623, + "step": 4768 + }, + { + "epoch": 0.3778173895820955, + "grad_norm": 1.7491420675037992, + "learning_rate": 1.429833527331971e-05, + "loss": 0.2347, + "step": 4769 + }, + { + "epoch": 0.3778966131907308, + "grad_norm": 1.857020044890144, + "learning_rate": 1.4296018240849518e-05, + "loss": 0.2439, + "step": 4770 + }, + { + "epoch": 0.3779758367993662, + "grad_norm": 2.1939072313416497, + "learning_rate": 1.4293700925507199e-05, + "loss": 0.3053, + "step": 4771 + }, + { + "epoch": 0.3780550604080016, + "grad_norm": 1.6770552256889275, + "learning_rate": 1.429138332744533e-05, + "loss": 0.2479, + "step": 4772 + }, + { + "epoch": 0.37813428401663696, + "grad_norm": 2.2720296172370005, + "learning_rate": 1.428906544681652e-05, + "loss": 0.4448, + "step": 4773 + }, + { + "epoch": 0.37821350762527234, + "grad_norm": 1.9610731961200303, + "learning_rate": 1.4286747283773388e-05, + "loss": 0.2748, + "step": 4774 + }, + { + "epoch": 0.3782927312339077, + "grad_norm": 2.214411496713684, + "learning_rate": 1.4284428838468572e-05, + "loss": 0.3452, + "step": 4775 + }, + { + "epoch": 0.3783719548425431, + "grad_norm": 1.6701313928344728, + "learning_rate": 1.4282110111054733e-05, + "loss": 0.2299, + "step": 4776 + }, + { + "epoch": 0.3784511784511784, + "grad_norm": 1.8452207662857405, + "learning_rate": 1.4279791101684547e-05, + "loss": 0.2722, + "step": 4777 + }, + { + "epoch": 0.3785304020598138, + "grad_norm": 1.8936329879851312, + "learning_rate": 1.427747181051071e-05, + "loss": 0.3366, + "step": 4778 + }, + { + "epoch": 0.3786096256684492, + "grad_norm": 2.093726002352405, + "learning_rate": 1.4275152237685938e-05, + "loss": 0.2198, + "step": 4779 + }, + { + "epoch": 0.37868884927708457, + "grad_norm": 1.7039406740894012, + "learning_rate": 1.4272832383362962e-05, + "loss": 0.2576, + "step": 4780 + }, + { + "epoch": 0.37876807288571995, + "grad_norm": 2.0676157160633637, + "learning_rate": 1.427051224769453e-05, + "loss": 0.2289, + "step": 4781 + }, + { + "epoch": 0.37884729649435533, + "grad_norm": 1.9191436982745378, + "learning_rate": 1.4268191830833417e-05, + "loss": 0.3325, + "step": 4782 + }, + { + "epoch": 0.3789265201029907, + "grad_norm": 1.7454611128809623, + "learning_rate": 1.426587113293241e-05, + "loss": 0.2648, + "step": 4783 + }, + { + "epoch": 0.37900574371162604, + "grad_norm": 1.7793678504511723, + "learning_rate": 1.4263550154144313e-05, + "loss": 0.2631, + "step": 4784 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 1.5371724572623175, + "learning_rate": 1.4261228894621955e-05, + "loss": 0.2263, + "step": 4785 + }, + { + "epoch": 0.3791641909288968, + "grad_norm": 1.3992223728756408, + "learning_rate": 1.4258907354518177e-05, + "loss": 0.2742, + "step": 4786 + }, + { + "epoch": 0.3792434145375322, + "grad_norm": 2.1303990309180105, + "learning_rate": 1.4256585533985842e-05, + "loss": 0.2588, + "step": 4787 + }, + { + "epoch": 0.37932263814616757, + "grad_norm": 1.8591226424121843, + "learning_rate": 1.425426343317783e-05, + "loss": 0.2327, + "step": 4788 + }, + { + "epoch": 0.37940186175480295, + "grad_norm": 1.9874139774638357, + "learning_rate": 1.4251941052247044e-05, + "loss": 0.2767, + "step": 4789 + }, + { + "epoch": 0.37948108536343833, + "grad_norm": 1.4409303108247693, + "learning_rate": 1.4249618391346399e-05, + "loss": 0.2136, + "step": 4790 + }, + { + "epoch": 0.37956030897207366, + "grad_norm": 1.7374835963702855, + "learning_rate": 1.4247295450628826e-05, + "loss": 0.3302, + "step": 4791 + }, + { + "epoch": 0.37963953258070904, + "grad_norm": 2.066763210743459, + "learning_rate": 1.4244972230247287e-05, + "loss": 0.2477, + "step": 4792 + }, + { + "epoch": 0.3797187561893444, + "grad_norm": 1.667379683610971, + "learning_rate": 1.4242648730354756e-05, + "loss": 0.2205, + "step": 4793 + }, + { + "epoch": 0.3797979797979798, + "grad_norm": 2.4761630960969256, + "learning_rate": 1.4240324951104213e-05, + "loss": 0.3087, + "step": 4794 + }, + { + "epoch": 0.3798772034066152, + "grad_norm": 1.9259460121966194, + "learning_rate": 1.4238000892648682e-05, + "loss": 0.2996, + "step": 4795 + }, + { + "epoch": 0.37995642701525056, + "grad_norm": 1.80462871105025, + "learning_rate": 1.423567655514118e-05, + "loss": 0.2886, + "step": 4796 + }, + { + "epoch": 0.38003565062388595, + "grad_norm": 1.9088454542305258, + "learning_rate": 1.4233351938734758e-05, + "loss": 0.2883, + "step": 4797 + }, + { + "epoch": 0.38011487423252127, + "grad_norm": 1.8562698626926064, + "learning_rate": 1.4231027043582483e-05, + "loss": 0.2988, + "step": 4798 + }, + { + "epoch": 0.38019409784115665, + "grad_norm": 1.9023307467539372, + "learning_rate": 1.4228701869837433e-05, + "loss": 0.1947, + "step": 4799 + }, + { + "epoch": 0.38027332144979203, + "grad_norm": 1.5730118458882212, + "learning_rate": 1.4226376417652713e-05, + "loss": 0.2352, + "step": 4800 + }, + { + "epoch": 0.3803525450584274, + "grad_norm": 2.5466487394800588, + "learning_rate": 1.4224050687181442e-05, + "loss": 0.4451, + "step": 4801 + }, + { + "epoch": 0.3804317686670628, + "grad_norm": 1.7512508606591768, + "learning_rate": 1.4221724678576756e-05, + "loss": 0.261, + "step": 4802 + }, + { + "epoch": 0.3805109922756982, + "grad_norm": 1.9443566246310116, + "learning_rate": 1.421939839199182e-05, + "loss": 0.2422, + "step": 4803 + }, + { + "epoch": 0.3805902158843335, + "grad_norm": 2.23398072236846, + "learning_rate": 1.4217071827579796e-05, + "loss": 0.3715, + "step": 4804 + }, + { + "epoch": 0.3806694394929689, + "grad_norm": 1.7566686596767662, + "learning_rate": 1.4214744985493884e-05, + "loss": 0.2581, + "step": 4805 + }, + { + "epoch": 0.38074866310160427, + "grad_norm": 1.8365037645980977, + "learning_rate": 1.4212417865887299e-05, + "loss": 0.2675, + "step": 4806 + }, + { + "epoch": 0.38082788671023965, + "grad_norm": 2.222094531196926, + "learning_rate": 1.4210090468913263e-05, + "loss": 0.2966, + "step": 4807 + }, + { + "epoch": 0.38090711031887503, + "grad_norm": 1.674970284916282, + "learning_rate": 1.4207762794725026e-05, + "loss": 0.1844, + "step": 4808 + }, + { + "epoch": 0.3809863339275104, + "grad_norm": 1.8912679943198152, + "learning_rate": 1.4205434843475859e-05, + "loss": 0.3335, + "step": 4809 + }, + { + "epoch": 0.3810655575361458, + "grad_norm": 1.4026111151498115, + "learning_rate": 1.420310661531904e-05, + "loss": 0.2587, + "step": 4810 + }, + { + "epoch": 0.3811447811447811, + "grad_norm": 2.0979991976115833, + "learning_rate": 1.4200778110407873e-05, + "loss": 0.2792, + "step": 4811 + }, + { + "epoch": 0.3812240047534165, + "grad_norm": 1.8532419798592212, + "learning_rate": 1.4198449328895685e-05, + "loss": 0.227, + "step": 4812 + }, + { + "epoch": 0.3813032283620519, + "grad_norm": 1.8545119139565978, + "learning_rate": 1.4196120270935807e-05, + "loss": 0.2658, + "step": 4813 + }, + { + "epoch": 0.38138245197068726, + "grad_norm": 2.174031050158867, + "learning_rate": 1.4193790936681602e-05, + "loss": 0.3955, + "step": 4814 + }, + { + "epoch": 0.38146167557932265, + "grad_norm": 1.5039943437621284, + "learning_rate": 1.4191461326286442e-05, + "loss": 0.2524, + "step": 4815 + }, + { + "epoch": 0.381540899187958, + "grad_norm": 2.1090737438577083, + "learning_rate": 1.4189131439903721e-05, + "loss": 0.3839, + "step": 4816 + }, + { + "epoch": 0.3816201227965934, + "grad_norm": 1.7986950020087995, + "learning_rate": 1.4186801277686852e-05, + "loss": 0.321, + "step": 4817 + }, + { + "epoch": 0.38169934640522873, + "grad_norm": 1.7847700997923661, + "learning_rate": 1.4184470839789265e-05, + "loss": 0.1687, + "step": 4818 + }, + { + "epoch": 0.3817785700138641, + "grad_norm": 1.72693061428541, + "learning_rate": 1.4182140126364404e-05, + "loss": 0.216, + "step": 4819 + }, + { + "epoch": 0.3818577936224995, + "grad_norm": 1.842941970286251, + "learning_rate": 1.4179809137565742e-05, + "loss": 0.2566, + "step": 4820 + }, + { + "epoch": 0.3819370172311349, + "grad_norm": 1.8199027279628275, + "learning_rate": 1.417747787354676e-05, + "loss": 0.3312, + "step": 4821 + }, + { + "epoch": 0.38201624083977026, + "grad_norm": 1.8969625008135695, + "learning_rate": 1.4175146334460963e-05, + "loss": 0.3397, + "step": 4822 + }, + { + "epoch": 0.38209546444840564, + "grad_norm": 1.6402500927911516, + "learning_rate": 1.4172814520461867e-05, + "loss": 0.2617, + "step": 4823 + }, + { + "epoch": 0.382174688057041, + "grad_norm": 1.9468199653542566, + "learning_rate": 1.4170482431703012e-05, + "loss": 0.2846, + "step": 4824 + }, + { + "epoch": 0.38225391166567635, + "grad_norm": 1.8824592388014352, + "learning_rate": 1.4168150068337958e-05, + "loss": 0.2284, + "step": 4825 + }, + { + "epoch": 0.38233313527431173, + "grad_norm": 1.8205163241914524, + "learning_rate": 1.4165817430520276e-05, + "loss": 0.2875, + "step": 4826 + }, + { + "epoch": 0.3824123588829471, + "grad_norm": 1.6660189023837875, + "learning_rate": 1.4163484518403561e-05, + "loss": 0.297, + "step": 4827 + }, + { + "epoch": 0.3824915824915825, + "grad_norm": 2.3646889682065226, + "learning_rate": 1.4161151332141426e-05, + "loss": 0.2898, + "step": 4828 + }, + { + "epoch": 0.3825708061002179, + "grad_norm": 1.640092236248722, + "learning_rate": 1.4158817871887497e-05, + "loss": 0.2715, + "step": 4829 + }, + { + "epoch": 0.38265002970885326, + "grad_norm": 1.4300968509993024, + "learning_rate": 1.4156484137795424e-05, + "loss": 0.2185, + "step": 4830 + }, + { + "epoch": 0.38272925331748864, + "grad_norm": 1.5217152938992407, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.3335, + "step": 4831 + }, + { + "epoch": 0.38280847692612396, + "grad_norm": 1.8261276611801172, + "learning_rate": 1.4151815848711512e-05, + "loss": 0.2915, + "step": 4832 + }, + { + "epoch": 0.38288770053475935, + "grad_norm": 1.9081419485416535, + "learning_rate": 1.4149481294027063e-05, + "loss": 0.2701, + "step": 4833 + }, + { + "epoch": 0.3829669241433947, + "grad_norm": 1.6952495514126464, + "learning_rate": 1.4147146466119235e-05, + "loss": 0.2624, + "step": 4834 + }, + { + "epoch": 0.3830461477520301, + "grad_norm": 1.3043963096200666, + "learning_rate": 1.4144811365141769e-05, + "loss": 0.1968, + "step": 4835 + }, + { + "epoch": 0.3831253713606655, + "grad_norm": 1.7918340112956868, + "learning_rate": 1.4142475991248417e-05, + "loss": 0.317, + "step": 4836 + }, + { + "epoch": 0.38320459496930087, + "grad_norm": 1.9627614844999453, + "learning_rate": 1.4140140344592952e-05, + "loss": 0.38, + "step": 4837 + }, + { + "epoch": 0.3832838185779362, + "grad_norm": 1.9409581200822585, + "learning_rate": 1.413780442532917e-05, + "loss": 0.3079, + "step": 4838 + }, + { + "epoch": 0.3833630421865716, + "grad_norm": 1.950199498223228, + "learning_rate": 1.4135468233610872e-05, + "loss": 0.2871, + "step": 4839 + }, + { + "epoch": 0.38344226579520696, + "grad_norm": 1.600547648732173, + "learning_rate": 1.4133131769591893e-05, + "loss": 0.2282, + "step": 4840 + }, + { + "epoch": 0.38352148940384234, + "grad_norm": 1.8133758667906779, + "learning_rate": 1.4130795033426073e-05, + "loss": 0.2406, + "step": 4841 + }, + { + "epoch": 0.3836007130124777, + "grad_norm": 1.8636043586584148, + "learning_rate": 1.4128458025267276e-05, + "loss": 0.3167, + "step": 4842 + }, + { + "epoch": 0.3836799366211131, + "grad_norm": 1.571216578107831, + "learning_rate": 1.4126120745269382e-05, + "loss": 0.266, + "step": 4843 + }, + { + "epoch": 0.3837591602297485, + "grad_norm": 1.5994983578524533, + "learning_rate": 1.4123783193586294e-05, + "loss": 0.2493, + "step": 4844 + }, + { + "epoch": 0.3838383838383838, + "grad_norm": 1.7438582773507751, + "learning_rate": 1.4121445370371922e-05, + "loss": 0.2571, + "step": 4845 + }, + { + "epoch": 0.3839176074470192, + "grad_norm": 1.5970861836363022, + "learning_rate": 1.4119107275780203e-05, + "loss": 0.2394, + "step": 4846 + }, + { + "epoch": 0.3839968310556546, + "grad_norm": 1.7682876578657611, + "learning_rate": 1.4116768909965092e-05, + "loss": 0.2029, + "step": 4847 + }, + { + "epoch": 0.38407605466428996, + "grad_norm": 1.5758710726761345, + "learning_rate": 1.4114430273080558e-05, + "loss": 0.1753, + "step": 4848 + }, + { + "epoch": 0.38415527827292534, + "grad_norm": 1.980330054356402, + "learning_rate": 1.4112091365280585e-05, + "loss": 0.3266, + "step": 4849 + }, + { + "epoch": 0.3842345018815607, + "grad_norm": 1.6570488296570627, + "learning_rate": 1.4109752186719181e-05, + "loss": 0.3021, + "step": 4850 + }, + { + "epoch": 0.3843137254901961, + "grad_norm": 1.8031285082403183, + "learning_rate": 1.4107412737550372e-05, + "loss": 0.2704, + "step": 4851 + }, + { + "epoch": 0.3843929490988314, + "grad_norm": 2.0120707769336503, + "learning_rate": 1.4105073017928199e-05, + "loss": 0.299, + "step": 4852 + }, + { + "epoch": 0.3844721727074668, + "grad_norm": 1.8520540178085845, + "learning_rate": 1.4102733028006719e-05, + "loss": 0.3679, + "step": 4853 + }, + { + "epoch": 0.3845513963161022, + "grad_norm": 1.5037155668938742, + "learning_rate": 1.410039276794001e-05, + "loss": 0.2195, + "step": 4854 + }, + { + "epoch": 0.38463061992473757, + "grad_norm": 1.7267930169537224, + "learning_rate": 1.4098052237882168e-05, + "loss": 0.2238, + "step": 4855 + }, + { + "epoch": 0.38470984353337295, + "grad_norm": 2.101015355534599, + "learning_rate": 1.4095711437987303e-05, + "loss": 0.2565, + "step": 4856 + }, + { + "epoch": 0.38478906714200833, + "grad_norm": 1.4687022751446832, + "learning_rate": 1.4093370368409546e-05, + "loss": 0.1753, + "step": 4857 + }, + { + "epoch": 0.3848682907506437, + "grad_norm": 1.9474337981934573, + "learning_rate": 1.409102902930305e-05, + "loss": 0.2036, + "step": 4858 + }, + { + "epoch": 0.38494751435927904, + "grad_norm": 2.2218361081809004, + "learning_rate": 1.4088687420821974e-05, + "loss": 0.3247, + "step": 4859 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 2.1607974022322174, + "learning_rate": 1.4086345543120508e-05, + "loss": 0.277, + "step": 4860 + }, + { + "epoch": 0.3851059615765498, + "grad_norm": 1.9326311646466325, + "learning_rate": 1.4084003396352848e-05, + "loss": 0.3242, + "step": 4861 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 2.4904645284789497, + "learning_rate": 1.4081660980673215e-05, + "loss": 0.2546, + "step": 4862 + }, + { + "epoch": 0.38526440879382057, + "grad_norm": 2.0836945441265953, + "learning_rate": 1.4079318296235846e-05, + "loss": 0.2172, + "step": 4863 + }, + { + "epoch": 0.38534363240245595, + "grad_norm": 1.942911499323709, + "learning_rate": 1.4076975343194996e-05, + "loss": 0.2149, + "step": 4864 + }, + { + "epoch": 0.38542285601109133, + "grad_norm": 2.337131037974696, + "learning_rate": 1.4074632121704941e-05, + "loss": 0.415, + "step": 4865 + }, + { + "epoch": 0.38550207961972666, + "grad_norm": 1.8623991270514797, + "learning_rate": 1.4072288631919962e-05, + "loss": 0.2941, + "step": 4866 + }, + { + "epoch": 0.38558130322836204, + "grad_norm": 1.671605506172729, + "learning_rate": 1.406994487399437e-05, + "loss": 0.2679, + "step": 4867 + }, + { + "epoch": 0.3856605268369974, + "grad_norm": 1.661847975657666, + "learning_rate": 1.4067600848082496e-05, + "loss": 0.236, + "step": 4868 + }, + { + "epoch": 0.3857397504456328, + "grad_norm": 1.6534594221970502, + "learning_rate": 1.4065256554338675e-05, + "loss": 0.2034, + "step": 4869 + }, + { + "epoch": 0.3858189740542682, + "grad_norm": 1.6770660393433066, + "learning_rate": 1.406291199291727e-05, + "loss": 0.3157, + "step": 4870 + }, + { + "epoch": 0.38589819766290356, + "grad_norm": 1.9520223591967838, + "learning_rate": 1.4060567163972663e-05, + "loss": 0.394, + "step": 4871 + }, + { + "epoch": 0.38597742127153895, + "grad_norm": 1.6138797248703345, + "learning_rate": 1.4058222067659244e-05, + "loss": 0.274, + "step": 4872 + }, + { + "epoch": 0.38605664488017427, + "grad_norm": 1.5375729277822336, + "learning_rate": 1.405587670413143e-05, + "loss": 0.2322, + "step": 4873 + }, + { + "epoch": 0.38613586848880965, + "grad_norm": 2.139941920626899, + "learning_rate": 1.405353107354365e-05, + "loss": 0.2984, + "step": 4874 + }, + { + "epoch": 0.38621509209744503, + "grad_norm": 3.22239182199651, + "learning_rate": 1.4051185176050353e-05, + "loss": 0.3102, + "step": 4875 + }, + { + "epoch": 0.3862943157060804, + "grad_norm": 1.7545156430624596, + "learning_rate": 1.4048839011806006e-05, + "loss": 0.2105, + "step": 4876 + }, + { + "epoch": 0.3863735393147158, + "grad_norm": 1.9145457092868745, + "learning_rate": 1.404649258096509e-05, + "loss": 0.3642, + "step": 4877 + }, + { + "epoch": 0.3864527629233512, + "grad_norm": 2.0238697466619557, + "learning_rate": 1.4044145883682108e-05, + "loss": 0.242, + "step": 4878 + }, + { + "epoch": 0.3865319865319865, + "grad_norm": 2.3406179509660565, + "learning_rate": 1.4041798920111582e-05, + "loss": 0.2447, + "step": 4879 + }, + { + "epoch": 0.3866112101406219, + "grad_norm": 1.783521302038437, + "learning_rate": 1.4039451690408042e-05, + "loss": 0.2776, + "step": 4880 + }, + { + "epoch": 0.38669043374925727, + "grad_norm": 1.562636401133143, + "learning_rate": 1.4037104194726048e-05, + "loss": 0.2216, + "step": 4881 + }, + { + "epoch": 0.38676965735789265, + "grad_norm": 2.0934294543682626, + "learning_rate": 1.4034756433220164e-05, + "loss": 0.2941, + "step": 4882 + }, + { + "epoch": 0.38684888096652803, + "grad_norm": 1.8277893187308183, + "learning_rate": 1.4032408406044986e-05, + "loss": 0.2246, + "step": 4883 + }, + { + "epoch": 0.3869281045751634, + "grad_norm": 1.6428382442927187, + "learning_rate": 1.4030060113355118e-05, + "loss": 0.2189, + "step": 4884 + }, + { + "epoch": 0.3870073281837988, + "grad_norm": 1.6434971900372488, + "learning_rate": 1.402771155530518e-05, + "loss": 0.2749, + "step": 4885 + }, + { + "epoch": 0.3870865517924341, + "grad_norm": 1.7541482124622305, + "learning_rate": 1.4025362732049816e-05, + "loss": 0.2346, + "step": 4886 + }, + { + "epoch": 0.3871657754010695, + "grad_norm": 2.0755705580105093, + "learning_rate": 1.4023013643743688e-05, + "loss": 0.2427, + "step": 4887 + }, + { + "epoch": 0.3872449990097049, + "grad_norm": 1.6388321994048747, + "learning_rate": 1.4020664290541465e-05, + "loss": 0.1948, + "step": 4888 + }, + { + "epoch": 0.38732422261834026, + "grad_norm": 1.7288886140854272, + "learning_rate": 1.4018314672597848e-05, + "loss": 0.3013, + "step": 4889 + }, + { + "epoch": 0.38740344622697565, + "grad_norm": 1.6625003403145617, + "learning_rate": 1.4015964790067545e-05, + "loss": 0.2453, + "step": 4890 + }, + { + "epoch": 0.387482669835611, + "grad_norm": 1.4656638157368234, + "learning_rate": 1.401361464310528e-05, + "loss": 0.1906, + "step": 4891 + }, + { + "epoch": 0.3875618934442464, + "grad_norm": 2.1323091963914482, + "learning_rate": 1.4011264231865807e-05, + "loss": 0.325, + "step": 4892 + }, + { + "epoch": 0.38764111705288173, + "grad_norm": 1.7364929363055688, + "learning_rate": 1.4008913556503885e-05, + "loss": 0.2628, + "step": 4893 + }, + { + "epoch": 0.3877203406615171, + "grad_norm": 1.6629464357090071, + "learning_rate": 1.4006562617174292e-05, + "loss": 0.2416, + "step": 4894 + }, + { + "epoch": 0.3877995642701525, + "grad_norm": 1.397683060163063, + "learning_rate": 1.4004211414031831e-05, + "loss": 0.2043, + "step": 4895 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 2.0659803703039263, + "learning_rate": 1.4001859947231316e-05, + "loss": 0.2598, + "step": 4896 + }, + { + "epoch": 0.38795801148742326, + "grad_norm": 2.1824148040858717, + "learning_rate": 1.3999508216927578e-05, + "loss": 0.2318, + "step": 4897 + }, + { + "epoch": 0.38803723509605864, + "grad_norm": 2.442391324930562, + "learning_rate": 1.399715622327547e-05, + "loss": 0.3181, + "step": 4898 + }, + { + "epoch": 0.388116458704694, + "grad_norm": 1.675419768814645, + "learning_rate": 1.3994803966429854e-05, + "loss": 0.2692, + "step": 4899 + }, + { + "epoch": 0.38819568231332935, + "grad_norm": 1.8459992373106835, + "learning_rate": 1.3992451446545624e-05, + "loss": 0.2429, + "step": 4900 + }, + { + "epoch": 0.38827490592196473, + "grad_norm": 1.99070140567718, + "learning_rate": 1.3990098663777674e-05, + "loss": 0.2673, + "step": 4901 + }, + { + "epoch": 0.3883541295306001, + "grad_norm": 1.4336572724242114, + "learning_rate": 1.3987745618280925e-05, + "loss": 0.2015, + "step": 4902 + }, + { + "epoch": 0.3884333531392355, + "grad_norm": 1.8377236867934492, + "learning_rate": 1.3985392310210318e-05, + "loss": 0.3081, + "step": 4903 + }, + { + "epoch": 0.3885125767478709, + "grad_norm": 1.8703203058869515, + "learning_rate": 1.39830387397208e-05, + "loss": 0.2521, + "step": 4904 + }, + { + "epoch": 0.38859180035650626, + "grad_norm": 1.6717876731784511, + "learning_rate": 1.3980684906967348e-05, + "loss": 0.2553, + "step": 4905 + }, + { + "epoch": 0.38867102396514164, + "grad_norm": 1.910780058119731, + "learning_rate": 1.3978330812104947e-05, + "loss": 0.3801, + "step": 4906 + }, + { + "epoch": 0.38875024757377696, + "grad_norm": 2.363934119931117, + "learning_rate": 1.3975976455288607e-05, + "loss": 0.3791, + "step": 4907 + }, + { + "epoch": 0.38882947118241235, + "grad_norm": 1.6532092308011639, + "learning_rate": 1.397362183667335e-05, + "loss": 0.2529, + "step": 4908 + }, + { + "epoch": 0.3889086947910477, + "grad_norm": 2.4691674335767217, + "learning_rate": 1.3971266956414211e-05, + "loss": 0.276, + "step": 4909 + }, + { + "epoch": 0.3889879183996831, + "grad_norm": 1.937524828043644, + "learning_rate": 1.3968911814666252e-05, + "loss": 0.2142, + "step": 4910 + }, + { + "epoch": 0.3890671420083185, + "grad_norm": 1.4871096935617476, + "learning_rate": 1.3966556411584548e-05, + "loss": 0.228, + "step": 4911 + }, + { + "epoch": 0.38914636561695387, + "grad_norm": 1.4732108921826366, + "learning_rate": 1.396420074732419e-05, + "loss": 0.2159, + "step": 4912 + }, + { + "epoch": 0.38922558922558925, + "grad_norm": 1.6393131903376357, + "learning_rate": 1.396184482204029e-05, + "loss": 0.2829, + "step": 4913 + }, + { + "epoch": 0.3893048128342246, + "grad_norm": 2.1166159582836257, + "learning_rate": 1.3959488635887967e-05, + "loss": 0.2827, + "step": 4914 + }, + { + "epoch": 0.38938403644285996, + "grad_norm": 2.0861565135264044, + "learning_rate": 1.3957132189022373e-05, + "loss": 0.3116, + "step": 4915 + }, + { + "epoch": 0.38946326005149534, + "grad_norm": 1.7402310602116924, + "learning_rate": 1.3954775481598665e-05, + "loss": 0.2775, + "step": 4916 + }, + { + "epoch": 0.3895424836601307, + "grad_norm": 1.7299115685930881, + "learning_rate": 1.3952418513772016e-05, + "loss": 0.3166, + "step": 4917 + }, + { + "epoch": 0.3896217072687661, + "grad_norm": 1.8177487417502542, + "learning_rate": 1.3950061285697629e-05, + "loss": 0.2581, + "step": 4918 + }, + { + "epoch": 0.3897009308774015, + "grad_norm": 2.0557599088322576, + "learning_rate": 1.3947703797530716e-05, + "loss": 0.2265, + "step": 4919 + }, + { + "epoch": 0.3897801544860368, + "grad_norm": 2.426335853471312, + "learning_rate": 1.3945346049426498e-05, + "loss": 0.3799, + "step": 4920 + }, + { + "epoch": 0.3898593780946722, + "grad_norm": 1.39889361741148, + "learning_rate": 1.3942988041540226e-05, + "loss": 0.1728, + "step": 4921 + }, + { + "epoch": 0.3899386017033076, + "grad_norm": 1.8011418967853858, + "learning_rate": 1.394062977402717e-05, + "loss": 0.1954, + "step": 4922 + }, + { + "epoch": 0.39001782531194296, + "grad_norm": 1.8324520880123258, + "learning_rate": 1.3938271247042601e-05, + "loss": 0.2852, + "step": 4923 + }, + { + "epoch": 0.39009704892057834, + "grad_norm": 1.70950428343249, + "learning_rate": 1.3935912460741818e-05, + "loss": 0.2401, + "step": 4924 + }, + { + "epoch": 0.3901762725292137, + "grad_norm": 1.82562064446481, + "learning_rate": 1.3933553415280142e-05, + "loss": 0.2978, + "step": 4925 + }, + { + "epoch": 0.3902554961378491, + "grad_norm": 1.8792160520532144, + "learning_rate": 1.3931194110812896e-05, + "loss": 0.3616, + "step": 4926 + }, + { + "epoch": 0.3903347197464844, + "grad_norm": 2.086651976264605, + "learning_rate": 1.3928834547495438e-05, + "loss": 0.3373, + "step": 4927 + }, + { + "epoch": 0.3904139433551198, + "grad_norm": 1.6151752229474026, + "learning_rate": 1.3926474725483125e-05, + "loss": 0.2864, + "step": 4928 + }, + { + "epoch": 0.3904931669637552, + "grad_norm": 1.4121292564210077, + "learning_rate": 1.3924114644931346e-05, + "loss": 0.1935, + "step": 4929 + }, + { + "epoch": 0.39057239057239057, + "grad_norm": 1.7320009973356407, + "learning_rate": 1.3921754305995501e-05, + "loss": 0.2852, + "step": 4930 + }, + { + "epoch": 0.39065161418102595, + "grad_norm": 1.9151171410205767, + "learning_rate": 1.3919393708831004e-05, + "loss": 0.3141, + "step": 4931 + }, + { + "epoch": 0.39073083778966133, + "grad_norm": 2.6039328518843243, + "learning_rate": 1.3917032853593289e-05, + "loss": 0.4421, + "step": 4932 + }, + { + "epoch": 0.3908100613982967, + "grad_norm": 2.250046779436568, + "learning_rate": 1.3914671740437811e-05, + "loss": 0.2321, + "step": 4933 + }, + { + "epoch": 0.39088928500693204, + "grad_norm": 1.566608093385484, + "learning_rate": 1.3912310369520032e-05, + "loss": 0.2671, + "step": 4934 + }, + { + "epoch": 0.3909685086155674, + "grad_norm": 2.0292065487561755, + "learning_rate": 1.3909948740995442e-05, + "loss": 0.346, + "step": 4935 + }, + { + "epoch": 0.3910477322242028, + "grad_norm": 1.9078504796860136, + "learning_rate": 1.3907586855019538e-05, + "loss": 0.3763, + "step": 4936 + }, + { + "epoch": 0.3911269558328382, + "grad_norm": 1.8957842081482812, + "learning_rate": 1.3905224711747844e-05, + "loss": 0.3024, + "step": 4937 + }, + { + "epoch": 0.39120617944147357, + "grad_norm": 1.6406305258588172, + "learning_rate": 1.3902862311335896e-05, + "loss": 0.2617, + "step": 4938 + }, + { + "epoch": 0.39128540305010895, + "grad_norm": 1.4694186094641686, + "learning_rate": 1.390049965393924e-05, + "loss": 0.1937, + "step": 4939 + }, + { + "epoch": 0.39136462665874433, + "grad_norm": 2.177495606638902, + "learning_rate": 1.3898136739713451e-05, + "loss": 0.2414, + "step": 4940 + }, + { + "epoch": 0.39144385026737966, + "grad_norm": 1.2611322495580886, + "learning_rate": 1.3895773568814118e-05, + "loss": 0.1604, + "step": 4941 + }, + { + "epoch": 0.39152307387601504, + "grad_norm": 1.6107051489866262, + "learning_rate": 1.3893410141396835e-05, + "loss": 0.337, + "step": 4942 + }, + { + "epoch": 0.3916022974846504, + "grad_norm": 1.993077001132843, + "learning_rate": 1.3891046457617233e-05, + "loss": 0.2665, + "step": 4943 + }, + { + "epoch": 0.3916815210932858, + "grad_norm": 1.6475760015632661, + "learning_rate": 1.388868251763094e-05, + "loss": 0.2571, + "step": 4944 + }, + { + "epoch": 0.3917607447019212, + "grad_norm": 1.573292172431593, + "learning_rate": 1.3886318321593614e-05, + "loss": 0.247, + "step": 4945 + }, + { + "epoch": 0.39183996831055656, + "grad_norm": 1.8105642431196092, + "learning_rate": 1.388395386966093e-05, + "loss": 0.2616, + "step": 4946 + }, + { + "epoch": 0.39191919191919194, + "grad_norm": 2.189272330131032, + "learning_rate": 1.388158916198857e-05, + "loss": 0.2842, + "step": 4947 + }, + { + "epoch": 0.39199841552782727, + "grad_norm": 1.8707833903357227, + "learning_rate": 1.3879224198732239e-05, + "loss": 0.2423, + "step": 4948 + }, + { + "epoch": 0.39207763913646265, + "grad_norm": 1.6230767377699655, + "learning_rate": 1.3876858980047665e-05, + "loss": 0.2408, + "step": 4949 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 1.2470554971599215, + "learning_rate": 1.3874493506090578e-05, + "loss": 0.1702, + "step": 4950 + }, + { + "epoch": 0.3922360863537334, + "grad_norm": 1.6786225725593404, + "learning_rate": 1.3872127777016739e-05, + "loss": 0.2646, + "step": 4951 + }, + { + "epoch": 0.3923153099623688, + "grad_norm": 2.0102372643919417, + "learning_rate": 1.3869761792981915e-05, + "loss": 0.2323, + "step": 4952 + }, + { + "epoch": 0.3923945335710042, + "grad_norm": 2.183189800344821, + "learning_rate": 1.3867395554141899e-05, + "loss": 0.3992, + "step": 4953 + }, + { + "epoch": 0.39247375717963956, + "grad_norm": 1.6514654279951932, + "learning_rate": 1.3865029060652493e-05, + "loss": 0.2408, + "step": 4954 + }, + { + "epoch": 0.3925529807882749, + "grad_norm": 2.023566919321999, + "learning_rate": 1.3862662312669518e-05, + "loss": 0.2358, + "step": 4955 + }, + { + "epoch": 0.39263220439691027, + "grad_norm": 1.3332265181540435, + "learning_rate": 1.386029531034882e-05, + "loss": 0.1371, + "step": 4956 + }, + { + "epoch": 0.39271142800554565, + "grad_norm": 1.6769897130765947, + "learning_rate": 1.385792805384625e-05, + "loss": 0.2453, + "step": 4957 + }, + { + "epoch": 0.39279065161418103, + "grad_norm": 1.9500711749279414, + "learning_rate": 1.3855560543317679e-05, + "loss": 0.3715, + "step": 4958 + }, + { + "epoch": 0.3928698752228164, + "grad_norm": 1.7401008467798307, + "learning_rate": 1.3853192778919e-05, + "loss": 0.1431, + "step": 4959 + }, + { + "epoch": 0.3929490988314518, + "grad_norm": 1.6368249447133931, + "learning_rate": 1.3850824760806115e-05, + "loss": 0.3095, + "step": 4960 + }, + { + "epoch": 0.3930283224400871, + "grad_norm": 2.5225017538825147, + "learning_rate": 1.384845648913495e-05, + "loss": 0.3605, + "step": 4961 + }, + { + "epoch": 0.3931075460487225, + "grad_norm": 1.8799307507834746, + "learning_rate": 1.3846087964061442e-05, + "loss": 0.2772, + "step": 4962 + }, + { + "epoch": 0.3931867696573579, + "grad_norm": 1.8581621904477863, + "learning_rate": 1.3843719185741548e-05, + "loss": 0.3554, + "step": 4963 + }, + { + "epoch": 0.39326599326599326, + "grad_norm": 1.9002845174617342, + "learning_rate": 1.3841350154331239e-05, + "loss": 0.3284, + "step": 4964 + }, + { + "epoch": 0.39334521687462864, + "grad_norm": 1.8682381687798193, + "learning_rate": 1.383898086998651e-05, + "loss": 0.2141, + "step": 4965 + }, + { + "epoch": 0.393424440483264, + "grad_norm": 1.5478800222173965, + "learning_rate": 1.3836611332863356e-05, + "loss": 0.2497, + "step": 4966 + }, + { + "epoch": 0.3935036640918994, + "grad_norm": 2.103979040883339, + "learning_rate": 1.383424154311781e-05, + "loss": 0.2163, + "step": 4967 + }, + { + "epoch": 0.39358288770053473, + "grad_norm": 2.1554333998972344, + "learning_rate": 1.383187150090591e-05, + "loss": 0.3397, + "step": 4968 + }, + { + "epoch": 0.3936621113091701, + "grad_norm": 2.1009666264119833, + "learning_rate": 1.3829501206383704e-05, + "loss": 0.2637, + "step": 4969 + }, + { + "epoch": 0.3937413349178055, + "grad_norm": 1.80394897108238, + "learning_rate": 1.3827130659707275e-05, + "loss": 0.2668, + "step": 4970 + }, + { + "epoch": 0.3938205585264409, + "grad_norm": 2.222688601615571, + "learning_rate": 1.3824759861032704e-05, + "loss": 0.3653, + "step": 4971 + }, + { + "epoch": 0.39389978213507626, + "grad_norm": 2.0281326596183153, + "learning_rate": 1.38223888105161e-05, + "loss": 0.2443, + "step": 4972 + }, + { + "epoch": 0.39397900574371164, + "grad_norm": 1.5918307594741923, + "learning_rate": 1.3820017508313587e-05, + "loss": 0.2118, + "step": 4973 + }, + { + "epoch": 0.394058229352347, + "grad_norm": 1.7021430232938264, + "learning_rate": 1.3817645954581301e-05, + "loss": 0.2281, + "step": 4974 + }, + { + "epoch": 0.39413745296098235, + "grad_norm": 2.1696884719380205, + "learning_rate": 1.3815274149475395e-05, + "loss": 0.3571, + "step": 4975 + }, + { + "epoch": 0.39421667656961773, + "grad_norm": 1.8886359409169726, + "learning_rate": 1.3812902093152047e-05, + "loss": 0.2554, + "step": 4976 + }, + { + "epoch": 0.3942959001782531, + "grad_norm": 1.5331896532041256, + "learning_rate": 1.3810529785767444e-05, + "loss": 0.2454, + "step": 4977 + }, + { + "epoch": 0.3943751237868885, + "grad_norm": 2.4534832929597057, + "learning_rate": 1.3808157227477788e-05, + "loss": 0.2577, + "step": 4978 + }, + { + "epoch": 0.3944543473955239, + "grad_norm": 1.983776372229426, + "learning_rate": 1.3805784418439303e-05, + "loss": 0.3195, + "step": 4979 + }, + { + "epoch": 0.39453357100415926, + "grad_norm": 1.8166724279710076, + "learning_rate": 1.3803411358808222e-05, + "loss": 0.3308, + "step": 4980 + }, + { + "epoch": 0.39461279461279464, + "grad_norm": 2.1646010012152463, + "learning_rate": 1.3801038048740811e-05, + "loss": 0.3745, + "step": 4981 + }, + { + "epoch": 0.39469201822142996, + "grad_norm": 1.7175245046194172, + "learning_rate": 1.379866448839333e-05, + "loss": 0.2866, + "step": 4982 + }, + { + "epoch": 0.39477124183006534, + "grad_norm": 1.4705239764687448, + "learning_rate": 1.379629067792207e-05, + "loss": 0.2032, + "step": 4983 + }, + { + "epoch": 0.3948504654387007, + "grad_norm": 1.7308900507702025, + "learning_rate": 1.3793916617483338e-05, + "loss": 0.3028, + "step": 4984 + }, + { + "epoch": 0.3949296890473361, + "grad_norm": 1.688387068915595, + "learning_rate": 1.379154230723345e-05, + "loss": 0.3031, + "step": 4985 + }, + { + "epoch": 0.3950089126559715, + "grad_norm": 1.468650843878097, + "learning_rate": 1.3789167747328746e-05, + "loss": 0.2904, + "step": 4986 + }, + { + "epoch": 0.39508813626460687, + "grad_norm": 1.6078849983467876, + "learning_rate": 1.3786792937925576e-05, + "loss": 0.2307, + "step": 4987 + }, + { + "epoch": 0.39516735987324225, + "grad_norm": 1.6730174771707977, + "learning_rate": 1.3784417879180314e-05, + "loss": 0.322, + "step": 4988 + }, + { + "epoch": 0.3952465834818776, + "grad_norm": 1.360165546854493, + "learning_rate": 1.3782042571249343e-05, + "loss": 0.2055, + "step": 4989 + }, + { + "epoch": 0.39532580709051296, + "grad_norm": 1.6439129519005697, + "learning_rate": 1.3779667014289067e-05, + "loss": 0.2929, + "step": 4990 + }, + { + "epoch": 0.39540503069914834, + "grad_norm": 1.9872628282521692, + "learning_rate": 1.3777291208455902e-05, + "loss": 0.279, + "step": 4991 + }, + { + "epoch": 0.3954842543077837, + "grad_norm": 1.9328544820408649, + "learning_rate": 1.3774915153906292e-05, + "loss": 0.3648, + "step": 4992 + }, + { + "epoch": 0.3955634779164191, + "grad_norm": 2.2624746801900093, + "learning_rate": 1.377253885079668e-05, + "loss": 0.3193, + "step": 4993 + }, + { + "epoch": 0.3956427015250545, + "grad_norm": 2.113686190311988, + "learning_rate": 1.3770162299283535e-05, + "loss": 0.3202, + "step": 4994 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 1.8139399997624082, + "learning_rate": 1.3767785499523347e-05, + "loss": 0.3553, + "step": 4995 + }, + { + "epoch": 0.3958011487423252, + "grad_norm": 1.878926983384108, + "learning_rate": 1.376540845167261e-05, + "loss": 0.2716, + "step": 4996 + }, + { + "epoch": 0.3958803723509606, + "grad_norm": 1.5634011543611877, + "learning_rate": 1.3763031155887847e-05, + "loss": 0.1886, + "step": 4997 + }, + { + "epoch": 0.39595959595959596, + "grad_norm": 1.4121320013549208, + "learning_rate": 1.3760653612325588e-05, + "loss": 0.1379, + "step": 4998 + }, + { + "epoch": 0.39603881956823134, + "grad_norm": 1.8214632320985995, + "learning_rate": 1.3758275821142382e-05, + "loss": 0.3369, + "step": 4999 + }, + { + "epoch": 0.3961180431768667, + "grad_norm": 1.7973093353773184, + "learning_rate": 1.3755897782494803e-05, + "loss": 0.249, + "step": 5000 + }, + { + "epoch": 0.3961972667855021, + "grad_norm": 1.5617405475102666, + "learning_rate": 1.375351949653942e-05, + "loss": 0.2417, + "step": 5001 + }, + { + "epoch": 0.3962764903941374, + "grad_norm": 1.7471023862147215, + "learning_rate": 1.375114096343284e-05, + "loss": 0.2877, + "step": 5002 + }, + { + "epoch": 0.3963557140027728, + "grad_norm": 1.64337971421, + "learning_rate": 1.3748762183331681e-05, + "loss": 0.2191, + "step": 5003 + }, + { + "epoch": 0.3964349376114082, + "grad_norm": 1.600424142825091, + "learning_rate": 1.3746383156392566e-05, + "loss": 0.2636, + "step": 5004 + }, + { + "epoch": 0.39651416122004357, + "grad_norm": 1.7907144513952513, + "learning_rate": 1.374400388277215e-05, + "loss": 0.2417, + "step": 5005 + }, + { + "epoch": 0.39659338482867895, + "grad_norm": 1.5228632327356368, + "learning_rate": 1.3741624362627091e-05, + "loss": 0.1961, + "step": 5006 + }, + { + "epoch": 0.39667260843731433, + "grad_norm": 1.459007370078432, + "learning_rate": 1.373924459611407e-05, + "loss": 0.2281, + "step": 5007 + }, + { + "epoch": 0.3967518320459497, + "grad_norm": 1.741468124079045, + "learning_rate": 1.3736864583389789e-05, + "loss": 0.3022, + "step": 5008 + }, + { + "epoch": 0.39683105565458504, + "grad_norm": 1.8512721155465253, + "learning_rate": 1.373448432461095e-05, + "loss": 0.2493, + "step": 5009 + }, + { + "epoch": 0.3969102792632204, + "grad_norm": 1.794947096640278, + "learning_rate": 1.373210381993429e-05, + "loss": 0.2856, + "step": 5010 + }, + { + "epoch": 0.3969895028718558, + "grad_norm": 1.4903863045899304, + "learning_rate": 1.3729723069516554e-05, + "loss": 0.233, + "step": 5011 + }, + { + "epoch": 0.3970687264804912, + "grad_norm": 1.4396502435930123, + "learning_rate": 1.3727342073514497e-05, + "loss": 0.1945, + "step": 5012 + }, + { + "epoch": 0.39714795008912657, + "grad_norm": 1.7427975283031958, + "learning_rate": 1.3724960832084902e-05, + "loss": 0.3438, + "step": 5013 + }, + { + "epoch": 0.39722717369776195, + "grad_norm": 1.799696050369368, + "learning_rate": 1.3722579345384558e-05, + "loss": 0.2438, + "step": 5014 + }, + { + "epoch": 0.39730639730639733, + "grad_norm": 1.6612991846892513, + "learning_rate": 1.3720197613570272e-05, + "loss": 0.2728, + "step": 5015 + }, + { + "epoch": 0.39738562091503266, + "grad_norm": 1.5931453059655685, + "learning_rate": 1.3717815636798879e-05, + "loss": 0.181, + "step": 5016 + }, + { + "epoch": 0.39746484452366804, + "grad_norm": 1.7223887228104913, + "learning_rate": 1.3715433415227212e-05, + "loss": 0.2378, + "step": 5017 + }, + { + "epoch": 0.3975440681323034, + "grad_norm": 1.7525502529847075, + "learning_rate": 1.3713050949012134e-05, + "loss": 0.2196, + "step": 5018 + }, + { + "epoch": 0.3976232917409388, + "grad_norm": 1.8684258047792184, + "learning_rate": 1.3710668238310519e-05, + "loss": 0.3166, + "step": 5019 + }, + { + "epoch": 0.3977025153495742, + "grad_norm": 1.9444041543805182, + "learning_rate": 1.3708285283279252e-05, + "loss": 0.2784, + "step": 5020 + }, + { + "epoch": 0.39778173895820956, + "grad_norm": 2.0956591174241432, + "learning_rate": 1.3705902084075244e-05, + "loss": 0.3477, + "step": 5021 + }, + { + "epoch": 0.39786096256684494, + "grad_norm": 2.1816546635619587, + "learning_rate": 1.3703518640855414e-05, + "loss": 0.357, + "step": 5022 + }, + { + "epoch": 0.39794018617548027, + "grad_norm": 1.5634312168214664, + "learning_rate": 1.37011349537767e-05, + "loss": 0.3307, + "step": 5023 + }, + { + "epoch": 0.39801940978411565, + "grad_norm": 1.944784286465532, + "learning_rate": 1.3698751022996061e-05, + "loss": 0.2913, + "step": 5024 + }, + { + "epoch": 0.39809863339275103, + "grad_norm": 2.0047690184686378, + "learning_rate": 1.3696366848670464e-05, + "loss": 0.3413, + "step": 5025 + }, + { + "epoch": 0.3981778570013864, + "grad_norm": 1.7360612612633144, + "learning_rate": 1.3693982430956896e-05, + "loss": 0.2573, + "step": 5026 + }, + { + "epoch": 0.3982570806100218, + "grad_norm": 1.4386543667099085, + "learning_rate": 1.369159777001236e-05, + "loss": 0.194, + "step": 5027 + }, + { + "epoch": 0.3983363042186572, + "grad_norm": 1.5948091834397193, + "learning_rate": 1.368921286599387e-05, + "loss": 0.3183, + "step": 5028 + }, + { + "epoch": 0.39841552782729256, + "grad_norm": 1.8623985200814455, + "learning_rate": 1.368682771905847e-05, + "loss": 0.273, + "step": 5029 + }, + { + "epoch": 0.3984947514359279, + "grad_norm": 1.8322071548196106, + "learning_rate": 1.3684442329363199e-05, + "loss": 0.3161, + "step": 5030 + }, + { + "epoch": 0.39857397504456327, + "grad_norm": 1.9974121310115824, + "learning_rate": 1.368205669706513e-05, + "loss": 0.2946, + "step": 5031 + }, + { + "epoch": 0.39865319865319865, + "grad_norm": 1.849566686430218, + "learning_rate": 1.3679670822321347e-05, + "loss": 0.4029, + "step": 5032 + }, + { + "epoch": 0.39873242226183403, + "grad_norm": 1.6552099342540687, + "learning_rate": 1.3677284705288943e-05, + "loss": 0.2915, + "step": 5033 + }, + { + "epoch": 0.3988116458704694, + "grad_norm": 1.9924908492626303, + "learning_rate": 1.3674898346125036e-05, + "loss": 0.2559, + "step": 5034 + }, + { + "epoch": 0.3988908694791048, + "grad_norm": 2.1660942439728603, + "learning_rate": 1.3672511744986756e-05, + "loss": 0.3949, + "step": 5035 + }, + { + "epoch": 0.3989700930877402, + "grad_norm": 1.7103848494247698, + "learning_rate": 1.3670124902031248e-05, + "loss": 0.3066, + "step": 5036 + }, + { + "epoch": 0.3990493166963755, + "grad_norm": 1.4422331394262926, + "learning_rate": 1.3667737817415679e-05, + "loss": 0.1716, + "step": 5037 + }, + { + "epoch": 0.3991285403050109, + "grad_norm": 1.5424422592379177, + "learning_rate": 1.3665350491297215e-05, + "loss": 0.216, + "step": 5038 + }, + { + "epoch": 0.39920776391364626, + "grad_norm": 1.6087064335496908, + "learning_rate": 1.3662962923833063e-05, + "loss": 0.2535, + "step": 5039 + }, + { + "epoch": 0.39928698752228164, + "grad_norm": 1.7486998482405602, + "learning_rate": 1.3660575115180427e-05, + "loss": 0.2823, + "step": 5040 + }, + { + "epoch": 0.399366211130917, + "grad_norm": 1.5491830491544707, + "learning_rate": 1.3658187065496533e-05, + "loss": 0.2773, + "step": 5041 + }, + { + "epoch": 0.3994454347395524, + "grad_norm": 1.7637106301081635, + "learning_rate": 1.365579877493862e-05, + "loss": 0.3368, + "step": 5042 + }, + { + "epoch": 0.39952465834818773, + "grad_norm": 1.2304973791940805, + "learning_rate": 1.3653410243663953e-05, + "loss": 0.18, + "step": 5043 + }, + { + "epoch": 0.3996038819568231, + "grad_norm": 2.137849827852629, + "learning_rate": 1.3651021471829797e-05, + "loss": 0.328, + "step": 5044 + }, + { + "epoch": 0.3996831055654585, + "grad_norm": 1.6510817194895882, + "learning_rate": 1.3648632459593444e-05, + "loss": 0.342, + "step": 5045 + }, + { + "epoch": 0.3997623291740939, + "grad_norm": 1.7749633383057926, + "learning_rate": 1.3646243207112204e-05, + "loss": 0.2752, + "step": 5046 + }, + { + "epoch": 0.39984155278272926, + "grad_norm": 1.7896738519741786, + "learning_rate": 1.3643853714543389e-05, + "loss": 0.2211, + "step": 5047 + }, + { + "epoch": 0.39992077639136464, + "grad_norm": 1.798033371382471, + "learning_rate": 1.3641463982044343e-05, + "loss": 0.2527, + "step": 5048 + }, + { + "epoch": 0.4, + "grad_norm": 1.6738869725704306, + "learning_rate": 1.3639074009772412e-05, + "loss": 0.3087, + "step": 5049 + }, + { + "epoch": 0.40007922360863535, + "grad_norm": 1.9034250653277713, + "learning_rate": 1.3636683797884971e-05, + "loss": 0.256, + "step": 5050 + }, + { + "epoch": 0.40015844721727073, + "grad_norm": 2.3394754442178867, + "learning_rate": 1.36342933465394e-05, + "loss": 0.2338, + "step": 5051 + }, + { + "epoch": 0.4002376708259061, + "grad_norm": 1.9322442787888647, + "learning_rate": 1.3631902655893096e-05, + "loss": 0.2931, + "step": 5052 + }, + { + "epoch": 0.4003168944345415, + "grad_norm": 1.8460460956167999, + "learning_rate": 1.3629511726103482e-05, + "loss": 0.3765, + "step": 5053 + }, + { + "epoch": 0.4003961180431769, + "grad_norm": 1.9128762740705467, + "learning_rate": 1.3627120557327982e-05, + "loss": 0.3223, + "step": 5054 + }, + { + "epoch": 0.40047534165181226, + "grad_norm": 2.121292997722571, + "learning_rate": 1.3624729149724047e-05, + "loss": 0.3657, + "step": 5055 + }, + { + "epoch": 0.40055456526044764, + "grad_norm": 2.062850495139018, + "learning_rate": 1.362233750344914e-05, + "loss": 0.2723, + "step": 5056 + }, + { + "epoch": 0.40063378886908296, + "grad_norm": 1.5442796276853339, + "learning_rate": 1.3619945618660735e-05, + "loss": 0.1732, + "step": 5057 + }, + { + "epoch": 0.40071301247771834, + "grad_norm": 1.5892232204371144, + "learning_rate": 1.3617553495516332e-05, + "loss": 0.2607, + "step": 5058 + }, + { + "epoch": 0.4007922360863537, + "grad_norm": 1.6667206027788657, + "learning_rate": 1.3615161134173435e-05, + "loss": 0.201, + "step": 5059 + }, + { + "epoch": 0.4008714596949891, + "grad_norm": 1.7295000969213772, + "learning_rate": 1.3612768534789573e-05, + "loss": 0.2719, + "step": 5060 + }, + { + "epoch": 0.4009506833036245, + "grad_norm": 2.139889963806682, + "learning_rate": 1.3610375697522287e-05, + "loss": 0.2277, + "step": 5061 + }, + { + "epoch": 0.40102990691225987, + "grad_norm": 1.8895629784509678, + "learning_rate": 1.3607982622529135e-05, + "loss": 0.3163, + "step": 5062 + }, + { + "epoch": 0.40110913052089525, + "grad_norm": 1.5585929795452302, + "learning_rate": 1.3605589309967686e-05, + "loss": 0.1831, + "step": 5063 + }, + { + "epoch": 0.4011883541295306, + "grad_norm": 2.760265621703642, + "learning_rate": 1.3603195759995531e-05, + "loss": 0.2304, + "step": 5064 + }, + { + "epoch": 0.40126757773816596, + "grad_norm": 1.6261390232067319, + "learning_rate": 1.3600801972770272e-05, + "loss": 0.275, + "step": 5065 + }, + { + "epoch": 0.40134680134680134, + "grad_norm": 1.6511663987681449, + "learning_rate": 1.3598407948449528e-05, + "loss": 0.2163, + "step": 5066 + }, + { + "epoch": 0.4014260249554367, + "grad_norm": 2.0763610174348415, + "learning_rate": 1.3596013687190936e-05, + "loss": 0.3251, + "step": 5067 + }, + { + "epoch": 0.4015052485640721, + "grad_norm": 1.8276755914270248, + "learning_rate": 1.3593619189152146e-05, + "loss": 0.2484, + "step": 5068 + }, + { + "epoch": 0.4015844721727075, + "grad_norm": 2.3228716331882953, + "learning_rate": 1.3591224454490824e-05, + "loss": 0.4665, + "step": 5069 + }, + { + "epoch": 0.40166369578134287, + "grad_norm": 1.711428802085721, + "learning_rate": 1.3588829483364652e-05, + "loss": 0.2556, + "step": 5070 + }, + { + "epoch": 0.4017429193899782, + "grad_norm": 1.8971546017599332, + "learning_rate": 1.3586434275931324e-05, + "loss": 0.2798, + "step": 5071 + }, + { + "epoch": 0.4018221429986136, + "grad_norm": 1.7116477180305871, + "learning_rate": 1.358403883234856e-05, + "loss": 0.2013, + "step": 5072 + }, + { + "epoch": 0.40190136660724896, + "grad_norm": 2.2459854696588915, + "learning_rate": 1.358164315277408e-05, + "loss": 0.3362, + "step": 5073 + }, + { + "epoch": 0.40198059021588434, + "grad_norm": 2.038729214693376, + "learning_rate": 1.3579247237365634e-05, + "loss": 0.2928, + "step": 5074 + }, + { + "epoch": 0.4020598138245197, + "grad_norm": 1.8371381398537223, + "learning_rate": 1.357685108628098e-05, + "loss": 0.2402, + "step": 5075 + }, + { + "epoch": 0.4021390374331551, + "grad_norm": 1.7092859616420752, + "learning_rate": 1.3574454699677893e-05, + "loss": 0.2464, + "step": 5076 + }, + { + "epoch": 0.4022182610417904, + "grad_norm": 1.762109324137486, + "learning_rate": 1.357205807771416e-05, + "loss": 0.2883, + "step": 5077 + }, + { + "epoch": 0.4022974846504258, + "grad_norm": 1.7286172338398547, + "learning_rate": 1.3569661220547596e-05, + "loss": 0.2199, + "step": 5078 + }, + { + "epoch": 0.4023767082590612, + "grad_norm": 2.0910441215810405, + "learning_rate": 1.3567264128336013e-05, + "loss": 0.2181, + "step": 5079 + }, + { + "epoch": 0.40245593186769657, + "grad_norm": 1.6132572458514118, + "learning_rate": 1.3564866801237254e-05, + "loss": 0.2482, + "step": 5080 + }, + { + "epoch": 0.40253515547633195, + "grad_norm": 1.3883172375145247, + "learning_rate": 1.3562469239409166e-05, + "loss": 0.1241, + "step": 5081 + }, + { + "epoch": 0.40261437908496733, + "grad_norm": 1.7380360527567251, + "learning_rate": 1.3560071443009622e-05, + "loss": 0.2233, + "step": 5082 + }, + { + "epoch": 0.4026936026936027, + "grad_norm": 1.6011516627954177, + "learning_rate": 1.3557673412196504e-05, + "loss": 0.25, + "step": 5083 + }, + { + "epoch": 0.40277282630223804, + "grad_norm": 1.637948566671406, + "learning_rate": 1.3555275147127709e-05, + "loss": 0.2378, + "step": 5084 + }, + { + "epoch": 0.4028520499108734, + "grad_norm": 1.6529579288993497, + "learning_rate": 1.3552876647961151e-05, + "loss": 0.2397, + "step": 5085 + }, + { + "epoch": 0.4029312735195088, + "grad_norm": 1.9768099106877912, + "learning_rate": 1.3550477914854766e-05, + "loss": 0.3139, + "step": 5086 + }, + { + "epoch": 0.4030104971281442, + "grad_norm": 1.7441395851358676, + "learning_rate": 1.3548078947966487e-05, + "loss": 0.1639, + "step": 5087 + }, + { + "epoch": 0.40308972073677957, + "grad_norm": 1.4630896070550634, + "learning_rate": 1.3545679747454286e-05, + "loss": 0.1754, + "step": 5088 + }, + { + "epoch": 0.40316894434541495, + "grad_norm": 1.5083975274614427, + "learning_rate": 1.3543280313476135e-05, + "loss": 0.2311, + "step": 5089 + }, + { + "epoch": 0.40324816795405033, + "grad_norm": 1.8381713048763741, + "learning_rate": 1.3540880646190022e-05, + "loss": 0.2806, + "step": 5090 + }, + { + "epoch": 0.40332739156268566, + "grad_norm": 2.004847224048754, + "learning_rate": 1.353848074575396e-05, + "loss": 0.3118, + "step": 5091 + }, + { + "epoch": 0.40340661517132104, + "grad_norm": 1.3312190726986168, + "learning_rate": 1.3536080612325963e-05, + "loss": 0.207, + "step": 5092 + }, + { + "epoch": 0.4034858387799564, + "grad_norm": 1.8958888059753587, + "learning_rate": 1.3533680246064073e-05, + "loss": 0.3282, + "step": 5093 + }, + { + "epoch": 0.4035650623885918, + "grad_norm": 1.7526014917456818, + "learning_rate": 1.3531279647126342e-05, + "loss": 0.2669, + "step": 5094 + }, + { + "epoch": 0.4036442859972272, + "grad_norm": 1.8792939581519124, + "learning_rate": 1.352887881567084e-05, + "loss": 0.2395, + "step": 5095 + }, + { + "epoch": 0.40372350960586256, + "grad_norm": 1.834515236887629, + "learning_rate": 1.3526477751855645e-05, + "loss": 0.3298, + "step": 5096 + }, + { + "epoch": 0.40380273321449794, + "grad_norm": 1.3616851076933911, + "learning_rate": 1.3524076455838859e-05, + "loss": 0.1685, + "step": 5097 + }, + { + "epoch": 0.40388195682313327, + "grad_norm": 1.9097697367325288, + "learning_rate": 1.3521674927778594e-05, + "loss": 0.2278, + "step": 5098 + }, + { + "epoch": 0.40396118043176865, + "grad_norm": 2.0662021147785494, + "learning_rate": 1.3519273167832982e-05, + "loss": 0.3435, + "step": 5099 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 2.0486283907732865, + "learning_rate": 1.3516871176160166e-05, + "loss": 0.3015, + "step": 5100 + }, + { + "epoch": 0.4041196276490394, + "grad_norm": 1.7394115483550667, + "learning_rate": 1.3514468952918303e-05, + "loss": 0.237, + "step": 5101 + }, + { + "epoch": 0.4041988512576748, + "grad_norm": 1.6005302850908452, + "learning_rate": 1.3512066498265572e-05, + "loss": 0.3126, + "step": 5102 + }, + { + "epoch": 0.4042780748663102, + "grad_norm": 1.7243835979696784, + "learning_rate": 1.3509663812360161e-05, + "loss": 0.2343, + "step": 5103 + }, + { + "epoch": 0.40435729847494556, + "grad_norm": 1.9792194218805517, + "learning_rate": 1.3507260895360274e-05, + "loss": 0.2746, + "step": 5104 + }, + { + "epoch": 0.4044365220835809, + "grad_norm": 1.7719273628646164, + "learning_rate": 1.3504857747424133e-05, + "loss": 0.2343, + "step": 5105 + }, + { + "epoch": 0.40451574569221627, + "grad_norm": 1.7110158744769208, + "learning_rate": 1.3502454368709973e-05, + "loss": 0.2644, + "step": 5106 + }, + { + "epoch": 0.40459496930085165, + "grad_norm": 1.8312623245381916, + "learning_rate": 1.3500050759376052e-05, + "loss": 0.3027, + "step": 5107 + }, + { + "epoch": 0.40467419290948703, + "grad_norm": 1.5730144082378932, + "learning_rate": 1.3497646919580623e-05, + "loss": 0.2614, + "step": 5108 + }, + { + "epoch": 0.4047534165181224, + "grad_norm": 1.9684226159090965, + "learning_rate": 1.3495242849481973e-05, + "loss": 0.2642, + "step": 5109 + }, + { + "epoch": 0.4048326401267578, + "grad_norm": 2.256453771822014, + "learning_rate": 1.3492838549238406e-05, + "loss": 0.3131, + "step": 5110 + }, + { + "epoch": 0.4049118637353932, + "grad_norm": 1.6807385618522714, + "learning_rate": 1.349043401900822e-05, + "loss": 0.3136, + "step": 5111 + }, + { + "epoch": 0.4049910873440285, + "grad_norm": 1.787421441463756, + "learning_rate": 1.348802925894975e-05, + "loss": 0.2853, + "step": 5112 + }, + { + "epoch": 0.4050703109526639, + "grad_norm": 1.7384424910122578, + "learning_rate": 1.348562426922134e-05, + "loss": 0.3134, + "step": 5113 + }, + { + "epoch": 0.40514953456129926, + "grad_norm": 2.1141066046581334, + "learning_rate": 1.3483219049981343e-05, + "loss": 0.2466, + "step": 5114 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 1.8931192920210242, + "learning_rate": 1.348081360138813e-05, + "loss": 0.2765, + "step": 5115 + }, + { + "epoch": 0.40530798177857, + "grad_norm": 1.5003005993759302, + "learning_rate": 1.347840792360009e-05, + "loss": 0.2283, + "step": 5116 + }, + { + "epoch": 0.4053872053872054, + "grad_norm": 1.7212036394274006, + "learning_rate": 1.3476002016775626e-05, + "loss": 0.2982, + "step": 5117 + }, + { + "epoch": 0.40546642899584073, + "grad_norm": 1.6347946516884355, + "learning_rate": 1.3473595881073154e-05, + "loss": 0.2245, + "step": 5118 + }, + { + "epoch": 0.4055456526044761, + "grad_norm": 1.778637944123748, + "learning_rate": 1.3471189516651108e-05, + "loss": 0.2585, + "step": 5119 + }, + { + "epoch": 0.4056248762131115, + "grad_norm": 1.9866194928007983, + "learning_rate": 1.3468782923667936e-05, + "loss": 0.2625, + "step": 5120 + }, + { + "epoch": 0.4057040998217469, + "grad_norm": 1.7201189995184551, + "learning_rate": 1.3466376102282098e-05, + "loss": 0.292, + "step": 5121 + }, + { + "epoch": 0.40578332343038226, + "grad_norm": 1.855959980595934, + "learning_rate": 1.3463969052652073e-05, + "loss": 0.2453, + "step": 5122 + }, + { + "epoch": 0.40586254703901764, + "grad_norm": 1.9838084063096186, + "learning_rate": 1.3461561774936352e-05, + "loss": 0.2398, + "step": 5123 + }, + { + "epoch": 0.405941770647653, + "grad_norm": 1.690781306775711, + "learning_rate": 1.3459154269293443e-05, + "loss": 0.2557, + "step": 5124 + }, + { + "epoch": 0.40602099425628835, + "grad_norm": 1.903089976280466, + "learning_rate": 1.3456746535881872e-05, + "loss": 0.3484, + "step": 5125 + }, + { + "epoch": 0.40610021786492373, + "grad_norm": 3.9955197655170127, + "learning_rate": 1.3454338574860175e-05, + "loss": 0.2926, + "step": 5126 + }, + { + "epoch": 0.4061794414735591, + "grad_norm": 2.049455391765512, + "learning_rate": 1.3451930386386902e-05, + "loss": 0.2596, + "step": 5127 + }, + { + "epoch": 0.4062586650821945, + "grad_norm": 2.2948576440661386, + "learning_rate": 1.3449521970620624e-05, + "loss": 0.2584, + "step": 5128 + }, + { + "epoch": 0.4063378886908299, + "grad_norm": 1.6667577359747496, + "learning_rate": 1.3447113327719923e-05, + "loss": 0.2166, + "step": 5129 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 1.8738589842280058, + "learning_rate": 1.3444704457843393e-05, + "loss": 0.2446, + "step": 5130 + }, + { + "epoch": 0.40649633590810064, + "grad_norm": 1.628671397406964, + "learning_rate": 1.3442295361149651e-05, + "loss": 0.2584, + "step": 5131 + }, + { + "epoch": 0.40657555951673596, + "grad_norm": 1.913003810064006, + "learning_rate": 1.3439886037797326e-05, + "loss": 0.1983, + "step": 5132 + }, + { + "epoch": 0.40665478312537134, + "grad_norm": 1.9936332626504387, + "learning_rate": 1.3437476487945051e-05, + "loss": 0.2907, + "step": 5133 + }, + { + "epoch": 0.4067340067340067, + "grad_norm": 1.9936659279781714, + "learning_rate": 1.3435066711751494e-05, + "loss": 0.2472, + "step": 5134 + }, + { + "epoch": 0.4068132303426421, + "grad_norm": 1.693690545361268, + "learning_rate": 1.343265670937532e-05, + "loss": 0.228, + "step": 5135 + }, + { + "epoch": 0.4068924539512775, + "grad_norm": 1.5215631469756292, + "learning_rate": 1.3430246480975218e-05, + "loss": 0.2413, + "step": 5136 + }, + { + "epoch": 0.40697167755991287, + "grad_norm": 2.354871344353882, + "learning_rate": 1.3427836026709892e-05, + "loss": 0.385, + "step": 5137 + }, + { + "epoch": 0.40705090116854825, + "grad_norm": 1.8539774936856872, + "learning_rate": 1.3425425346738057e-05, + "loss": 0.2372, + "step": 5138 + }, + { + "epoch": 0.4071301247771836, + "grad_norm": 1.7400157364653104, + "learning_rate": 1.3423014441218444e-05, + "loss": 0.2652, + "step": 5139 + }, + { + "epoch": 0.40720934838581896, + "grad_norm": 1.6583150102594297, + "learning_rate": 1.3420603310309805e-05, + "loss": 0.2619, + "step": 5140 + }, + { + "epoch": 0.40728857199445434, + "grad_norm": 1.4663055947715418, + "learning_rate": 1.3418191954170892e-05, + "loss": 0.1812, + "step": 5141 + }, + { + "epoch": 0.4073677956030897, + "grad_norm": 1.6256358494549412, + "learning_rate": 1.341578037296049e-05, + "loss": 0.1928, + "step": 5142 + }, + { + "epoch": 0.4074470192117251, + "grad_norm": 1.8235862741093718, + "learning_rate": 1.3413368566837384e-05, + "loss": 0.3374, + "step": 5143 + }, + { + "epoch": 0.4075262428203605, + "grad_norm": 1.7652030353751302, + "learning_rate": 1.341095653596038e-05, + "loss": 0.2203, + "step": 5144 + }, + { + "epoch": 0.40760546642899587, + "grad_norm": 1.7748994600164159, + "learning_rate": 1.3408544280488305e-05, + "loss": 0.2924, + "step": 5145 + }, + { + "epoch": 0.4076846900376312, + "grad_norm": 1.6447995004524583, + "learning_rate": 1.3406131800579985e-05, + "loss": 0.2269, + "step": 5146 + }, + { + "epoch": 0.4077639136462666, + "grad_norm": 1.5126775232309466, + "learning_rate": 1.3403719096394276e-05, + "loss": 0.2859, + "step": 5147 + }, + { + "epoch": 0.40784313725490196, + "grad_norm": 1.7659093894467255, + "learning_rate": 1.3401306168090047e-05, + "loss": 0.2616, + "step": 5148 + }, + { + "epoch": 0.40792236086353734, + "grad_norm": 2.1350416405799484, + "learning_rate": 1.3398893015826166e-05, + "loss": 0.3399, + "step": 5149 + }, + { + "epoch": 0.4080015844721727, + "grad_norm": 1.794507080313387, + "learning_rate": 1.3396479639761541e-05, + "loss": 0.3145, + "step": 5150 + }, + { + "epoch": 0.4080808080808081, + "grad_norm": 2.0923423201487714, + "learning_rate": 1.3394066040055071e-05, + "loss": 0.3554, + "step": 5151 + }, + { + "epoch": 0.4081600316894435, + "grad_norm": 1.8967020436756747, + "learning_rate": 1.3391652216865682e-05, + "loss": 0.3651, + "step": 5152 + }, + { + "epoch": 0.4082392552980788, + "grad_norm": 1.6751382737272622, + "learning_rate": 1.3389238170352318e-05, + "loss": 0.3178, + "step": 5153 + }, + { + "epoch": 0.4083184789067142, + "grad_norm": 1.2631128234870763, + "learning_rate": 1.3386823900673926e-05, + "loss": 0.174, + "step": 5154 + }, + { + "epoch": 0.40839770251534957, + "grad_norm": 1.6265536160393501, + "learning_rate": 1.3384409407989475e-05, + "loss": 0.2561, + "step": 5155 + }, + { + "epoch": 0.40847692612398495, + "grad_norm": 1.6957487309972814, + "learning_rate": 1.3381994692457956e-05, + "loss": 0.1714, + "step": 5156 + }, + { + "epoch": 0.40855614973262033, + "grad_norm": 1.7574147686109933, + "learning_rate": 1.3379579754238354e-05, + "loss": 0.204, + "step": 5157 + }, + { + "epoch": 0.4086353733412557, + "grad_norm": 2.097078066509515, + "learning_rate": 1.3377164593489687e-05, + "loss": 0.2867, + "step": 5158 + }, + { + "epoch": 0.40871459694989104, + "grad_norm": 1.6220661313385847, + "learning_rate": 1.3374749210370983e-05, + "loss": 0.2124, + "step": 5159 + }, + { + "epoch": 0.4087938205585264, + "grad_norm": 1.6110651182051976, + "learning_rate": 1.3372333605041282e-05, + "loss": 0.267, + "step": 5160 + }, + { + "epoch": 0.4088730441671618, + "grad_norm": 1.974748168586152, + "learning_rate": 1.3369917777659638e-05, + "loss": 0.3461, + "step": 5161 + }, + { + "epoch": 0.4089522677757972, + "grad_norm": 1.363031162050341, + "learning_rate": 1.3367501728385124e-05, + "loss": 0.1786, + "step": 5162 + }, + { + "epoch": 0.40903149138443257, + "grad_norm": 1.395920425938681, + "learning_rate": 1.3365085457376823e-05, + "loss": 0.1945, + "step": 5163 + }, + { + "epoch": 0.40911071499306795, + "grad_norm": 1.5738025260062642, + "learning_rate": 1.336266896479384e-05, + "loss": 0.2452, + "step": 5164 + }, + { + "epoch": 0.40918993860170333, + "grad_norm": 2.021218829076842, + "learning_rate": 1.3360252250795282e-05, + "loss": 0.2913, + "step": 5165 + }, + { + "epoch": 0.40926916221033866, + "grad_norm": 1.6007496199491629, + "learning_rate": 1.3357835315540281e-05, + "loss": 0.2754, + "step": 5166 + }, + { + "epoch": 0.40934838581897404, + "grad_norm": 2.333871675387061, + "learning_rate": 1.3355418159187988e-05, + "loss": 0.2185, + "step": 5167 + }, + { + "epoch": 0.4094276094276094, + "grad_norm": 1.9545293396520047, + "learning_rate": 1.335300078189755e-05, + "loss": 0.343, + "step": 5168 + }, + { + "epoch": 0.4095068330362448, + "grad_norm": 1.546073710643735, + "learning_rate": 1.3350583183828143e-05, + "loss": 0.1742, + "step": 5169 + }, + { + "epoch": 0.4095860566448802, + "grad_norm": 1.6717718479486692, + "learning_rate": 1.3348165365138956e-05, + "loss": 0.2436, + "step": 5170 + }, + { + "epoch": 0.40966528025351556, + "grad_norm": 1.4706808735228825, + "learning_rate": 1.3345747325989188e-05, + "loss": 0.2193, + "step": 5171 + }, + { + "epoch": 0.40974450386215094, + "grad_norm": 1.8568205353400227, + "learning_rate": 1.3343329066538064e-05, + "loss": 0.2955, + "step": 5172 + }, + { + "epoch": 0.40982372747078627, + "grad_norm": 1.8285885832835829, + "learning_rate": 1.3340910586944805e-05, + "loss": 0.2064, + "step": 5173 + }, + { + "epoch": 0.40990295107942165, + "grad_norm": 2.050711287397145, + "learning_rate": 1.3338491887368656e-05, + "loss": 0.2372, + "step": 5174 + }, + { + "epoch": 0.40998217468805703, + "grad_norm": 1.9055540603478667, + "learning_rate": 1.3336072967968882e-05, + "loss": 0.2918, + "step": 5175 + }, + { + "epoch": 0.4100613982966924, + "grad_norm": 1.5891300854150547, + "learning_rate": 1.3333653828904755e-05, + "loss": 0.2148, + "step": 5176 + }, + { + "epoch": 0.4101406219053278, + "grad_norm": 1.7067643837485824, + "learning_rate": 1.3331234470335566e-05, + "loss": 0.3335, + "step": 5177 + }, + { + "epoch": 0.4102198455139632, + "grad_norm": 1.331720194791185, + "learning_rate": 1.3328814892420613e-05, + "loss": 0.1647, + "step": 5178 + }, + { + "epoch": 0.41029906912259856, + "grad_norm": 1.834686906446113, + "learning_rate": 1.3326395095319218e-05, + "loss": 0.3405, + "step": 5179 + }, + { + "epoch": 0.4103782927312339, + "grad_norm": 1.5930543906984322, + "learning_rate": 1.3323975079190713e-05, + "loss": 0.2549, + "step": 5180 + }, + { + "epoch": 0.41045751633986927, + "grad_norm": 1.8510150638600589, + "learning_rate": 1.332155484419444e-05, + "loss": 0.3297, + "step": 5181 + }, + { + "epoch": 0.41053673994850465, + "grad_norm": 1.7059094686450418, + "learning_rate": 1.3319134390489765e-05, + "loss": 0.2846, + "step": 5182 + }, + { + "epoch": 0.41061596355714003, + "grad_norm": 1.965460334486044, + "learning_rate": 1.3316713718236061e-05, + "loss": 0.3457, + "step": 5183 + }, + { + "epoch": 0.4106951871657754, + "grad_norm": 1.812780032392159, + "learning_rate": 1.3314292827592716e-05, + "loss": 0.3125, + "step": 5184 + }, + { + "epoch": 0.4107744107744108, + "grad_norm": 1.607173129636733, + "learning_rate": 1.3311871718719137e-05, + "loss": 0.1976, + "step": 5185 + }, + { + "epoch": 0.4108536343830462, + "grad_norm": 1.6835184596520958, + "learning_rate": 1.330945039177474e-05, + "loss": 0.2003, + "step": 5186 + }, + { + "epoch": 0.4109328579916815, + "grad_norm": 1.6440472358397658, + "learning_rate": 1.3307028846918958e-05, + "loss": 0.1903, + "step": 5187 + }, + { + "epoch": 0.4110120816003169, + "grad_norm": 1.8080144299430012, + "learning_rate": 1.3304607084311246e-05, + "loss": 0.29, + "step": 5188 + }, + { + "epoch": 0.41109130520895226, + "grad_norm": 1.987029536338346, + "learning_rate": 1.3302185104111049e-05, + "loss": 0.3819, + "step": 5189 + }, + { + "epoch": 0.41117052881758764, + "grad_norm": 2.2764370943464747, + "learning_rate": 1.3299762906477855e-05, + "loss": 0.3204, + "step": 5190 + }, + { + "epoch": 0.411249752426223, + "grad_norm": 1.6812041309588308, + "learning_rate": 1.3297340491571153e-05, + "loss": 0.2808, + "step": 5191 + }, + { + "epoch": 0.4113289760348584, + "grad_norm": 1.8309150405418233, + "learning_rate": 1.3294917859550444e-05, + "loss": 0.2545, + "step": 5192 + }, + { + "epoch": 0.4114081996434938, + "grad_norm": 2.1501783197844184, + "learning_rate": 1.3292495010575249e-05, + "loss": 0.2171, + "step": 5193 + }, + { + "epoch": 0.4114874232521291, + "grad_norm": 1.436142715549541, + "learning_rate": 1.3290071944805099e-05, + "loss": 0.1733, + "step": 5194 + }, + { + "epoch": 0.4115666468607645, + "grad_norm": 1.9903326398661199, + "learning_rate": 1.3287648662399544e-05, + "loss": 0.3297, + "step": 5195 + }, + { + "epoch": 0.4116458704693999, + "grad_norm": 2.0961113365532027, + "learning_rate": 1.3285225163518141e-05, + "loss": 0.3485, + "step": 5196 + }, + { + "epoch": 0.41172509407803526, + "grad_norm": 2.1688706093461314, + "learning_rate": 1.328280144832047e-05, + "loss": 0.2803, + "step": 5197 + }, + { + "epoch": 0.41180431768667064, + "grad_norm": 2.093353150979585, + "learning_rate": 1.3280377516966118e-05, + "loss": 0.2104, + "step": 5198 + }, + { + "epoch": 0.411883541295306, + "grad_norm": 2.196762821673357, + "learning_rate": 1.3277953369614696e-05, + "loss": 0.1586, + "step": 5199 + }, + { + "epoch": 0.41196276490394135, + "grad_norm": 1.6668383747799789, + "learning_rate": 1.3275529006425808e-05, + "loss": 0.2006, + "step": 5200 + }, + { + "epoch": 0.41204198851257673, + "grad_norm": 1.711982844670911, + "learning_rate": 1.3273104427559102e-05, + "loss": 0.1579, + "step": 5201 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 1.6314340648189252, + "learning_rate": 1.3270679633174219e-05, + "loss": 0.2625, + "step": 5202 + }, + { + "epoch": 0.4122004357298475, + "grad_norm": 1.5120713409937838, + "learning_rate": 1.3268254623430817e-05, + "loss": 0.1082, + "step": 5203 + }, + { + "epoch": 0.4122796593384829, + "grad_norm": 1.7489700885427808, + "learning_rate": 1.3265829398488576e-05, + "loss": 0.237, + "step": 5204 + }, + { + "epoch": 0.41235888294711825, + "grad_norm": 1.5012984104698848, + "learning_rate": 1.3263403958507181e-05, + "loss": 0.1677, + "step": 5205 + }, + { + "epoch": 0.41243810655575364, + "grad_norm": 1.3957259372424156, + "learning_rate": 1.326097830364634e-05, + "loss": 0.2393, + "step": 5206 + }, + { + "epoch": 0.41251733016438896, + "grad_norm": 1.8903925559142896, + "learning_rate": 1.3258552434065768e-05, + "loss": 0.3144, + "step": 5207 + }, + { + "epoch": 0.41259655377302434, + "grad_norm": 2.0445936490573486, + "learning_rate": 1.3256126349925195e-05, + "loss": 0.277, + "step": 5208 + }, + { + "epoch": 0.4126757773816597, + "grad_norm": 1.5232667035235046, + "learning_rate": 1.3253700051384371e-05, + "loss": 0.3095, + "step": 5209 + }, + { + "epoch": 0.4127550009902951, + "grad_norm": 1.7996338482003338, + "learning_rate": 1.3251273538603056e-05, + "loss": 0.239, + "step": 5210 + }, + { + "epoch": 0.4128342245989305, + "grad_norm": 2.0306475842675744, + "learning_rate": 1.3248846811741021e-05, + "loss": 0.3208, + "step": 5211 + }, + { + "epoch": 0.41291344820756587, + "grad_norm": 1.7522197744915726, + "learning_rate": 1.3246419870958056e-05, + "loss": 0.2064, + "step": 5212 + }, + { + "epoch": 0.41299267181620125, + "grad_norm": 1.8621301770693548, + "learning_rate": 1.3243992716413962e-05, + "loss": 0.2948, + "step": 5213 + }, + { + "epoch": 0.4130718954248366, + "grad_norm": 1.8690129098080024, + "learning_rate": 1.324156534826856e-05, + "loss": 0.2599, + "step": 5214 + }, + { + "epoch": 0.41315111903347196, + "grad_norm": 1.8768059540239044, + "learning_rate": 1.3239137766681675e-05, + "loss": 0.1781, + "step": 5215 + }, + { + "epoch": 0.41323034264210734, + "grad_norm": 2.045483786114196, + "learning_rate": 1.3236709971813153e-05, + "loss": 0.238, + "step": 5216 + }, + { + "epoch": 0.4133095662507427, + "grad_norm": 1.7166613052071729, + "learning_rate": 1.3234281963822856e-05, + "loss": 0.2379, + "step": 5217 + }, + { + "epoch": 0.4133887898593781, + "grad_norm": 2.114992324498812, + "learning_rate": 1.3231853742870652e-05, + "loss": 0.3007, + "step": 5218 + }, + { + "epoch": 0.4134680134680135, + "grad_norm": 1.8623630779606364, + "learning_rate": 1.322942530911643e-05, + "loss": 0.2617, + "step": 5219 + }, + { + "epoch": 0.41354723707664887, + "grad_norm": 1.4836575719854659, + "learning_rate": 1.3226996662720094e-05, + "loss": 0.1843, + "step": 5220 + }, + { + "epoch": 0.4136264606852842, + "grad_norm": 1.8596948321747533, + "learning_rate": 1.322456780384155e-05, + "loss": 0.3164, + "step": 5221 + }, + { + "epoch": 0.4137056842939196, + "grad_norm": 1.6464179775636327, + "learning_rate": 1.3222138732640732e-05, + "loss": 0.1999, + "step": 5222 + }, + { + "epoch": 0.41378490790255495, + "grad_norm": 1.937607391187395, + "learning_rate": 1.3219709449277584e-05, + "loss": 0.2381, + "step": 5223 + }, + { + "epoch": 0.41386413151119034, + "grad_norm": 1.922053920233908, + "learning_rate": 1.3217279953912061e-05, + "loss": 0.2975, + "step": 5224 + }, + { + "epoch": 0.4139433551198257, + "grad_norm": 1.6567096135748143, + "learning_rate": 1.3214850246704134e-05, + "loss": 0.2623, + "step": 5225 + }, + { + "epoch": 0.4140225787284611, + "grad_norm": 1.4989643447832592, + "learning_rate": 1.3212420327813789e-05, + "loss": 0.2222, + "step": 5226 + }, + { + "epoch": 0.4141018023370965, + "grad_norm": 1.9283219327322054, + "learning_rate": 1.3209990197401016e-05, + "loss": 0.2744, + "step": 5227 + }, + { + "epoch": 0.4141810259457318, + "grad_norm": 1.7374988023831457, + "learning_rate": 1.3207559855625842e-05, + "loss": 0.2573, + "step": 5228 + }, + { + "epoch": 0.4142602495543672, + "grad_norm": 1.8155447949094714, + "learning_rate": 1.3205129302648282e-05, + "loss": 0.3124, + "step": 5229 + }, + { + "epoch": 0.41433947316300257, + "grad_norm": 2.101997272370507, + "learning_rate": 1.3202698538628376e-05, + "loss": 0.2681, + "step": 5230 + }, + { + "epoch": 0.41441869677163795, + "grad_norm": 2.0657793079428997, + "learning_rate": 1.3200267563726187e-05, + "loss": 0.3074, + "step": 5231 + }, + { + "epoch": 0.41449792038027333, + "grad_norm": 2.3618194154949776, + "learning_rate": 1.3197836378101773e-05, + "loss": 0.4118, + "step": 5232 + }, + { + "epoch": 0.4145771439889087, + "grad_norm": 1.817317713696645, + "learning_rate": 1.3195404981915223e-05, + "loss": 0.2826, + "step": 5233 + }, + { + "epoch": 0.4146563675975441, + "grad_norm": 1.814930338028792, + "learning_rate": 1.3192973375326635e-05, + "loss": 0.329, + "step": 5234 + }, + { + "epoch": 0.4147355912061794, + "grad_norm": 1.9300993049829744, + "learning_rate": 1.3190541558496106e-05, + "loss": 0.3374, + "step": 5235 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 1.7259491977036778, + "learning_rate": 1.318810953158377e-05, + "loss": 0.3181, + "step": 5236 + }, + { + "epoch": 0.4148940384234502, + "grad_norm": 1.8701968392638888, + "learning_rate": 1.3185677294749763e-05, + "loss": 0.2945, + "step": 5237 + }, + { + "epoch": 0.41497326203208557, + "grad_norm": 1.6894679515437134, + "learning_rate": 1.3183244848154232e-05, + "loss": 0.3069, + "step": 5238 + }, + { + "epoch": 0.41505248564072095, + "grad_norm": 1.9965917243085374, + "learning_rate": 1.3180812191957346e-05, + "loss": 0.3633, + "step": 5239 + }, + { + "epoch": 0.41513170924935633, + "grad_norm": 1.4216688900832601, + "learning_rate": 1.3178379326319284e-05, + "loss": 0.2976, + "step": 5240 + }, + { + "epoch": 0.41521093285799165, + "grad_norm": 1.8794530865962518, + "learning_rate": 1.3175946251400234e-05, + "loss": 0.2979, + "step": 5241 + }, + { + "epoch": 0.41529015646662704, + "grad_norm": 1.2856985918775345, + "learning_rate": 1.3173512967360406e-05, + "loss": 0.1585, + "step": 5242 + }, + { + "epoch": 0.4153693800752624, + "grad_norm": 1.6765815028562654, + "learning_rate": 1.317107947436002e-05, + "loss": 0.332, + "step": 5243 + }, + { + "epoch": 0.4154486036838978, + "grad_norm": 1.5236883384010385, + "learning_rate": 1.3168645772559308e-05, + "loss": 0.2816, + "step": 5244 + }, + { + "epoch": 0.4155278272925332, + "grad_norm": 1.5648584965519263, + "learning_rate": 1.3166211862118519e-05, + "loss": 0.2192, + "step": 5245 + }, + { + "epoch": 0.41560705090116856, + "grad_norm": 1.5256879916086248, + "learning_rate": 1.3163777743197912e-05, + "loss": 0.2118, + "step": 5246 + }, + { + "epoch": 0.41568627450980394, + "grad_norm": 1.5205413116315978, + "learning_rate": 1.3161343415957767e-05, + "loss": 0.2379, + "step": 5247 + }, + { + "epoch": 0.41576549811843927, + "grad_norm": 1.4177960353245953, + "learning_rate": 1.3158908880558366e-05, + "loss": 0.1392, + "step": 5248 + }, + { + "epoch": 0.41584472172707465, + "grad_norm": 1.7540228890352578, + "learning_rate": 1.3156474137160015e-05, + "loss": 0.2691, + "step": 5249 + }, + { + "epoch": 0.41592394533571003, + "grad_norm": 2.2833412720352544, + "learning_rate": 1.3154039185923034e-05, + "loss": 0.3523, + "step": 5250 + }, + { + "epoch": 0.4160031689443454, + "grad_norm": 2.235976518031537, + "learning_rate": 1.3151604027007744e-05, + "loss": 0.4049, + "step": 5251 + }, + { + "epoch": 0.4160823925529808, + "grad_norm": 1.4438007122077707, + "learning_rate": 1.3149168660574495e-05, + "loss": 0.2135, + "step": 5252 + }, + { + "epoch": 0.4161616161616162, + "grad_norm": 1.8661355462098546, + "learning_rate": 1.3146733086783646e-05, + "loss": 0.3075, + "step": 5253 + }, + { + "epoch": 0.41624083977025156, + "grad_norm": 1.775842709710853, + "learning_rate": 1.3144297305795559e-05, + "loss": 0.2227, + "step": 5254 + }, + { + "epoch": 0.4163200633788869, + "grad_norm": 1.656949959918874, + "learning_rate": 1.3141861317770628e-05, + "loss": 0.2423, + "step": 5255 + }, + { + "epoch": 0.41639928698752227, + "grad_norm": 1.8036213302740065, + "learning_rate": 1.3139425122869244e-05, + "loss": 0.3101, + "step": 5256 + }, + { + "epoch": 0.41647851059615765, + "grad_norm": 1.6799140032312851, + "learning_rate": 1.3136988721251823e-05, + "loss": 0.2708, + "step": 5257 + }, + { + "epoch": 0.41655773420479303, + "grad_norm": 2.296680665774159, + "learning_rate": 1.3134552113078788e-05, + "loss": 0.3344, + "step": 5258 + }, + { + "epoch": 0.4166369578134284, + "grad_norm": 2.0155753623581893, + "learning_rate": 1.3132115298510579e-05, + "loss": 0.3025, + "step": 5259 + }, + { + "epoch": 0.4167161814220638, + "grad_norm": 1.9435443817118128, + "learning_rate": 1.312967827770765e-05, + "loss": 0.3456, + "step": 5260 + }, + { + "epoch": 0.4167954050306992, + "grad_norm": 1.8889477335751863, + "learning_rate": 1.3127241050830463e-05, + "loss": 0.3628, + "step": 5261 + }, + { + "epoch": 0.4168746286393345, + "grad_norm": 1.804585188377334, + "learning_rate": 1.3124803618039501e-05, + "loss": 0.2484, + "step": 5262 + }, + { + "epoch": 0.4169538522479699, + "grad_norm": 1.689622961679414, + "learning_rate": 1.3122365979495259e-05, + "loss": 0.2295, + "step": 5263 + }, + { + "epoch": 0.41703307585660526, + "grad_norm": 1.6133616251922587, + "learning_rate": 1.3119928135358238e-05, + "loss": 0.2047, + "step": 5264 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 1.788420824189975, + "learning_rate": 1.3117490085788963e-05, + "loss": 0.2654, + "step": 5265 + }, + { + "epoch": 0.417191523073876, + "grad_norm": 1.9769138275297493, + "learning_rate": 1.3115051830947966e-05, + "loss": 0.365, + "step": 5266 + }, + { + "epoch": 0.4172707466825114, + "grad_norm": 1.7116061003029828, + "learning_rate": 1.3112613370995792e-05, + "loss": 0.2633, + "step": 5267 + }, + { + "epoch": 0.4173499702911468, + "grad_norm": 1.8468645056452868, + "learning_rate": 1.3110174706093007e-05, + "loss": 0.3064, + "step": 5268 + }, + { + "epoch": 0.4174291938997821, + "grad_norm": 2.0785124181840455, + "learning_rate": 1.3107735836400184e-05, + "loss": 0.2956, + "step": 5269 + }, + { + "epoch": 0.4175084175084175, + "grad_norm": 1.9209309419494196, + "learning_rate": 1.3105296762077906e-05, + "loss": 0.2608, + "step": 5270 + }, + { + "epoch": 0.4175876411170529, + "grad_norm": 1.5873708085738265, + "learning_rate": 1.3102857483286781e-05, + "loss": 0.1823, + "step": 5271 + }, + { + "epoch": 0.41766686472568826, + "grad_norm": 2.023338451498811, + "learning_rate": 1.310041800018742e-05, + "loss": 0.3227, + "step": 5272 + }, + { + "epoch": 0.41774608833432364, + "grad_norm": 1.7275235130921136, + "learning_rate": 1.3097978312940453e-05, + "loss": 0.2771, + "step": 5273 + }, + { + "epoch": 0.417825311942959, + "grad_norm": 1.5499425958351882, + "learning_rate": 1.309553842170652e-05, + "loss": 0.2284, + "step": 5274 + }, + { + "epoch": 0.4179045355515944, + "grad_norm": 1.7397350796249147, + "learning_rate": 1.3093098326646277e-05, + "loss": 0.1957, + "step": 5275 + }, + { + "epoch": 0.41798375916022973, + "grad_norm": 1.4392425689293202, + "learning_rate": 1.3090658027920391e-05, + "loss": 0.1714, + "step": 5276 + }, + { + "epoch": 0.4180629827688651, + "grad_norm": 1.9745192965774891, + "learning_rate": 1.3088217525689546e-05, + "loss": 0.3665, + "step": 5277 + }, + { + "epoch": 0.4181422063775005, + "grad_norm": 1.701401164972941, + "learning_rate": 1.3085776820114435e-05, + "loss": 0.2309, + "step": 5278 + }, + { + "epoch": 0.4182214299861359, + "grad_norm": 1.8861890401580752, + "learning_rate": 1.3083335911355768e-05, + "loss": 0.3023, + "step": 5279 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 1.639537165885871, + "learning_rate": 1.3080894799574271e-05, + "loss": 0.2693, + "step": 5280 + }, + { + "epoch": 0.41837987720340664, + "grad_norm": 1.7811459961214888, + "learning_rate": 1.3078453484930674e-05, + "loss": 0.2919, + "step": 5281 + }, + { + "epoch": 0.41845910081204196, + "grad_norm": 1.6712278031507863, + "learning_rate": 1.3076011967585727e-05, + "loss": 0.2327, + "step": 5282 + }, + { + "epoch": 0.41853832442067734, + "grad_norm": 1.7417391340591746, + "learning_rate": 1.3073570247700192e-05, + "loss": 0.3444, + "step": 5283 + }, + { + "epoch": 0.4186175480293127, + "grad_norm": 1.931846836836233, + "learning_rate": 1.3071128325434845e-05, + "loss": 0.3352, + "step": 5284 + }, + { + "epoch": 0.4186967716379481, + "grad_norm": 1.4572105965174231, + "learning_rate": 1.3068686200950475e-05, + "loss": 0.1631, + "step": 5285 + }, + { + "epoch": 0.4187759952465835, + "grad_norm": 1.3912395463488743, + "learning_rate": 1.3066243874407886e-05, + "loss": 0.2103, + "step": 5286 + }, + { + "epoch": 0.41885521885521887, + "grad_norm": 1.5245415605188581, + "learning_rate": 1.306380134596789e-05, + "loss": 0.1921, + "step": 5287 + }, + { + "epoch": 0.41893444246385425, + "grad_norm": 1.7795315791154995, + "learning_rate": 1.306135861579132e-05, + "loss": 0.2995, + "step": 5288 + }, + { + "epoch": 0.4190136660724896, + "grad_norm": 1.534087796584696, + "learning_rate": 1.3058915684039013e-05, + "loss": 0.2132, + "step": 5289 + }, + { + "epoch": 0.41909288968112496, + "grad_norm": 1.5299809687136736, + "learning_rate": 1.3056472550871829e-05, + "loss": 0.193, + "step": 5290 + }, + { + "epoch": 0.41917211328976034, + "grad_norm": 1.65777487310565, + "learning_rate": 1.3054029216450632e-05, + "loss": 0.3148, + "step": 5291 + }, + { + "epoch": 0.4192513368983957, + "grad_norm": 2.090520647761956, + "learning_rate": 1.3051585680936305e-05, + "loss": 0.3395, + "step": 5292 + }, + { + "epoch": 0.4193305605070311, + "grad_norm": 1.743699918943753, + "learning_rate": 1.304914194448975e-05, + "loss": 0.1986, + "step": 5293 + }, + { + "epoch": 0.4194097841156665, + "grad_norm": 1.9008619441110146, + "learning_rate": 1.3046698007271864e-05, + "loss": 0.2787, + "step": 5294 + }, + { + "epoch": 0.41948900772430187, + "grad_norm": 1.5416233467110712, + "learning_rate": 1.3044253869443575e-05, + "loss": 0.2747, + "step": 5295 + }, + { + "epoch": 0.4195682313329372, + "grad_norm": 1.4642403972621758, + "learning_rate": 1.3041809531165819e-05, + "loss": 0.1994, + "step": 5296 + }, + { + "epoch": 0.4196474549415726, + "grad_norm": 2.320887547839888, + "learning_rate": 1.3039364992599538e-05, + "loss": 0.3794, + "step": 5297 + }, + { + "epoch": 0.41972667855020795, + "grad_norm": 1.862682209757833, + "learning_rate": 1.30369202539057e-05, + "loss": 0.2795, + "step": 5298 + }, + { + "epoch": 0.41980590215884334, + "grad_norm": 1.4791216553717002, + "learning_rate": 1.3034475315245273e-05, + "loss": 0.2739, + "step": 5299 + }, + { + "epoch": 0.4198851257674787, + "grad_norm": 2.349124545979662, + "learning_rate": 1.303203017677925e-05, + "loss": 0.2666, + "step": 5300 + }, + { + "epoch": 0.4199643493761141, + "grad_norm": 1.8222694978081466, + "learning_rate": 1.302958483866863e-05, + "loss": 0.3268, + "step": 5301 + }, + { + "epoch": 0.4200435729847495, + "grad_norm": 1.5554712641977537, + "learning_rate": 1.3027139301074423e-05, + "loss": 0.2758, + "step": 5302 + }, + { + "epoch": 0.4201227965933848, + "grad_norm": 1.5951832067539373, + "learning_rate": 1.3024693564157658e-05, + "loss": 0.2158, + "step": 5303 + }, + { + "epoch": 0.4202020202020202, + "grad_norm": 2.3035560622154607, + "learning_rate": 1.3022247628079381e-05, + "loss": 0.3497, + "step": 5304 + }, + { + "epoch": 0.42028124381065557, + "grad_norm": 2.016302820851935, + "learning_rate": 1.3019801493000634e-05, + "loss": 0.2421, + "step": 5305 + }, + { + "epoch": 0.42036046741929095, + "grad_norm": 1.625619973075902, + "learning_rate": 1.3017355159082495e-05, + "loss": 0.1999, + "step": 5306 + }, + { + "epoch": 0.42043969102792633, + "grad_norm": 1.3797597210355135, + "learning_rate": 1.3014908626486032e-05, + "loss": 0.2551, + "step": 5307 + }, + { + "epoch": 0.4205189146365617, + "grad_norm": 1.4962566611498553, + "learning_rate": 1.3012461895372343e-05, + "loss": 0.2045, + "step": 5308 + }, + { + "epoch": 0.4205981382451971, + "grad_norm": 1.7202227734224333, + "learning_rate": 1.3010014965902535e-05, + "loss": 0.2211, + "step": 5309 + }, + { + "epoch": 0.4206773618538324, + "grad_norm": 1.995332339798482, + "learning_rate": 1.3007567838237725e-05, + "loss": 0.2348, + "step": 5310 + }, + { + "epoch": 0.4207565854624678, + "grad_norm": 1.818271065911864, + "learning_rate": 1.3005120512539042e-05, + "loss": 0.286, + "step": 5311 + }, + { + "epoch": 0.4208358090711032, + "grad_norm": 1.7061833414089553, + "learning_rate": 1.300267298896764e-05, + "loss": 0.2617, + "step": 5312 + }, + { + "epoch": 0.42091503267973857, + "grad_norm": 2.0295291395244774, + "learning_rate": 1.3000225267684663e-05, + "loss": 0.2941, + "step": 5313 + }, + { + "epoch": 0.42099425628837395, + "grad_norm": 1.8220830654245137, + "learning_rate": 1.2997777348851288e-05, + "loss": 0.2978, + "step": 5314 + }, + { + "epoch": 0.42107347989700933, + "grad_norm": 1.9948853356414633, + "learning_rate": 1.2995329232628702e-05, + "loss": 0.2662, + "step": 5315 + }, + { + "epoch": 0.42115270350564465, + "grad_norm": 1.5622268042548941, + "learning_rate": 1.2992880919178097e-05, + "loss": 0.2519, + "step": 5316 + }, + { + "epoch": 0.42123192711428004, + "grad_norm": 1.7482158606983982, + "learning_rate": 1.2990432408660682e-05, + "loss": 0.1897, + "step": 5317 + }, + { + "epoch": 0.4213111507229154, + "grad_norm": 1.5901910918658888, + "learning_rate": 1.2987983701237688e-05, + "loss": 0.2713, + "step": 5318 + }, + { + "epoch": 0.4213903743315508, + "grad_norm": 2.4458224658396515, + "learning_rate": 1.298553479707034e-05, + "loss": 0.3555, + "step": 5319 + }, + { + "epoch": 0.4214695979401862, + "grad_norm": 1.6720320572727707, + "learning_rate": 1.2983085696319892e-05, + "loss": 0.28, + "step": 5320 + }, + { + "epoch": 0.42154882154882156, + "grad_norm": 1.5459557603633542, + "learning_rate": 1.2980636399147606e-05, + "loss": 0.1933, + "step": 5321 + }, + { + "epoch": 0.42162804515745694, + "grad_norm": 1.8197302528710626, + "learning_rate": 1.2978186905714752e-05, + "loss": 0.2153, + "step": 5322 + }, + { + "epoch": 0.42170726876609227, + "grad_norm": 1.9509194573752373, + "learning_rate": 1.2975737216182625e-05, + "loss": 0.2517, + "step": 5323 + }, + { + "epoch": 0.42178649237472765, + "grad_norm": 1.3747995294809678, + "learning_rate": 1.2973287330712516e-05, + "loss": 0.1617, + "step": 5324 + }, + { + "epoch": 0.42186571598336303, + "grad_norm": 1.6873013316156398, + "learning_rate": 1.2970837249465746e-05, + "loss": 0.2788, + "step": 5325 + }, + { + "epoch": 0.4219449395919984, + "grad_norm": 1.9206875941699928, + "learning_rate": 1.2968386972603635e-05, + "loss": 0.2904, + "step": 5326 + }, + { + "epoch": 0.4220241632006338, + "grad_norm": 1.8821599668747608, + "learning_rate": 1.2965936500287526e-05, + "loss": 0.2267, + "step": 5327 + }, + { + "epoch": 0.4221033868092692, + "grad_norm": 1.6882944969178815, + "learning_rate": 1.2963485832678772e-05, + "loss": 0.2162, + "step": 5328 + }, + { + "epoch": 0.42218261041790456, + "grad_norm": 1.9782034352060445, + "learning_rate": 1.2961034969938732e-05, + "loss": 0.1963, + "step": 5329 + }, + { + "epoch": 0.4222618340265399, + "grad_norm": 1.8705005015044316, + "learning_rate": 1.2958583912228785e-05, + "loss": 0.3257, + "step": 5330 + }, + { + "epoch": 0.42234105763517527, + "grad_norm": 1.506027358609764, + "learning_rate": 1.295613265971033e-05, + "loss": 0.1935, + "step": 5331 + }, + { + "epoch": 0.42242028124381065, + "grad_norm": 1.8642259177441336, + "learning_rate": 1.2953681212544757e-05, + "loss": 0.2818, + "step": 5332 + }, + { + "epoch": 0.42249950485244603, + "grad_norm": 1.915602154327558, + "learning_rate": 1.2951229570893493e-05, + "loss": 0.2535, + "step": 5333 + }, + { + "epoch": 0.4225787284610814, + "grad_norm": 1.9270586802060459, + "learning_rate": 1.2948777734917961e-05, + "loss": 0.2655, + "step": 5334 + }, + { + "epoch": 0.4226579520697168, + "grad_norm": 1.8091362621772333, + "learning_rate": 1.2946325704779602e-05, + "loss": 0.2943, + "step": 5335 + }, + { + "epoch": 0.4227371756783522, + "grad_norm": 1.9385980196787909, + "learning_rate": 1.2943873480639875e-05, + "loss": 0.2716, + "step": 5336 + }, + { + "epoch": 0.4228163992869875, + "grad_norm": 1.6522502876088858, + "learning_rate": 1.294142106266024e-05, + "loss": 0.3378, + "step": 5337 + }, + { + "epoch": 0.4228956228956229, + "grad_norm": 2.312137753778925, + "learning_rate": 1.2938968451002183e-05, + "loss": 0.3319, + "step": 5338 + }, + { + "epoch": 0.42297484650425826, + "grad_norm": 1.7757960413210792, + "learning_rate": 1.2936515645827198e-05, + "loss": 0.359, + "step": 5339 + }, + { + "epoch": 0.42305407011289364, + "grad_norm": 1.6152680203144114, + "learning_rate": 1.2934062647296783e-05, + "loss": 0.1985, + "step": 5340 + }, + { + "epoch": 0.423133293721529, + "grad_norm": 1.7620697388118594, + "learning_rate": 1.2931609455572462e-05, + "loss": 0.2423, + "step": 5341 + }, + { + "epoch": 0.4232125173301644, + "grad_norm": 1.8255664656678905, + "learning_rate": 1.2929156070815765e-05, + "loss": 0.2393, + "step": 5342 + }, + { + "epoch": 0.4232917409387998, + "grad_norm": 1.9061765497731176, + "learning_rate": 1.2926702493188235e-05, + "loss": 0.3265, + "step": 5343 + }, + { + "epoch": 0.4233709645474351, + "grad_norm": 1.7186140003029269, + "learning_rate": 1.292424872285143e-05, + "loss": 0.2088, + "step": 5344 + }, + { + "epoch": 0.4234501881560705, + "grad_norm": 1.935200215980244, + "learning_rate": 1.2921794759966913e-05, + "loss": 0.2368, + "step": 5345 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 1.644584974229887, + "learning_rate": 1.2919340604696272e-05, + "loss": 0.237, + "step": 5346 + }, + { + "epoch": 0.42360863537334126, + "grad_norm": 1.707562127909152, + "learning_rate": 1.29168862572011e-05, + "loss": 0.3126, + "step": 5347 + }, + { + "epoch": 0.42368785898197664, + "grad_norm": 1.5626803481090905, + "learning_rate": 1.2914431717643e-05, + "loss": 0.3265, + "step": 5348 + }, + { + "epoch": 0.423767082590612, + "grad_norm": 1.8831074350709485, + "learning_rate": 1.2911976986183598e-05, + "loss": 0.301, + "step": 5349 + }, + { + "epoch": 0.4238463061992474, + "grad_norm": 1.8146522411546355, + "learning_rate": 1.2909522062984524e-05, + "loss": 0.2458, + "step": 5350 + }, + { + "epoch": 0.42392552980788273, + "grad_norm": 1.4448526173348721, + "learning_rate": 1.290706694820742e-05, + "loss": 0.2366, + "step": 5351 + }, + { + "epoch": 0.4240047534165181, + "grad_norm": 1.3324539222264242, + "learning_rate": 1.2904611642013945e-05, + "loss": 0.1628, + "step": 5352 + }, + { + "epoch": 0.4240839770251535, + "grad_norm": 1.7934542794522415, + "learning_rate": 1.2902156144565769e-05, + "loss": 0.2851, + "step": 5353 + }, + { + "epoch": 0.4241632006337889, + "grad_norm": 1.3289472061397964, + "learning_rate": 1.2899700456024576e-05, + "loss": 0.2118, + "step": 5354 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 1.8400555231629665, + "learning_rate": 1.2897244576552062e-05, + "loss": 0.3162, + "step": 5355 + }, + { + "epoch": 0.42432164785105964, + "grad_norm": 1.666685062296082, + "learning_rate": 1.289478850630993e-05, + "loss": 0.2224, + "step": 5356 + }, + { + "epoch": 0.42440087145969496, + "grad_norm": 1.5062152873832801, + "learning_rate": 1.2892332245459904e-05, + "loss": 0.2847, + "step": 5357 + }, + { + "epoch": 0.42448009506833034, + "grad_norm": 1.473074964787706, + "learning_rate": 1.288987579416372e-05, + "loss": 0.1833, + "step": 5358 + }, + { + "epoch": 0.4245593186769657, + "grad_norm": 1.5985653513362799, + "learning_rate": 1.2887419152583117e-05, + "loss": 0.2457, + "step": 5359 + }, + { + "epoch": 0.4246385422856011, + "grad_norm": 2.153596681911628, + "learning_rate": 1.2884962320879857e-05, + "loss": 0.2969, + "step": 5360 + }, + { + "epoch": 0.4247177658942365, + "grad_norm": 2.0702961719461226, + "learning_rate": 1.2882505299215711e-05, + "loss": 0.3443, + "step": 5361 + }, + { + "epoch": 0.42479698950287187, + "grad_norm": 1.702787559197427, + "learning_rate": 1.288004808775246e-05, + "loss": 0.197, + "step": 5362 + }, + { + "epoch": 0.42487621311150725, + "grad_norm": 1.4922726383266287, + "learning_rate": 1.28775906866519e-05, + "loss": 0.1376, + "step": 5363 + }, + { + "epoch": 0.4249554367201426, + "grad_norm": 1.87135215453094, + "learning_rate": 1.2875133096075839e-05, + "loss": 0.2684, + "step": 5364 + }, + { + "epoch": 0.42503466032877796, + "grad_norm": 1.8583033954717476, + "learning_rate": 1.2872675316186096e-05, + "loss": 0.2831, + "step": 5365 + }, + { + "epoch": 0.42511388393741334, + "grad_norm": 2.1650304660574746, + "learning_rate": 1.2870217347144511e-05, + "loss": 0.3172, + "step": 5366 + }, + { + "epoch": 0.4251931075460487, + "grad_norm": 1.6590298553305656, + "learning_rate": 1.2867759189112921e-05, + "loss": 0.2436, + "step": 5367 + }, + { + "epoch": 0.4252723311546841, + "grad_norm": 1.7659124444747074, + "learning_rate": 1.2865300842253188e-05, + "loss": 0.1691, + "step": 5368 + }, + { + "epoch": 0.4253515547633195, + "grad_norm": 1.7124052522714717, + "learning_rate": 1.2862842306727181e-05, + "loss": 0.2355, + "step": 5369 + }, + { + "epoch": 0.42543077837195487, + "grad_norm": 1.880332769584433, + "learning_rate": 1.2860383582696783e-05, + "loss": 0.2713, + "step": 5370 + }, + { + "epoch": 0.4255100019805902, + "grad_norm": 1.8693987209668768, + "learning_rate": 1.2857924670323892e-05, + "loss": 0.2671, + "step": 5371 + }, + { + "epoch": 0.4255892255892256, + "grad_norm": 2.7753845810807927, + "learning_rate": 1.2855465569770407e-05, + "loss": 0.2372, + "step": 5372 + }, + { + "epoch": 0.42566844919786095, + "grad_norm": 1.8058623258430284, + "learning_rate": 1.2853006281198257e-05, + "loss": 0.2261, + "step": 5373 + }, + { + "epoch": 0.42574767280649634, + "grad_norm": 2.2840985239287512, + "learning_rate": 1.2850546804769372e-05, + "loss": 0.3987, + "step": 5374 + }, + { + "epoch": 0.4258268964151317, + "grad_norm": 1.746917631561016, + "learning_rate": 1.2848087140645695e-05, + "loss": 0.3313, + "step": 5375 + }, + { + "epoch": 0.4259061200237671, + "grad_norm": 1.524329647115645, + "learning_rate": 1.2845627288989186e-05, + "loss": 0.1749, + "step": 5376 + }, + { + "epoch": 0.4259853436324025, + "grad_norm": 1.842257345403571, + "learning_rate": 1.284316724996181e-05, + "loss": 0.2819, + "step": 5377 + }, + { + "epoch": 0.4260645672410378, + "grad_norm": 2.158195232222975, + "learning_rate": 1.2840707023725552e-05, + "loss": 0.2363, + "step": 5378 + }, + { + "epoch": 0.4261437908496732, + "grad_norm": 1.4442132356995627, + "learning_rate": 1.2838246610442406e-05, + "loss": 0.2089, + "step": 5379 + }, + { + "epoch": 0.42622301445830857, + "grad_norm": 1.5426906647674505, + "learning_rate": 1.2835786010274376e-05, + "loss": 0.2405, + "step": 5380 + }, + { + "epoch": 0.42630223806694395, + "grad_norm": 1.4825124278536477, + "learning_rate": 1.283332522338348e-05, + "loss": 0.1868, + "step": 5381 + }, + { + "epoch": 0.42638146167557933, + "grad_norm": 1.8528798207041872, + "learning_rate": 1.2830864249931756e-05, + "loss": 0.3421, + "step": 5382 + }, + { + "epoch": 0.4264606852842147, + "grad_norm": 1.5470101560190086, + "learning_rate": 1.2828403090081238e-05, + "loss": 0.2653, + "step": 5383 + }, + { + "epoch": 0.4265399088928501, + "grad_norm": 1.9035733284706517, + "learning_rate": 1.282594174399399e-05, + "loss": 0.2839, + "step": 5384 + }, + { + "epoch": 0.4266191325014854, + "grad_norm": 2.0533093535192948, + "learning_rate": 1.2823480211832073e-05, + "loss": 0.2062, + "step": 5385 + }, + { + "epoch": 0.4266983561101208, + "grad_norm": 1.582668620664453, + "learning_rate": 1.2821018493757569e-05, + "loss": 0.2318, + "step": 5386 + }, + { + "epoch": 0.4267775797187562, + "grad_norm": 1.940917753614369, + "learning_rate": 1.2818556589932575e-05, + "loss": 0.3238, + "step": 5387 + }, + { + "epoch": 0.42685680332739157, + "grad_norm": 1.461519927659968, + "learning_rate": 1.2816094500519188e-05, + "loss": 0.245, + "step": 5388 + }, + { + "epoch": 0.42693602693602695, + "grad_norm": 1.9187965375954472, + "learning_rate": 1.2813632225679528e-05, + "loss": 0.2883, + "step": 5389 + }, + { + "epoch": 0.42701525054466233, + "grad_norm": 1.5228257338008833, + "learning_rate": 1.281116976557573e-05, + "loss": 0.2327, + "step": 5390 + }, + { + "epoch": 0.4270944741532977, + "grad_norm": 1.5815830243191027, + "learning_rate": 1.2808707120369923e-05, + "loss": 0.2248, + "step": 5391 + }, + { + "epoch": 0.42717369776193304, + "grad_norm": 1.8103231911112754, + "learning_rate": 1.280624429022427e-05, + "loss": 0.1637, + "step": 5392 + }, + { + "epoch": 0.4272529213705684, + "grad_norm": 2.022183976038517, + "learning_rate": 1.2803781275300933e-05, + "loss": 0.3333, + "step": 5393 + }, + { + "epoch": 0.4273321449792038, + "grad_norm": 1.5021408611429654, + "learning_rate": 1.2801318075762088e-05, + "loss": 0.2548, + "step": 5394 + }, + { + "epoch": 0.4274113685878392, + "grad_norm": 1.4639210729849936, + "learning_rate": 1.2798854691769927e-05, + "loss": 0.2182, + "step": 5395 + }, + { + "epoch": 0.42749059219647456, + "grad_norm": 2.297049941147285, + "learning_rate": 1.2796391123486654e-05, + "loss": 0.184, + "step": 5396 + }, + { + "epoch": 0.42756981580510994, + "grad_norm": 1.8712805500727123, + "learning_rate": 1.2793927371074477e-05, + "loss": 0.272, + "step": 5397 + }, + { + "epoch": 0.42764903941374527, + "grad_norm": 1.5633081657549015, + "learning_rate": 1.279146343469563e-05, + "loss": 0.2169, + "step": 5398 + }, + { + "epoch": 0.42772826302238065, + "grad_norm": 1.5156355401866126, + "learning_rate": 1.2788999314512347e-05, + "loss": 0.148, + "step": 5399 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 1.5359488123733942, + "learning_rate": 1.2786535010686879e-05, + "loss": 0.1755, + "step": 5400 + }, + { + "epoch": 0.4278867102396514, + "grad_norm": 1.4993321739975185, + "learning_rate": 1.2784070523381487e-05, + "loss": 0.1866, + "step": 5401 + }, + { + "epoch": 0.4279659338482868, + "grad_norm": 1.731990208789258, + "learning_rate": 1.2781605852758448e-05, + "loss": 0.1538, + "step": 5402 + }, + { + "epoch": 0.4280451574569222, + "grad_norm": 2.032828969180398, + "learning_rate": 1.2779140998980048e-05, + "loss": 0.2935, + "step": 5403 + }, + { + "epoch": 0.42812438106555756, + "grad_norm": 1.826500074746919, + "learning_rate": 1.2776675962208585e-05, + "loss": 0.2619, + "step": 5404 + }, + { + "epoch": 0.4282036046741929, + "grad_norm": 1.7775231965261296, + "learning_rate": 1.2774210742606368e-05, + "loss": 0.2486, + "step": 5405 + }, + { + "epoch": 0.42828282828282827, + "grad_norm": 1.7240917875847057, + "learning_rate": 1.2771745340335726e-05, + "loss": 0.3003, + "step": 5406 + }, + { + "epoch": 0.42836205189146365, + "grad_norm": 1.4335769548113746, + "learning_rate": 1.276927975555899e-05, + "loss": 0.2083, + "step": 5407 + }, + { + "epoch": 0.42844127550009903, + "grad_norm": 2.0101894665021036, + "learning_rate": 1.2766813988438505e-05, + "loss": 0.2552, + "step": 5408 + }, + { + "epoch": 0.4285204991087344, + "grad_norm": 1.8716626060809927, + "learning_rate": 1.2764348039136634e-05, + "loss": 0.2651, + "step": 5409 + }, + { + "epoch": 0.4285997227173698, + "grad_norm": 2.055183409829232, + "learning_rate": 1.2761881907815744e-05, + "loss": 0.3857, + "step": 5410 + }, + { + "epoch": 0.4286789463260052, + "grad_norm": 1.8964593392670914, + "learning_rate": 1.275941559463822e-05, + "loss": 0.2867, + "step": 5411 + }, + { + "epoch": 0.4287581699346405, + "grad_norm": 1.6525673120776647, + "learning_rate": 1.2756949099766458e-05, + "loss": 0.2624, + "step": 5412 + }, + { + "epoch": 0.4288373935432759, + "grad_norm": 1.4822272390514033, + "learning_rate": 1.2754482423362861e-05, + "loss": 0.2376, + "step": 5413 + }, + { + "epoch": 0.42891661715191126, + "grad_norm": 1.8093507181095638, + "learning_rate": 1.2752015565589852e-05, + "loss": 0.2831, + "step": 5414 + }, + { + "epoch": 0.42899584076054664, + "grad_norm": 1.6626516576781947, + "learning_rate": 1.2749548526609858e-05, + "loss": 0.2935, + "step": 5415 + }, + { + "epoch": 0.429075064369182, + "grad_norm": 1.690352518528334, + "learning_rate": 1.2747081306585325e-05, + "loss": 0.2507, + "step": 5416 + }, + { + "epoch": 0.4291542879778174, + "grad_norm": 1.6597317607978157, + "learning_rate": 1.2744613905678707e-05, + "loss": 0.2959, + "step": 5417 + }, + { + "epoch": 0.4292335115864528, + "grad_norm": 1.5780426260009828, + "learning_rate": 1.2742146324052466e-05, + "loss": 0.1623, + "step": 5418 + }, + { + "epoch": 0.4293127351950881, + "grad_norm": 1.9898694340271916, + "learning_rate": 1.273967856186909e-05, + "loss": 0.3279, + "step": 5419 + }, + { + "epoch": 0.4293919588037235, + "grad_norm": 1.8811669881933684, + "learning_rate": 1.2737210619291058e-05, + "loss": 0.2585, + "step": 5420 + }, + { + "epoch": 0.4294711824123589, + "grad_norm": 1.68178818467249, + "learning_rate": 1.2734742496480878e-05, + "loss": 0.2954, + "step": 5421 + }, + { + "epoch": 0.42955040602099426, + "grad_norm": 1.93435763884173, + "learning_rate": 1.2732274193601066e-05, + "loss": 0.3486, + "step": 5422 + }, + { + "epoch": 0.42962962962962964, + "grad_norm": 1.6292207452604424, + "learning_rate": 1.2729805710814142e-05, + "loss": 0.3197, + "step": 5423 + }, + { + "epoch": 0.429708853238265, + "grad_norm": 1.4927795912425283, + "learning_rate": 1.2727337048282649e-05, + "loss": 0.2438, + "step": 5424 + }, + { + "epoch": 0.4297880768469004, + "grad_norm": 1.745199533843985, + "learning_rate": 1.2724868206169134e-05, + "loss": 0.1919, + "step": 5425 + }, + { + "epoch": 0.42986730045553573, + "grad_norm": 1.7559047035876254, + "learning_rate": 1.2722399184636158e-05, + "loss": 0.2468, + "step": 5426 + }, + { + "epoch": 0.4299465240641711, + "grad_norm": 1.8885260071622678, + "learning_rate": 1.2719929983846298e-05, + "loss": 0.1916, + "step": 5427 + }, + { + "epoch": 0.4300257476728065, + "grad_norm": 1.4265432023812874, + "learning_rate": 1.2717460603962132e-05, + "loss": 0.2754, + "step": 5428 + }, + { + "epoch": 0.4301049712814419, + "grad_norm": 1.9253587278463462, + "learning_rate": 1.2714991045146265e-05, + "loss": 0.3184, + "step": 5429 + }, + { + "epoch": 0.43018419489007725, + "grad_norm": 2.7356114423023006, + "learning_rate": 1.2712521307561298e-05, + "loss": 0.2386, + "step": 5430 + }, + { + "epoch": 0.43026341849871264, + "grad_norm": 1.7161370398496638, + "learning_rate": 1.2710051391369857e-05, + "loss": 0.212, + "step": 5431 + }, + { + "epoch": 0.430342642107348, + "grad_norm": 2.126274607968103, + "learning_rate": 1.270758129673457e-05, + "loss": 0.259, + "step": 5432 + }, + { + "epoch": 0.43042186571598334, + "grad_norm": 1.9176940533748545, + "learning_rate": 1.2705111023818083e-05, + "loss": 0.3205, + "step": 5433 + }, + { + "epoch": 0.4305010893246187, + "grad_norm": 1.7948539985560719, + "learning_rate": 1.2702640572783051e-05, + "loss": 0.1773, + "step": 5434 + }, + { + "epoch": 0.4305803129332541, + "grad_norm": 2.3653275001351006, + "learning_rate": 1.2700169943792143e-05, + "loss": 0.3302, + "step": 5435 + }, + { + "epoch": 0.4306595365418895, + "grad_norm": 1.7949426445358412, + "learning_rate": 1.2697699137008038e-05, + "loss": 0.161, + "step": 5436 + }, + { + "epoch": 0.43073876015052487, + "grad_norm": 1.5846959939957417, + "learning_rate": 1.2695228152593419e-05, + "loss": 0.2435, + "step": 5437 + }, + { + "epoch": 0.43081798375916025, + "grad_norm": 1.6607361026155776, + "learning_rate": 1.2692756990710998e-05, + "loss": 0.2532, + "step": 5438 + }, + { + "epoch": 0.4308972073677956, + "grad_norm": 1.8561217870204176, + "learning_rate": 1.269028565152349e-05, + "loss": 0.2077, + "step": 5439 + }, + { + "epoch": 0.43097643097643096, + "grad_norm": 1.639455907986524, + "learning_rate": 1.2687814135193613e-05, + "loss": 0.2168, + "step": 5440 + }, + { + "epoch": 0.43105565458506634, + "grad_norm": 1.5721960135252286, + "learning_rate": 1.2685342441884107e-05, + "loss": 0.2295, + "step": 5441 + }, + { + "epoch": 0.4311348781937017, + "grad_norm": 2.3975404497159305, + "learning_rate": 1.2682870571757724e-05, + "loss": 0.3942, + "step": 5442 + }, + { + "epoch": 0.4312141018023371, + "grad_norm": 2.119407524807117, + "learning_rate": 1.2680398524977222e-05, + "loss": 0.3352, + "step": 5443 + }, + { + "epoch": 0.4312933254109725, + "grad_norm": 2.2495309558705543, + "learning_rate": 1.2677926301705376e-05, + "loss": 0.2969, + "step": 5444 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 2.0352016589185546, + "learning_rate": 1.2675453902104967e-05, + "loss": 0.3301, + "step": 5445 + }, + { + "epoch": 0.4314517726282432, + "grad_norm": 1.919627350714918, + "learning_rate": 1.2672981326338793e-05, + "loss": 0.2662, + "step": 5446 + }, + { + "epoch": 0.4315309962368786, + "grad_norm": 1.7635225274372561, + "learning_rate": 1.267050857456966e-05, + "loss": 0.3038, + "step": 5447 + }, + { + "epoch": 0.43161021984551395, + "grad_norm": 2.2901971036837097, + "learning_rate": 1.2668035646960384e-05, + "loss": 0.2443, + "step": 5448 + }, + { + "epoch": 0.43168944345414934, + "grad_norm": 1.6261062971369695, + "learning_rate": 1.2665562543673803e-05, + "loss": 0.2792, + "step": 5449 + }, + { + "epoch": 0.4317686670627847, + "grad_norm": 1.5629127154479712, + "learning_rate": 1.2663089264872751e-05, + "loss": 0.2346, + "step": 5450 + }, + { + "epoch": 0.4318478906714201, + "grad_norm": 1.4355018568123559, + "learning_rate": 1.2660615810720087e-05, + "loss": 0.2289, + "step": 5451 + }, + { + "epoch": 0.4319271142800555, + "grad_norm": 1.6112527287999885, + "learning_rate": 1.2658142181378675e-05, + "loss": 0.2412, + "step": 5452 + }, + { + "epoch": 0.4320063378886908, + "grad_norm": 1.5799455883585092, + "learning_rate": 1.2655668377011387e-05, + "loss": 0.2263, + "step": 5453 + }, + { + "epoch": 0.4320855614973262, + "grad_norm": 1.71696534381602, + "learning_rate": 1.2653194397781117e-05, + "loss": 0.3079, + "step": 5454 + }, + { + "epoch": 0.43216478510596157, + "grad_norm": 1.4774875816103368, + "learning_rate": 1.2650720243850762e-05, + "loss": 0.2892, + "step": 5455 + }, + { + "epoch": 0.43224400871459695, + "grad_norm": 1.9751794764373034, + "learning_rate": 1.2648245915383233e-05, + "loss": 0.3497, + "step": 5456 + }, + { + "epoch": 0.43232323232323233, + "grad_norm": 1.9365298520538066, + "learning_rate": 1.2645771412541455e-05, + "loss": 0.306, + "step": 5457 + }, + { + "epoch": 0.4324024559318677, + "grad_norm": 1.688136530431482, + "learning_rate": 1.2643296735488355e-05, + "loss": 0.2445, + "step": 5458 + }, + { + "epoch": 0.4324816795405031, + "grad_norm": 1.7619735651018633, + "learning_rate": 1.2640821884386887e-05, + "loss": 0.3825, + "step": 5459 + }, + { + "epoch": 0.4325609031491384, + "grad_norm": 2.0817750274592832, + "learning_rate": 1.2638346859400006e-05, + "loss": 0.2964, + "step": 5460 + }, + { + "epoch": 0.4326401267577738, + "grad_norm": 1.7423077822766184, + "learning_rate": 1.2635871660690677e-05, + "loss": 0.2889, + "step": 5461 + }, + { + "epoch": 0.4327193503664092, + "grad_norm": 1.8896748711458533, + "learning_rate": 1.2633396288421884e-05, + "loss": 0.3743, + "step": 5462 + }, + { + "epoch": 0.43279857397504456, + "grad_norm": 1.9597792123374402, + "learning_rate": 1.2630920742756616e-05, + "loss": 0.2499, + "step": 5463 + }, + { + "epoch": 0.43287779758367995, + "grad_norm": 1.7990870222222786, + "learning_rate": 1.2628445023857875e-05, + "loss": 0.3014, + "step": 5464 + }, + { + "epoch": 0.43295702119231533, + "grad_norm": 1.572313647538007, + "learning_rate": 1.2625969131888677e-05, + "loss": 0.2474, + "step": 5465 + }, + { + "epoch": 0.4330362448009507, + "grad_norm": 1.7151617500919933, + "learning_rate": 1.2623493067012047e-05, + "loss": 0.2932, + "step": 5466 + }, + { + "epoch": 0.43311546840958604, + "grad_norm": 1.167642517070261, + "learning_rate": 1.2621016829391022e-05, + "loss": 0.1457, + "step": 5467 + }, + { + "epoch": 0.4331946920182214, + "grad_norm": 1.2933591746811266, + "learning_rate": 1.2618540419188654e-05, + "loss": 0.2202, + "step": 5468 + }, + { + "epoch": 0.4332739156268568, + "grad_norm": 2.0055816596212512, + "learning_rate": 1.2616063836567994e-05, + "loss": 0.2337, + "step": 5469 + }, + { + "epoch": 0.4333531392354922, + "grad_norm": 1.5751077740266293, + "learning_rate": 1.2613587081692118e-05, + "loss": 0.2615, + "step": 5470 + }, + { + "epoch": 0.43343236284412756, + "grad_norm": 2.2820851958304584, + "learning_rate": 1.2611110154724113e-05, + "loss": 0.4322, + "step": 5471 + }, + { + "epoch": 0.43351158645276294, + "grad_norm": 1.8239004930029694, + "learning_rate": 1.2608633055827064e-05, + "loss": 0.2882, + "step": 5472 + }, + { + "epoch": 0.4335908100613983, + "grad_norm": 1.5706907778246673, + "learning_rate": 1.260615578516408e-05, + "loss": 0.194, + "step": 5473 + }, + { + "epoch": 0.43367003367003365, + "grad_norm": 1.8305559062055323, + "learning_rate": 1.260367834289828e-05, + "loss": 0.2851, + "step": 5474 + }, + { + "epoch": 0.43374925727866903, + "grad_norm": 1.7282616822156323, + "learning_rate": 1.2601200729192789e-05, + "loss": 0.3158, + "step": 5475 + }, + { + "epoch": 0.4338284808873044, + "grad_norm": 1.329400155830024, + "learning_rate": 1.2598722944210746e-05, + "loss": 0.2117, + "step": 5476 + }, + { + "epoch": 0.4339077044959398, + "grad_norm": 2.166495569146077, + "learning_rate": 1.25962449881153e-05, + "loss": 0.365, + "step": 5477 + }, + { + "epoch": 0.4339869281045752, + "grad_norm": 1.653593835744873, + "learning_rate": 1.2593766861069615e-05, + "loss": 0.251, + "step": 5478 + }, + { + "epoch": 0.43406615171321056, + "grad_norm": 1.812698542999159, + "learning_rate": 1.2591288563236864e-05, + "loss": 0.2841, + "step": 5479 + }, + { + "epoch": 0.4341453753218459, + "grad_norm": 1.956474221697521, + "learning_rate": 1.2588810094780227e-05, + "loss": 0.3164, + "step": 5480 + }, + { + "epoch": 0.43422459893048126, + "grad_norm": 2.2571850808896183, + "learning_rate": 1.2586331455862902e-05, + "loss": 0.2218, + "step": 5481 + }, + { + "epoch": 0.43430382253911665, + "grad_norm": 1.6317531260474232, + "learning_rate": 1.2583852646648097e-05, + "loss": 0.2784, + "step": 5482 + }, + { + "epoch": 0.434383046147752, + "grad_norm": 2.3992499432315633, + "learning_rate": 1.2581373667299026e-05, + "loss": 0.2842, + "step": 5483 + }, + { + "epoch": 0.4344622697563874, + "grad_norm": 1.8156317465830156, + "learning_rate": 1.257889451797892e-05, + "loss": 0.2902, + "step": 5484 + }, + { + "epoch": 0.4345414933650228, + "grad_norm": 1.867246536200824, + "learning_rate": 1.257641519885102e-05, + "loss": 0.2513, + "step": 5485 + }, + { + "epoch": 0.43462071697365817, + "grad_norm": 4.821655929546751, + "learning_rate": 1.2573935710078576e-05, + "loss": 0.2758, + "step": 5486 + }, + { + "epoch": 0.4346999405822935, + "grad_norm": 1.900495069813588, + "learning_rate": 1.2571456051824851e-05, + "loss": 0.27, + "step": 5487 + }, + { + "epoch": 0.4347791641909289, + "grad_norm": 1.7776921465600903, + "learning_rate": 1.2568976224253115e-05, + "loss": 0.1825, + "step": 5488 + }, + { + "epoch": 0.43485838779956426, + "grad_norm": 1.9167540797299247, + "learning_rate": 1.256649622752666e-05, + "loss": 0.3305, + "step": 5489 + }, + { + "epoch": 0.43493761140819964, + "grad_norm": 1.443967542835639, + "learning_rate": 1.2564016061808774e-05, + "loss": 0.1745, + "step": 5490 + }, + { + "epoch": 0.435016835016835, + "grad_norm": 1.8918603333841295, + "learning_rate": 1.2561535727262769e-05, + "loss": 0.2952, + "step": 5491 + }, + { + "epoch": 0.4350960586254704, + "grad_norm": 1.7909017470976067, + "learning_rate": 1.2559055224051963e-05, + "loss": 0.2702, + "step": 5492 + }, + { + "epoch": 0.4351752822341058, + "grad_norm": 1.7642961151086562, + "learning_rate": 1.2556574552339682e-05, + "loss": 0.2888, + "step": 5493 + }, + { + "epoch": 0.4352545058427411, + "grad_norm": 2.232147306807697, + "learning_rate": 1.2554093712289267e-05, + "loss": 0.4624, + "step": 5494 + }, + { + "epoch": 0.4353337294513765, + "grad_norm": 1.994304661816742, + "learning_rate": 1.2551612704064074e-05, + "loss": 0.2933, + "step": 5495 + }, + { + "epoch": 0.4354129530600119, + "grad_norm": 1.863028396711072, + "learning_rate": 1.2549131527827458e-05, + "loss": 0.341, + "step": 5496 + }, + { + "epoch": 0.43549217666864726, + "grad_norm": 1.9004633400564483, + "learning_rate": 1.2546650183742801e-05, + "loss": 0.2501, + "step": 5497 + }, + { + "epoch": 0.43557140027728264, + "grad_norm": 1.7787182114905022, + "learning_rate": 1.254416867197348e-05, + "loss": 0.2681, + "step": 5498 + }, + { + "epoch": 0.435650623885918, + "grad_norm": 2.1338241241751894, + "learning_rate": 1.2541686992682896e-05, + "loss": 0.351, + "step": 5499 + }, + { + "epoch": 0.4357298474945534, + "grad_norm": 2.1444470836773273, + "learning_rate": 1.2539205146034452e-05, + "loss": 0.3059, + "step": 5500 + }, + { + "epoch": 0.4358090711031887, + "grad_norm": 1.804899149245901, + "learning_rate": 1.2536723132191566e-05, + "loss": 0.2822, + "step": 5501 + }, + { + "epoch": 0.4358882947118241, + "grad_norm": 1.6909439641368984, + "learning_rate": 1.2534240951317669e-05, + "loss": 0.3134, + "step": 5502 + }, + { + "epoch": 0.4359675183204595, + "grad_norm": 1.6813275567337902, + "learning_rate": 1.25317586035762e-05, + "loss": 0.2804, + "step": 5503 + }, + { + "epoch": 0.43604674192909487, + "grad_norm": 1.9269916240950935, + "learning_rate": 1.2529276089130607e-05, + "loss": 0.2858, + "step": 5504 + }, + { + "epoch": 0.43612596553773025, + "grad_norm": 1.4661097092602278, + "learning_rate": 1.2526793408144355e-05, + "loss": 0.2363, + "step": 5505 + }, + { + "epoch": 0.43620518914636564, + "grad_norm": 2.2589710537496006, + "learning_rate": 1.2524310560780914e-05, + "loss": 0.3209, + "step": 5506 + }, + { + "epoch": 0.436284412755001, + "grad_norm": 1.7050742570119874, + "learning_rate": 1.2521827547203773e-05, + "loss": 0.2347, + "step": 5507 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 1.4272216551852428, + "learning_rate": 1.2519344367576418e-05, + "loss": 0.1991, + "step": 5508 + }, + { + "epoch": 0.4364428599722717, + "grad_norm": 1.7641149987040359, + "learning_rate": 1.2516861022062361e-05, + "loss": 0.2518, + "step": 5509 + }, + { + "epoch": 0.4365220835809071, + "grad_norm": 1.7226810933862249, + "learning_rate": 1.2514377510825113e-05, + "loss": 0.245, + "step": 5510 + }, + { + "epoch": 0.4366013071895425, + "grad_norm": 1.817967480069139, + "learning_rate": 1.2511893834028209e-05, + "loss": 0.3702, + "step": 5511 + }, + { + "epoch": 0.43668053079817787, + "grad_norm": 1.468023453662494, + "learning_rate": 1.2509409991835178e-05, + "loss": 0.2824, + "step": 5512 + }, + { + "epoch": 0.43675975440681325, + "grad_norm": 1.8088047701744943, + "learning_rate": 1.2506925984409574e-05, + "loss": 0.2614, + "step": 5513 + }, + { + "epoch": 0.43683897801544863, + "grad_norm": 1.9148143300817841, + "learning_rate": 1.250444181191496e-05, + "loss": 0.2179, + "step": 5514 + }, + { + "epoch": 0.43691820162408396, + "grad_norm": 1.4062051180754462, + "learning_rate": 1.2501957474514898e-05, + "loss": 0.2315, + "step": 5515 + }, + { + "epoch": 0.43699742523271934, + "grad_norm": 1.5843020885835293, + "learning_rate": 1.249947297237298e-05, + "loss": 0.2017, + "step": 5516 + }, + { + "epoch": 0.4370766488413547, + "grad_norm": 1.4593567429079943, + "learning_rate": 1.249698830565279e-05, + "loss": 0.3039, + "step": 5517 + }, + { + "epoch": 0.4371558724499901, + "grad_norm": 1.6725745706863753, + "learning_rate": 1.2494503474517935e-05, + "loss": 0.1894, + "step": 5518 + }, + { + "epoch": 0.4372350960586255, + "grad_norm": 1.5462750484566714, + "learning_rate": 1.2492018479132033e-05, + "loss": 0.2277, + "step": 5519 + }, + { + "epoch": 0.43731431966726086, + "grad_norm": 2.1372395529647394, + "learning_rate": 1.2489533319658703e-05, + "loss": 0.2408, + "step": 5520 + }, + { + "epoch": 0.4373935432758962, + "grad_norm": 2.101696212519979, + "learning_rate": 1.2487047996261578e-05, + "loss": 0.3196, + "step": 5521 + }, + { + "epoch": 0.43747276688453157, + "grad_norm": 2.085075491133594, + "learning_rate": 1.2484562509104316e-05, + "loss": 0.3495, + "step": 5522 + }, + { + "epoch": 0.43755199049316695, + "grad_norm": 1.596447449650139, + "learning_rate": 1.2482076858350564e-05, + "loss": 0.2183, + "step": 5523 + }, + { + "epoch": 0.43763121410180233, + "grad_norm": 2.0405537203790436, + "learning_rate": 1.2479591044163997e-05, + "loss": 0.3685, + "step": 5524 + }, + { + "epoch": 0.4377104377104377, + "grad_norm": 1.6272053409271723, + "learning_rate": 1.2477105066708286e-05, + "loss": 0.2153, + "step": 5525 + }, + { + "epoch": 0.4377896613190731, + "grad_norm": 2.058652815114697, + "learning_rate": 1.2474618926147129e-05, + "loss": 0.2604, + "step": 5526 + }, + { + "epoch": 0.4378688849277085, + "grad_norm": 1.4614896895086462, + "learning_rate": 1.2472132622644222e-05, + "loss": 0.2066, + "step": 5527 + }, + { + "epoch": 0.4379481085363438, + "grad_norm": 2.300909346858426, + "learning_rate": 1.2469646156363276e-05, + "loss": 0.3558, + "step": 5528 + }, + { + "epoch": 0.4380273321449792, + "grad_norm": 2.2419840792969365, + "learning_rate": 1.2467159527468014e-05, + "loss": 0.3688, + "step": 5529 + }, + { + "epoch": 0.43810655575361457, + "grad_norm": 2.0755338587433956, + "learning_rate": 1.246467273612217e-05, + "loss": 0.3243, + "step": 5530 + }, + { + "epoch": 0.43818577936224995, + "grad_norm": 1.8103273346738842, + "learning_rate": 1.2462185782489484e-05, + "loss": 0.2803, + "step": 5531 + }, + { + "epoch": 0.43826500297088533, + "grad_norm": 1.917407555372304, + "learning_rate": 1.2459698666733712e-05, + "loss": 0.3254, + "step": 5532 + }, + { + "epoch": 0.4383442265795207, + "grad_norm": 1.5443887552858382, + "learning_rate": 1.2457211389018619e-05, + "loss": 0.2175, + "step": 5533 + }, + { + "epoch": 0.4384234501881561, + "grad_norm": 1.7551780932551353, + "learning_rate": 1.2454723949507978e-05, + "loss": 0.2349, + "step": 5534 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 1.7325451357549508, + "learning_rate": 1.2452236348365579e-05, + "loss": 0.2432, + "step": 5535 + }, + { + "epoch": 0.4385818974054268, + "grad_norm": 1.5421334749096918, + "learning_rate": 1.244974858575521e-05, + "loss": 0.2165, + "step": 5536 + }, + { + "epoch": 0.4386611210140622, + "grad_norm": 1.7874818317400485, + "learning_rate": 1.2447260661840688e-05, + "loss": 0.2635, + "step": 5537 + }, + { + "epoch": 0.43874034462269756, + "grad_norm": 1.7830562624803026, + "learning_rate": 1.2444772576785828e-05, + "loss": 0.2868, + "step": 5538 + }, + { + "epoch": 0.43881956823133295, + "grad_norm": 1.7947636442883355, + "learning_rate": 1.2442284330754456e-05, + "loss": 0.2936, + "step": 5539 + }, + { + "epoch": 0.4388987918399683, + "grad_norm": 1.6460127837170502, + "learning_rate": 1.2439795923910413e-05, + "loss": 0.2217, + "step": 5540 + }, + { + "epoch": 0.4389780154486037, + "grad_norm": 1.8546204948439255, + "learning_rate": 1.2437307356417547e-05, + "loss": 0.304, + "step": 5541 + }, + { + "epoch": 0.43905723905723903, + "grad_norm": 1.6205382497888166, + "learning_rate": 1.2434818628439718e-05, + "loss": 0.2257, + "step": 5542 + }, + { + "epoch": 0.4391364626658744, + "grad_norm": 1.9151888255326355, + "learning_rate": 1.24323297401408e-05, + "loss": 0.2797, + "step": 5543 + }, + { + "epoch": 0.4392156862745098, + "grad_norm": 2.0800274853519447, + "learning_rate": 1.2429840691684672e-05, + "loss": 0.3313, + "step": 5544 + }, + { + "epoch": 0.4392949098831452, + "grad_norm": 1.587651224950868, + "learning_rate": 1.2427351483235224e-05, + "loss": 0.2514, + "step": 5545 + }, + { + "epoch": 0.43937413349178056, + "grad_norm": 2.575196396627111, + "learning_rate": 1.2424862114956367e-05, + "loss": 0.3126, + "step": 5546 + }, + { + "epoch": 0.43945335710041594, + "grad_norm": 2.1599715381853, + "learning_rate": 1.2422372587012001e-05, + "loss": 0.3151, + "step": 5547 + }, + { + "epoch": 0.4395325807090513, + "grad_norm": 1.8588654981225188, + "learning_rate": 1.2419882899566056e-05, + "loss": 0.2655, + "step": 5548 + }, + { + "epoch": 0.43961180431768665, + "grad_norm": 1.7703227924498588, + "learning_rate": 1.241739305278247e-05, + "loss": 0.3364, + "step": 5549 + }, + { + "epoch": 0.43969102792632203, + "grad_norm": 1.9765304445076906, + "learning_rate": 1.2414903046825178e-05, + "loss": 0.2676, + "step": 5550 + }, + { + "epoch": 0.4397702515349574, + "grad_norm": 1.61452327389998, + "learning_rate": 1.2412412881858142e-05, + "loss": 0.236, + "step": 5551 + }, + { + "epoch": 0.4398494751435928, + "grad_norm": 1.7669374352760172, + "learning_rate": 1.240992255804533e-05, + "loss": 0.1895, + "step": 5552 + }, + { + "epoch": 0.4399286987522282, + "grad_norm": 2.105520387711574, + "learning_rate": 1.2407432075550707e-05, + "loss": 0.2739, + "step": 5553 + }, + { + "epoch": 0.44000792236086356, + "grad_norm": 1.2661266224547834, + "learning_rate": 1.2404941434538269e-05, + "loss": 0.1389, + "step": 5554 + }, + { + "epoch": 0.4400871459694989, + "grad_norm": 1.9841313451169875, + "learning_rate": 1.2402450635172008e-05, + "loss": 0.3841, + "step": 5555 + }, + { + "epoch": 0.44016636957813426, + "grad_norm": 1.34577365985844, + "learning_rate": 1.2399959677615932e-05, + "loss": 0.1794, + "step": 5556 + }, + { + "epoch": 0.44024559318676965, + "grad_norm": 1.7286740148924424, + "learning_rate": 1.239746856203406e-05, + "loss": 0.2663, + "step": 5557 + }, + { + "epoch": 0.440324816795405, + "grad_norm": 2.067470759268853, + "learning_rate": 1.239497728859042e-05, + "loss": 0.3059, + "step": 5558 + }, + { + "epoch": 0.4404040404040404, + "grad_norm": 1.5996789879842293, + "learning_rate": 1.2392485857449048e-05, + "loss": 0.2873, + "step": 5559 + }, + { + "epoch": 0.4404832640126758, + "grad_norm": 1.925739671400025, + "learning_rate": 1.2389994268773995e-05, + "loss": 0.2891, + "step": 5560 + }, + { + "epoch": 0.44056248762131117, + "grad_norm": 1.8705449086688861, + "learning_rate": 1.238750252272932e-05, + "loss": 0.2841, + "step": 5561 + }, + { + "epoch": 0.4406417112299465, + "grad_norm": 1.7993176556123558, + "learning_rate": 1.2385010619479093e-05, + "loss": 0.2858, + "step": 5562 + }, + { + "epoch": 0.4407209348385819, + "grad_norm": 1.6503914610901576, + "learning_rate": 1.2382518559187389e-05, + "loss": 0.2867, + "step": 5563 + }, + { + "epoch": 0.44080015844721726, + "grad_norm": 1.562713994438325, + "learning_rate": 1.23800263420183e-05, + "loss": 0.2182, + "step": 5564 + }, + { + "epoch": 0.44087938205585264, + "grad_norm": 1.5569650355680327, + "learning_rate": 1.2377533968135934e-05, + "loss": 0.2551, + "step": 5565 + }, + { + "epoch": 0.440958605664488, + "grad_norm": 1.530833590028577, + "learning_rate": 1.2375041437704394e-05, + "loss": 0.1657, + "step": 5566 + }, + { + "epoch": 0.4410378292731234, + "grad_norm": 1.4333135753332196, + "learning_rate": 1.2372548750887805e-05, + "loss": 0.2477, + "step": 5567 + }, + { + "epoch": 0.4411170528817588, + "grad_norm": 1.6057533183158326, + "learning_rate": 1.2370055907850293e-05, + "loss": 0.2813, + "step": 5568 + }, + { + "epoch": 0.4411962764903941, + "grad_norm": 1.8208680548108085, + "learning_rate": 1.2367562908756005e-05, + "loss": 0.2195, + "step": 5569 + }, + { + "epoch": 0.4412755000990295, + "grad_norm": 1.4857069970802577, + "learning_rate": 1.2365069753769092e-05, + "loss": 0.1942, + "step": 5570 + }, + { + "epoch": 0.4413547237076649, + "grad_norm": 1.738289976258963, + "learning_rate": 1.2362576443053716e-05, + "loss": 0.2732, + "step": 5571 + }, + { + "epoch": 0.44143394731630026, + "grad_norm": 1.8382833584529485, + "learning_rate": 1.2360082976774049e-05, + "loss": 0.3294, + "step": 5572 + }, + { + "epoch": 0.44151317092493564, + "grad_norm": 1.269740545350416, + "learning_rate": 1.2357589355094275e-05, + "loss": 0.149, + "step": 5573 + }, + { + "epoch": 0.441592394533571, + "grad_norm": 1.4254127836215902, + "learning_rate": 1.2355095578178582e-05, + "loss": 0.2864, + "step": 5574 + }, + { + "epoch": 0.4416716181422064, + "grad_norm": 1.550504855024614, + "learning_rate": 1.2352601646191182e-05, + "loss": 0.2744, + "step": 5575 + }, + { + "epoch": 0.4417508417508417, + "grad_norm": 1.8465370653031214, + "learning_rate": 1.235010755929628e-05, + "loss": 0.2366, + "step": 5576 + }, + { + "epoch": 0.4418300653594771, + "grad_norm": 2.52399882626482, + "learning_rate": 1.2347613317658105e-05, + "loss": 0.2743, + "step": 5577 + }, + { + "epoch": 0.4419092889681125, + "grad_norm": 1.9979630358777374, + "learning_rate": 1.234511892144089e-05, + "loss": 0.3342, + "step": 5578 + }, + { + "epoch": 0.44198851257674787, + "grad_norm": 1.8292849580158084, + "learning_rate": 1.2342624370808876e-05, + "loss": 0.3391, + "step": 5579 + }, + { + "epoch": 0.44206773618538325, + "grad_norm": 1.8840703845049853, + "learning_rate": 1.2340129665926319e-05, + "loss": 0.2061, + "step": 5580 + }, + { + "epoch": 0.44214695979401863, + "grad_norm": 1.4131557429062231, + "learning_rate": 1.2337634806957486e-05, + "loss": 0.199, + "step": 5581 + }, + { + "epoch": 0.442226183402654, + "grad_norm": 1.4812768691978588, + "learning_rate": 1.2335139794066645e-05, + "loss": 0.2136, + "step": 5582 + }, + { + "epoch": 0.44230540701128934, + "grad_norm": 1.3928135927593728, + "learning_rate": 1.2332644627418088e-05, + "loss": 0.2, + "step": 5583 + }, + { + "epoch": 0.4423846306199247, + "grad_norm": 1.3945721608753796, + "learning_rate": 1.2330149307176105e-05, + "loss": 0.2361, + "step": 5584 + }, + { + "epoch": 0.4424638542285601, + "grad_norm": 1.7030500572131864, + "learning_rate": 1.2327653833505005e-05, + "loss": 0.2669, + "step": 5585 + }, + { + "epoch": 0.4425430778371955, + "grad_norm": 1.8683991648700715, + "learning_rate": 1.2325158206569095e-05, + "loss": 0.3727, + "step": 5586 + }, + { + "epoch": 0.44262230144583087, + "grad_norm": 1.803765743201922, + "learning_rate": 1.232266242653271e-05, + "loss": 0.2772, + "step": 5587 + }, + { + "epoch": 0.44270152505446625, + "grad_norm": 1.7128991807011351, + "learning_rate": 1.2320166493560176e-05, + "loss": 0.2883, + "step": 5588 + }, + { + "epoch": 0.44278074866310163, + "grad_norm": 2.1271880829026184, + "learning_rate": 1.2317670407815844e-05, + "loss": 0.3474, + "step": 5589 + }, + { + "epoch": 0.44285997227173696, + "grad_norm": 2.0133423031209925, + "learning_rate": 1.2315174169464068e-05, + "loss": 0.2137, + "step": 5590 + }, + { + "epoch": 0.44293919588037234, + "grad_norm": 1.7251046875037204, + "learning_rate": 1.2312677778669211e-05, + "loss": 0.2447, + "step": 5591 + }, + { + "epoch": 0.4430184194890077, + "grad_norm": 1.771664297739277, + "learning_rate": 1.2310181235595652e-05, + "loss": 0.2438, + "step": 5592 + }, + { + "epoch": 0.4430976430976431, + "grad_norm": 1.5754437258239973, + "learning_rate": 1.2307684540407775e-05, + "loss": 0.2102, + "step": 5593 + }, + { + "epoch": 0.4431768667062785, + "grad_norm": 1.6464045481880836, + "learning_rate": 1.230518769326997e-05, + "loss": 0.217, + "step": 5594 + }, + { + "epoch": 0.44325609031491386, + "grad_norm": 1.307725877865682, + "learning_rate": 1.2302690694346654e-05, + "loss": 0.136, + "step": 5595 + }, + { + "epoch": 0.4433353139235492, + "grad_norm": 1.6689501118796468, + "learning_rate": 1.230019354380223e-05, + "loss": 0.2133, + "step": 5596 + }, + { + "epoch": 0.44341453753218457, + "grad_norm": 1.5316433588924565, + "learning_rate": 1.2297696241801133e-05, + "loss": 0.2113, + "step": 5597 + }, + { + "epoch": 0.44349376114081995, + "grad_norm": 1.5760364514371232, + "learning_rate": 1.2295198788507794e-05, + "loss": 0.239, + "step": 5598 + }, + { + "epoch": 0.44357298474945533, + "grad_norm": 1.7627453645705313, + "learning_rate": 1.2292701184086656e-05, + "loss": 0.26, + "step": 5599 + }, + { + "epoch": 0.4436522083580907, + "grad_norm": 1.353336064216166, + "learning_rate": 1.2290203428702178e-05, + "loss": 0.2254, + "step": 5600 + }, + { + "epoch": 0.4437314319667261, + "grad_norm": 1.990131106787289, + "learning_rate": 1.2287705522518824e-05, + "loss": 0.2373, + "step": 5601 + }, + { + "epoch": 0.4438106555753615, + "grad_norm": 2.0707576472903306, + "learning_rate": 1.228520746570107e-05, + "loss": 0.2317, + "step": 5602 + }, + { + "epoch": 0.4438898791839968, + "grad_norm": 1.9003787282143014, + "learning_rate": 1.22827092584134e-05, + "loss": 0.2423, + "step": 5603 + }, + { + "epoch": 0.4439691027926322, + "grad_norm": 2.0097484417478255, + "learning_rate": 1.2280210900820309e-05, + "loss": 0.2623, + "step": 5604 + }, + { + "epoch": 0.44404832640126757, + "grad_norm": 1.4604997957290904, + "learning_rate": 1.22777123930863e-05, + "loss": 0.1813, + "step": 5605 + }, + { + "epoch": 0.44412755000990295, + "grad_norm": 1.586622149428566, + "learning_rate": 1.227521373537589e-05, + "loss": 0.217, + "step": 5606 + }, + { + "epoch": 0.44420677361853833, + "grad_norm": 1.8423769598148934, + "learning_rate": 1.2272714927853604e-05, + "loss": 0.2672, + "step": 5607 + }, + { + "epoch": 0.4442859972271737, + "grad_norm": 1.8180083283009185, + "learning_rate": 1.2270215970683977e-05, + "loss": 0.2677, + "step": 5608 + }, + { + "epoch": 0.4443652208358091, + "grad_norm": 1.678765782490066, + "learning_rate": 1.226771686403155e-05, + "loss": 0.2714, + "step": 5609 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.3530826818979518, + "learning_rate": 1.2265217608060879e-05, + "loss": 0.2218, + "step": 5610 + }, + { + "epoch": 0.4445236680530798, + "grad_norm": 1.7687839103939773, + "learning_rate": 1.226271820293653e-05, + "loss": 0.2488, + "step": 5611 + }, + { + "epoch": 0.4446028916617152, + "grad_norm": 1.4532043185049228, + "learning_rate": 1.2260218648823073e-05, + "loss": 0.2979, + "step": 5612 + }, + { + "epoch": 0.44468211527035056, + "grad_norm": 1.5051170978763282, + "learning_rate": 1.2257718945885096e-05, + "loss": 0.2067, + "step": 5613 + }, + { + "epoch": 0.44476133887898595, + "grad_norm": 1.599107926559045, + "learning_rate": 1.2255219094287186e-05, + "loss": 0.2974, + "step": 5614 + }, + { + "epoch": 0.4448405624876213, + "grad_norm": 1.6996675287274634, + "learning_rate": 1.225271909419395e-05, + "loss": 0.2359, + "step": 5615 + }, + { + "epoch": 0.4449197860962567, + "grad_norm": 1.5911728489437782, + "learning_rate": 1.2250218945770005e-05, + "loss": 0.2298, + "step": 5616 + }, + { + "epoch": 0.44499900970489203, + "grad_norm": 1.7819883921466069, + "learning_rate": 1.2247718649179966e-05, + "loss": 0.3332, + "step": 5617 + }, + { + "epoch": 0.4450782333135274, + "grad_norm": 1.7027583867730316, + "learning_rate": 1.2245218204588474e-05, + "loss": 0.2488, + "step": 5618 + }, + { + "epoch": 0.4451574569221628, + "grad_norm": 1.7773787609735372, + "learning_rate": 1.2242717612160163e-05, + "loss": 0.2883, + "step": 5619 + }, + { + "epoch": 0.4452366805307982, + "grad_norm": 1.9673029481501914, + "learning_rate": 1.2240216872059687e-05, + "loss": 0.3515, + "step": 5620 + }, + { + "epoch": 0.44531590413943356, + "grad_norm": 1.621800032743037, + "learning_rate": 1.2237715984451713e-05, + "loss": 0.2755, + "step": 5621 + }, + { + "epoch": 0.44539512774806894, + "grad_norm": 1.6726240888379609, + "learning_rate": 1.2235214949500906e-05, + "loss": 0.2304, + "step": 5622 + }, + { + "epoch": 0.4454743513567043, + "grad_norm": 1.6827391371357892, + "learning_rate": 1.223271376737195e-05, + "loss": 0.3081, + "step": 5623 + }, + { + "epoch": 0.44555357496533965, + "grad_norm": 1.7660659715012286, + "learning_rate": 1.2230212438229539e-05, + "loss": 0.2384, + "step": 5624 + }, + { + "epoch": 0.44563279857397503, + "grad_norm": 1.4269374380961006, + "learning_rate": 1.2227710962238367e-05, + "loss": 0.2524, + "step": 5625 + }, + { + "epoch": 0.4457120221826104, + "grad_norm": 1.8563296437003585, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.2315, + "step": 5626 + }, + { + "epoch": 0.4457912457912458, + "grad_norm": 1.5813856049921018, + "learning_rate": 1.22227075703686e-05, + "loss": 0.2439, + "step": 5627 + }, + { + "epoch": 0.4458704693998812, + "grad_norm": 1.5684642176959884, + "learning_rate": 1.2220205654819453e-05, + "loss": 0.2004, + "step": 5628 + }, + { + "epoch": 0.44594969300851656, + "grad_norm": 1.9540375860434265, + "learning_rate": 1.2217703593080445e-05, + "loss": 0.3284, + "step": 5629 + }, + { + "epoch": 0.44602891661715194, + "grad_norm": 1.6334800298115226, + "learning_rate": 1.221520138531633e-05, + "loss": 0.2809, + "step": 5630 + }, + { + "epoch": 0.44610814022578726, + "grad_norm": 1.4658006142445195, + "learning_rate": 1.2212699031691861e-05, + "loss": 0.1689, + "step": 5631 + }, + { + "epoch": 0.44618736383442265, + "grad_norm": 1.7900767546554626, + "learning_rate": 1.221019653237181e-05, + "loss": 0.2753, + "step": 5632 + }, + { + "epoch": 0.446266587443058, + "grad_norm": 1.7836698246748153, + "learning_rate": 1.2207693887520949e-05, + "loss": 0.2829, + "step": 5633 + }, + { + "epoch": 0.4463458110516934, + "grad_norm": 1.8930673819747978, + "learning_rate": 1.2205191097304067e-05, + "loss": 0.1892, + "step": 5634 + }, + { + "epoch": 0.4464250346603288, + "grad_norm": 1.667504374561492, + "learning_rate": 1.2202688161885967e-05, + "loss": 0.2881, + "step": 5635 + }, + { + "epoch": 0.44650425826896417, + "grad_norm": 1.8539069139441045, + "learning_rate": 1.2200185081431446e-05, + "loss": 0.2925, + "step": 5636 + }, + { + "epoch": 0.4465834818775995, + "grad_norm": 1.9632896204663735, + "learning_rate": 1.2197681856105326e-05, + "loss": 0.2926, + "step": 5637 + }, + { + "epoch": 0.4466627054862349, + "grad_norm": 1.4986191288150035, + "learning_rate": 1.219517848607243e-05, + "loss": 0.242, + "step": 5638 + }, + { + "epoch": 0.44674192909487026, + "grad_norm": 1.625662930972558, + "learning_rate": 1.2192674971497593e-05, + "loss": 0.2402, + "step": 5639 + }, + { + "epoch": 0.44682115270350564, + "grad_norm": 1.2274533186976864, + "learning_rate": 1.219017131254566e-05, + "loss": 0.188, + "step": 5640 + }, + { + "epoch": 0.446900376312141, + "grad_norm": 1.5926323311077184, + "learning_rate": 1.2187667509381484e-05, + "loss": 0.2242, + "step": 5641 + }, + { + "epoch": 0.4469795999207764, + "grad_norm": 1.2706132917955917, + "learning_rate": 1.2185163562169928e-05, + "loss": 0.1539, + "step": 5642 + }, + { + "epoch": 0.4470588235294118, + "grad_norm": 1.837584806937904, + "learning_rate": 1.2182659471075868e-05, + "loss": 0.2427, + "step": 5643 + }, + { + "epoch": 0.4471380471380471, + "grad_norm": 1.4811924937857484, + "learning_rate": 1.2180155236264182e-05, + "loss": 0.1885, + "step": 5644 + }, + { + "epoch": 0.4472172707466825, + "grad_norm": 1.445345043292019, + "learning_rate": 1.2177650857899767e-05, + "loss": 0.2325, + "step": 5645 + }, + { + "epoch": 0.4472964943553179, + "grad_norm": 1.64068043823466, + "learning_rate": 1.217514633614752e-05, + "loss": 0.266, + "step": 5646 + }, + { + "epoch": 0.44737571796395326, + "grad_norm": 1.8801734881053611, + "learning_rate": 1.217264167117235e-05, + "loss": 0.3213, + "step": 5647 + }, + { + "epoch": 0.44745494157258864, + "grad_norm": 2.2525293814258585, + "learning_rate": 1.2170136863139183e-05, + "loss": 0.2368, + "step": 5648 + }, + { + "epoch": 0.447534165181224, + "grad_norm": 1.7790331323131856, + "learning_rate": 1.2167631912212942e-05, + "loss": 0.3457, + "step": 5649 + }, + { + "epoch": 0.4476133887898594, + "grad_norm": 1.936715387902646, + "learning_rate": 1.2165126818558572e-05, + "loss": 0.1923, + "step": 5650 + }, + { + "epoch": 0.4476926123984947, + "grad_norm": 1.9259934773737695, + "learning_rate": 1.2162621582341021e-05, + "loss": 0.2649, + "step": 5651 + }, + { + "epoch": 0.4477718360071301, + "grad_norm": 1.6704567895652418, + "learning_rate": 1.2160116203725243e-05, + "loss": 0.2096, + "step": 5652 + }, + { + "epoch": 0.4478510596157655, + "grad_norm": 1.6943965762721673, + "learning_rate": 1.2157610682876206e-05, + "loss": 0.2351, + "step": 5653 + }, + { + "epoch": 0.44793028322440087, + "grad_norm": 1.81810060944767, + "learning_rate": 1.2155105019958888e-05, + "loss": 0.1738, + "step": 5654 + }, + { + "epoch": 0.44800950683303625, + "grad_norm": 2.0973218627662207, + "learning_rate": 1.2152599215138274e-05, + "loss": 0.2384, + "step": 5655 + }, + { + "epoch": 0.44808873044167163, + "grad_norm": 1.910169325577818, + "learning_rate": 1.215009326857936e-05, + "loss": 0.2089, + "step": 5656 + }, + { + "epoch": 0.448167954050307, + "grad_norm": 2.256710950714623, + "learning_rate": 1.2147587180447149e-05, + "loss": 0.2798, + "step": 5657 + }, + { + "epoch": 0.44824717765894234, + "grad_norm": 1.565782817578264, + "learning_rate": 1.2145080950906656e-05, + "loss": 0.2186, + "step": 5658 + }, + { + "epoch": 0.4483264012675777, + "grad_norm": 1.525088456470306, + "learning_rate": 1.2142574580122903e-05, + "loss": 0.1943, + "step": 5659 + }, + { + "epoch": 0.4484056248762131, + "grad_norm": 1.8443753190917436, + "learning_rate": 1.2140068068260923e-05, + "loss": 0.2354, + "step": 5660 + }, + { + "epoch": 0.4484848484848485, + "grad_norm": 1.5512746842008314, + "learning_rate": 1.2137561415485761e-05, + "loss": 0.2582, + "step": 5661 + }, + { + "epoch": 0.44856407209348387, + "grad_norm": 1.8603636018452885, + "learning_rate": 1.2135054621962464e-05, + "loss": 0.2451, + "step": 5662 + }, + { + "epoch": 0.44864329570211925, + "grad_norm": 1.889618114956987, + "learning_rate": 1.2132547687856093e-05, + "loss": 0.2357, + "step": 5663 + }, + { + "epoch": 0.44872251931075463, + "grad_norm": 1.9815243497588506, + "learning_rate": 1.2130040613331717e-05, + "loss": 0.3195, + "step": 5664 + }, + { + "epoch": 0.44880174291938996, + "grad_norm": 1.5243803797839164, + "learning_rate": 1.2127533398554417e-05, + "loss": 0.1674, + "step": 5665 + }, + { + "epoch": 0.44888096652802534, + "grad_norm": 1.418760243918266, + "learning_rate": 1.2125026043689278e-05, + "loss": 0.1771, + "step": 5666 + }, + { + "epoch": 0.4489601901366607, + "grad_norm": 1.7412611872463941, + "learning_rate": 1.2122518548901401e-05, + "loss": 0.1763, + "step": 5667 + }, + { + "epoch": 0.4490394137452961, + "grad_norm": 1.8353027474063524, + "learning_rate": 1.2120010914355888e-05, + "loss": 0.2377, + "step": 5668 + }, + { + "epoch": 0.4491186373539315, + "grad_norm": 1.918201063336504, + "learning_rate": 1.2117503140217858e-05, + "loss": 0.3078, + "step": 5669 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 1.7551579801259165, + "learning_rate": 1.2114995226652437e-05, + "loss": 0.2168, + "step": 5670 + }, + { + "epoch": 0.44927708457120225, + "grad_norm": 1.674301397411181, + "learning_rate": 1.2112487173824755e-05, + "loss": 0.2645, + "step": 5671 + }, + { + "epoch": 0.44935630817983757, + "grad_norm": 1.4922935738061887, + "learning_rate": 1.2109978981899956e-05, + "loss": 0.2545, + "step": 5672 + }, + { + "epoch": 0.44943553178847295, + "grad_norm": 2.0760449593188306, + "learning_rate": 1.2107470651043198e-05, + "loss": 0.3298, + "step": 5673 + }, + { + "epoch": 0.44951475539710833, + "grad_norm": 1.9440791398158375, + "learning_rate": 1.2104962181419635e-05, + "loss": 0.3052, + "step": 5674 + }, + { + "epoch": 0.4495939790057437, + "grad_norm": 1.750501798235069, + "learning_rate": 1.2102453573194442e-05, + "loss": 0.2402, + "step": 5675 + }, + { + "epoch": 0.4496732026143791, + "grad_norm": 1.6263665516047727, + "learning_rate": 1.2099944826532796e-05, + "loss": 0.2278, + "step": 5676 + }, + { + "epoch": 0.4497524262230145, + "grad_norm": 1.8257200513512624, + "learning_rate": 1.2097435941599886e-05, + "loss": 0.2198, + "step": 5677 + }, + { + "epoch": 0.4498316498316498, + "grad_norm": 1.8320465927526974, + "learning_rate": 1.2094926918560917e-05, + "loss": 0.1903, + "step": 5678 + }, + { + "epoch": 0.4499108734402852, + "grad_norm": 2.3613795721421393, + "learning_rate": 1.2092417757581085e-05, + "loss": 0.4167, + "step": 5679 + }, + { + "epoch": 0.44999009704892057, + "grad_norm": 1.8109900768597607, + "learning_rate": 1.2089908458825614e-05, + "loss": 0.2132, + "step": 5680 + }, + { + "epoch": 0.45006932065755595, + "grad_norm": 1.5151629803787015, + "learning_rate": 1.2087399022459729e-05, + "loss": 0.1851, + "step": 5681 + }, + { + "epoch": 0.45014854426619133, + "grad_norm": 1.6994388633856963, + "learning_rate": 1.208488944864866e-05, + "loss": 0.183, + "step": 5682 + }, + { + "epoch": 0.4502277678748267, + "grad_norm": 1.8816540048226704, + "learning_rate": 1.2082379737557655e-05, + "loss": 0.3413, + "step": 5683 + }, + { + "epoch": 0.4503069914834621, + "grad_norm": 1.787224448096987, + "learning_rate": 1.2079869889351961e-05, + "loss": 0.2554, + "step": 5684 + }, + { + "epoch": 0.4503862150920974, + "grad_norm": 1.661965270157595, + "learning_rate": 1.2077359904196841e-05, + "loss": 0.2392, + "step": 5685 + }, + { + "epoch": 0.4504654387007328, + "grad_norm": 1.5009075526942932, + "learning_rate": 1.2074849782257572e-05, + "loss": 0.2269, + "step": 5686 + }, + { + "epoch": 0.4505446623093682, + "grad_norm": 2.013573695623766, + "learning_rate": 1.2072339523699426e-05, + "loss": 0.3267, + "step": 5687 + }, + { + "epoch": 0.45062388591800356, + "grad_norm": 2.023128416523469, + "learning_rate": 1.2069829128687693e-05, + "loss": 0.3402, + "step": 5688 + }, + { + "epoch": 0.45070310952663895, + "grad_norm": 2.007862560420196, + "learning_rate": 1.2067318597387672e-05, + "loss": 0.2908, + "step": 5689 + }, + { + "epoch": 0.4507823331352743, + "grad_norm": 2.0846805600088927, + "learning_rate": 1.2064807929964668e-05, + "loss": 0.2684, + "step": 5690 + }, + { + "epoch": 0.4508615567439097, + "grad_norm": 1.467914372033032, + "learning_rate": 1.2062297126584e-05, + "loss": 0.1973, + "step": 5691 + }, + { + "epoch": 0.45094078035254503, + "grad_norm": 1.6352678572822281, + "learning_rate": 1.2059786187410984e-05, + "loss": 0.2524, + "step": 5692 + }, + { + "epoch": 0.4510200039611804, + "grad_norm": 1.7934860117601583, + "learning_rate": 1.2057275112610962e-05, + "loss": 0.2316, + "step": 5693 + }, + { + "epoch": 0.4510992275698158, + "grad_norm": 2.0735360665530935, + "learning_rate": 1.2054763902349273e-05, + "loss": 0.3287, + "step": 5694 + }, + { + "epoch": 0.4511784511784512, + "grad_norm": 1.6926272461466183, + "learning_rate": 1.2052252556791267e-05, + "loss": 0.2669, + "step": 5695 + }, + { + "epoch": 0.45125767478708656, + "grad_norm": 1.8476100541327651, + "learning_rate": 1.2049741076102307e-05, + "loss": 0.2875, + "step": 5696 + }, + { + "epoch": 0.45133689839572194, + "grad_norm": 1.8394593907200811, + "learning_rate": 1.2047229460447759e-05, + "loss": 0.3065, + "step": 5697 + }, + { + "epoch": 0.4514161220043573, + "grad_norm": 1.6570746450863916, + "learning_rate": 1.2044717709993e-05, + "loss": 0.249, + "step": 5698 + }, + { + "epoch": 0.45149534561299265, + "grad_norm": 1.6468534738421132, + "learning_rate": 1.2042205824903419e-05, + "loss": 0.3011, + "step": 5699 + }, + { + "epoch": 0.45157456922162803, + "grad_norm": 1.5826133488286724, + "learning_rate": 1.203969380534441e-05, + "loss": 0.2141, + "step": 5700 + }, + { + "epoch": 0.4516537928302634, + "grad_norm": 1.448858358123047, + "learning_rate": 1.2037181651481378e-05, + "loss": 0.1877, + "step": 5701 + }, + { + "epoch": 0.4517330164388988, + "grad_norm": 1.4060736416732036, + "learning_rate": 1.2034669363479741e-05, + "loss": 0.1723, + "step": 5702 + }, + { + "epoch": 0.4518122400475342, + "grad_norm": 2.194651302555289, + "learning_rate": 1.2032156941504913e-05, + "loss": 0.3147, + "step": 5703 + }, + { + "epoch": 0.45189146365616956, + "grad_norm": 2.434042439309015, + "learning_rate": 1.2029644385722327e-05, + "loss": 0.4223, + "step": 5704 + }, + { + "epoch": 0.45197068726480494, + "grad_norm": 1.7223276273115296, + "learning_rate": 1.2027131696297429e-05, + "loss": 0.1779, + "step": 5705 + }, + { + "epoch": 0.45204991087344026, + "grad_norm": 1.8690637313818925, + "learning_rate": 1.202461887339566e-05, + "loss": 0.3313, + "step": 5706 + }, + { + "epoch": 0.45212913448207565, + "grad_norm": 1.2984302609785088, + "learning_rate": 1.2022105917182478e-05, + "loss": 0.1613, + "step": 5707 + }, + { + "epoch": 0.452208358090711, + "grad_norm": 1.5134060683355437, + "learning_rate": 1.2019592827823354e-05, + "loss": 0.2111, + "step": 5708 + }, + { + "epoch": 0.4522875816993464, + "grad_norm": 1.7825850411321524, + "learning_rate": 1.2017079605483758e-05, + "loss": 0.2614, + "step": 5709 + }, + { + "epoch": 0.4523668053079818, + "grad_norm": 1.3710241477174698, + "learning_rate": 1.201456625032918e-05, + "loss": 0.1453, + "step": 5710 + }, + { + "epoch": 0.45244602891661717, + "grad_norm": 1.9447854544056213, + "learning_rate": 1.2012052762525104e-05, + "loss": 0.2591, + "step": 5711 + }, + { + "epoch": 0.45252525252525255, + "grad_norm": 1.9027330082779925, + "learning_rate": 1.2009539142237034e-05, + "loss": 0.2847, + "step": 5712 + }, + { + "epoch": 0.4526044761338879, + "grad_norm": 1.7255140795593775, + "learning_rate": 1.2007025389630484e-05, + "loss": 0.2531, + "step": 5713 + }, + { + "epoch": 0.45268369974252326, + "grad_norm": 1.6766496571648108, + "learning_rate": 1.2004511504870966e-05, + "loss": 0.3097, + "step": 5714 + }, + { + "epoch": 0.45276292335115864, + "grad_norm": 1.4525698942967835, + "learning_rate": 1.2001997488124011e-05, + "loss": 0.2155, + "step": 5715 + }, + { + "epoch": 0.452842146959794, + "grad_norm": 1.4313806650051693, + "learning_rate": 1.1999483339555159e-05, + "loss": 0.2239, + "step": 5716 + }, + { + "epoch": 0.4529213705684294, + "grad_norm": 2.0563171218989345, + "learning_rate": 1.1996969059329944e-05, + "loss": 0.221, + "step": 5717 + }, + { + "epoch": 0.4530005941770648, + "grad_norm": 2.0020093390544575, + "learning_rate": 1.1994454647613928e-05, + "loss": 0.2808, + "step": 5718 + }, + { + "epoch": 0.4530798177857001, + "grad_norm": 1.6310472674684602, + "learning_rate": 1.199194010457267e-05, + "loss": 0.2575, + "step": 5719 + }, + { + "epoch": 0.4531590413943355, + "grad_norm": 1.6684955833381414, + "learning_rate": 1.1989425430371739e-05, + "loss": 0.2416, + "step": 5720 + }, + { + "epoch": 0.4532382650029709, + "grad_norm": 1.7084574873374, + "learning_rate": 1.198691062517672e-05, + "loss": 0.2816, + "step": 5721 + }, + { + "epoch": 0.45331748861160626, + "grad_norm": 1.8304227424179893, + "learning_rate": 1.1984395689153195e-05, + "loss": 0.1627, + "step": 5722 + }, + { + "epoch": 0.45339671222024164, + "grad_norm": 1.4105149923080635, + "learning_rate": 1.1981880622466759e-05, + "loss": 0.2155, + "step": 5723 + }, + { + "epoch": 0.453475935828877, + "grad_norm": 1.6962681680234089, + "learning_rate": 1.1979365425283022e-05, + "loss": 0.271, + "step": 5724 + }, + { + "epoch": 0.4535551594375124, + "grad_norm": 1.8799150847503883, + "learning_rate": 1.1976850097767598e-05, + "loss": 0.2485, + "step": 5725 + }, + { + "epoch": 0.4536343830461477, + "grad_norm": 1.7389361906903575, + "learning_rate": 1.1974334640086104e-05, + "loss": 0.2526, + "step": 5726 + }, + { + "epoch": 0.4537136066547831, + "grad_norm": 1.8988759161283906, + "learning_rate": 1.1971819052404177e-05, + "loss": 0.1956, + "step": 5727 + }, + { + "epoch": 0.4537928302634185, + "grad_norm": 1.798914781621013, + "learning_rate": 1.196930333488745e-05, + "loss": 0.19, + "step": 5728 + }, + { + "epoch": 0.45387205387205387, + "grad_norm": 2.107663434616877, + "learning_rate": 1.1966787487701577e-05, + "loss": 0.3069, + "step": 5729 + }, + { + "epoch": 0.45395127748068925, + "grad_norm": 2.2034162652334, + "learning_rate": 1.1964271511012208e-05, + "loss": 0.1783, + "step": 5730 + }, + { + "epoch": 0.45403050108932463, + "grad_norm": 2.3184036760138027, + "learning_rate": 1.1961755404985015e-05, + "loss": 0.3463, + "step": 5731 + }, + { + "epoch": 0.45410972469796, + "grad_norm": 2.1791121681031655, + "learning_rate": 1.1959239169785668e-05, + "loss": 0.3954, + "step": 5732 + }, + { + "epoch": 0.45418894830659534, + "grad_norm": 1.6628015567935317, + "learning_rate": 1.1956722805579846e-05, + "loss": 0.2001, + "step": 5733 + }, + { + "epoch": 0.4542681719152307, + "grad_norm": 2.077869327403923, + "learning_rate": 1.1954206312533246e-05, + "loss": 0.2385, + "step": 5734 + }, + { + "epoch": 0.4543473955238661, + "grad_norm": 2.2045005215714255, + "learning_rate": 1.1951689690811558e-05, + "loss": 0.2856, + "step": 5735 + }, + { + "epoch": 0.4544266191325015, + "grad_norm": 1.8371638284667358, + "learning_rate": 1.1949172940580498e-05, + "loss": 0.3068, + "step": 5736 + }, + { + "epoch": 0.45450584274113687, + "grad_norm": 1.8522300201654076, + "learning_rate": 1.1946656062005781e-05, + "loss": 0.2227, + "step": 5737 + }, + { + "epoch": 0.45458506634977225, + "grad_norm": 1.7320302182338698, + "learning_rate": 1.1944139055253126e-05, + "loss": 0.2321, + "step": 5738 + }, + { + "epoch": 0.45466428995840763, + "grad_norm": 1.5701091947575414, + "learning_rate": 1.1941621920488271e-05, + "loss": 0.2887, + "step": 5739 + }, + { + "epoch": 0.45474351356704296, + "grad_norm": 1.8035475755157526, + "learning_rate": 1.1939104657876953e-05, + "loss": 0.3246, + "step": 5740 + }, + { + "epoch": 0.45482273717567834, + "grad_norm": 1.506252713456613, + "learning_rate": 1.1936587267584924e-05, + "loss": 0.2165, + "step": 5741 + }, + { + "epoch": 0.4549019607843137, + "grad_norm": 1.5025311652810647, + "learning_rate": 1.193406974977794e-05, + "loss": 0.2393, + "step": 5742 + }, + { + "epoch": 0.4549811843929491, + "grad_norm": 1.55815729235976, + "learning_rate": 1.1931552104621776e-05, + "loss": 0.2478, + "step": 5743 + }, + { + "epoch": 0.4550604080015845, + "grad_norm": 1.6653861728121144, + "learning_rate": 1.1929034332282192e-05, + "loss": 0.2436, + "step": 5744 + }, + { + "epoch": 0.45513963161021986, + "grad_norm": 1.6945743023892634, + "learning_rate": 1.1926516432924984e-05, + "loss": 0.2689, + "step": 5745 + }, + { + "epoch": 0.45521885521885525, + "grad_norm": 1.5850622647354944, + "learning_rate": 1.1923998406715937e-05, + "loss": 0.1872, + "step": 5746 + }, + { + "epoch": 0.45529807882749057, + "grad_norm": 1.9430325418481773, + "learning_rate": 1.1921480253820852e-05, + "loss": 0.2715, + "step": 5747 + }, + { + "epoch": 0.45537730243612595, + "grad_norm": 1.8173887897524583, + "learning_rate": 1.1918961974405539e-05, + "loss": 0.2754, + "step": 5748 + }, + { + "epoch": 0.45545652604476133, + "grad_norm": 1.280313438106088, + "learning_rate": 1.1916443568635812e-05, + "loss": 0.1448, + "step": 5749 + }, + { + "epoch": 0.4555357496533967, + "grad_norm": 1.8065914795032998, + "learning_rate": 1.1913925036677497e-05, + "loss": 0.2318, + "step": 5750 + }, + { + "epoch": 0.4556149732620321, + "grad_norm": 1.3941230617962175, + "learning_rate": 1.191140637869643e-05, + "loss": 0.1908, + "step": 5751 + }, + { + "epoch": 0.4556941968706675, + "grad_norm": 1.8012365446164473, + "learning_rate": 1.1908887594858447e-05, + "loss": 0.3145, + "step": 5752 + }, + { + "epoch": 0.45577342047930286, + "grad_norm": 2.003655272890045, + "learning_rate": 1.1906368685329403e-05, + "loss": 0.3109, + "step": 5753 + }, + { + "epoch": 0.4558526440879382, + "grad_norm": 1.7791427389185093, + "learning_rate": 1.1903849650275154e-05, + "loss": 0.2439, + "step": 5754 + }, + { + "epoch": 0.45593186769657357, + "grad_norm": 1.8198868020074712, + "learning_rate": 1.1901330489861564e-05, + "loss": 0.3041, + "step": 5755 + }, + { + "epoch": 0.45601109130520895, + "grad_norm": 1.8983324369961625, + "learning_rate": 1.1898811204254515e-05, + "loss": 0.2702, + "step": 5756 + }, + { + "epoch": 0.45609031491384433, + "grad_norm": 1.6998132353877888, + "learning_rate": 1.189629179361988e-05, + "loss": 0.2099, + "step": 5757 + }, + { + "epoch": 0.4561695385224797, + "grad_norm": 1.770023711048656, + "learning_rate": 1.1893772258123554e-05, + "loss": 0.2208, + "step": 5758 + }, + { + "epoch": 0.4562487621311151, + "grad_norm": 1.5521802466294234, + "learning_rate": 1.1891252597931441e-05, + "loss": 0.1861, + "step": 5759 + }, + { + "epoch": 0.4563279857397504, + "grad_norm": 2.0872186412203098, + "learning_rate": 1.1888732813209442e-05, + "loss": 0.3252, + "step": 5760 + }, + { + "epoch": 0.4564072093483858, + "grad_norm": 1.7383376525179475, + "learning_rate": 1.1886212904123477e-05, + "loss": 0.2736, + "step": 5761 + }, + { + "epoch": 0.4564864329570212, + "grad_norm": 1.4851146181572743, + "learning_rate": 1.1883692870839466e-05, + "loss": 0.2334, + "step": 5762 + }, + { + "epoch": 0.45656565656565656, + "grad_norm": 1.5889724507244454, + "learning_rate": 1.1881172713523346e-05, + "loss": 0.1323, + "step": 5763 + }, + { + "epoch": 0.45664488017429194, + "grad_norm": 2.1243043322113913, + "learning_rate": 1.1878652432341053e-05, + "loss": 0.2817, + "step": 5764 + }, + { + "epoch": 0.4567241037829273, + "grad_norm": 1.7035316138862764, + "learning_rate": 1.1876132027458535e-05, + "loss": 0.2476, + "step": 5765 + }, + { + "epoch": 0.4568033273915627, + "grad_norm": 1.758939142378042, + "learning_rate": 1.1873611499041752e-05, + "loss": 0.2034, + "step": 5766 + }, + { + "epoch": 0.45688255100019803, + "grad_norm": 1.6959868516346108, + "learning_rate": 1.1871090847256667e-05, + "loss": 0.2186, + "step": 5767 + }, + { + "epoch": 0.4569617746088334, + "grad_norm": 1.632204906295364, + "learning_rate": 1.1868570072269252e-05, + "loss": 0.2214, + "step": 5768 + }, + { + "epoch": 0.4570409982174688, + "grad_norm": 1.8184560474306608, + "learning_rate": 1.186604917424549e-05, + "loss": 0.2635, + "step": 5769 + }, + { + "epoch": 0.4571202218261042, + "grad_norm": 1.693087665826069, + "learning_rate": 1.1863528153351369e-05, + "loss": 0.2084, + "step": 5770 + }, + { + "epoch": 0.45719944543473956, + "grad_norm": 2.068639291248907, + "learning_rate": 1.1861007009752884e-05, + "loss": 0.3096, + "step": 5771 + }, + { + "epoch": 0.45727866904337494, + "grad_norm": 1.6351346272512304, + "learning_rate": 1.1858485743616044e-05, + "loss": 0.2158, + "step": 5772 + }, + { + "epoch": 0.4573578926520103, + "grad_norm": 2.0681240053978005, + "learning_rate": 1.185596435510686e-05, + "loss": 0.2142, + "step": 5773 + }, + { + "epoch": 0.45743711626064565, + "grad_norm": 1.687055194181285, + "learning_rate": 1.1853442844391354e-05, + "loss": 0.2101, + "step": 5774 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 2.1951786792761205, + "learning_rate": 1.1850921211635554e-05, + "loss": 0.3035, + "step": 5775 + }, + { + "epoch": 0.4575955634779164, + "grad_norm": 1.6212772413129573, + "learning_rate": 1.1848399457005496e-05, + "loss": 0.2268, + "step": 5776 + }, + { + "epoch": 0.4576747870865518, + "grad_norm": 2.1000464880750704, + "learning_rate": 1.1845877580667232e-05, + "loss": 0.1641, + "step": 5777 + }, + { + "epoch": 0.4577540106951872, + "grad_norm": 1.8810821539518814, + "learning_rate": 1.1843355582786806e-05, + "loss": 0.2251, + "step": 5778 + }, + { + "epoch": 0.45783323430382256, + "grad_norm": 2.395428251726005, + "learning_rate": 1.1840833463530289e-05, + "loss": 0.3123, + "step": 5779 + }, + { + "epoch": 0.45791245791245794, + "grad_norm": 1.7391295158909688, + "learning_rate": 1.1838311223063745e-05, + "loss": 0.3127, + "step": 5780 + }, + { + "epoch": 0.45799168152109326, + "grad_norm": 1.967443189627691, + "learning_rate": 1.1835788861553252e-05, + "loss": 0.3024, + "step": 5781 + }, + { + "epoch": 0.45807090512972864, + "grad_norm": 1.8133512046860787, + "learning_rate": 1.1833266379164894e-05, + "loss": 0.2106, + "step": 5782 + }, + { + "epoch": 0.458150128738364, + "grad_norm": 1.4623703715386362, + "learning_rate": 1.183074377606477e-05, + "loss": 0.2107, + "step": 5783 + }, + { + "epoch": 0.4582293523469994, + "grad_norm": 1.9096201908070127, + "learning_rate": 1.1828221052418973e-05, + "loss": 0.2938, + "step": 5784 + }, + { + "epoch": 0.4583085759556348, + "grad_norm": 1.6753449934392568, + "learning_rate": 1.182569820839362e-05, + "loss": 0.202, + "step": 5785 + }, + { + "epoch": 0.45838779956427017, + "grad_norm": 1.8242711819879487, + "learning_rate": 1.1823175244154823e-05, + "loss": 0.2697, + "step": 5786 + }, + { + "epoch": 0.45846702317290555, + "grad_norm": 1.4081247640611776, + "learning_rate": 1.1820652159868706e-05, + "loss": 0.2109, + "step": 5787 + }, + { + "epoch": 0.4585462467815409, + "grad_norm": 1.529074088770382, + "learning_rate": 1.1818128955701409e-05, + "loss": 0.1721, + "step": 5788 + }, + { + "epoch": 0.45862547039017626, + "grad_norm": 1.4828488754206717, + "learning_rate": 1.1815605631819066e-05, + "loss": 0.2021, + "step": 5789 + }, + { + "epoch": 0.45870469399881164, + "grad_norm": 1.806729503536785, + "learning_rate": 1.181308218838783e-05, + "loss": 0.3213, + "step": 5790 + }, + { + "epoch": 0.458783917607447, + "grad_norm": 1.5487741005707878, + "learning_rate": 1.1810558625573856e-05, + "loss": 0.1864, + "step": 5791 + }, + { + "epoch": 0.4588631412160824, + "grad_norm": 1.7392990868517602, + "learning_rate": 1.1808034943543308e-05, + "loss": 0.2683, + "step": 5792 + }, + { + "epoch": 0.4589423648247178, + "grad_norm": 1.6541046452481614, + "learning_rate": 1.1805511142462355e-05, + "loss": 0.1611, + "step": 5793 + }, + { + "epoch": 0.4590215884333531, + "grad_norm": 1.5821801754425744, + "learning_rate": 1.1802987222497186e-05, + "loss": 0.2217, + "step": 5794 + }, + { + "epoch": 0.4591008120419885, + "grad_norm": 1.8139946303305532, + "learning_rate": 1.1800463183813982e-05, + "loss": 0.2898, + "step": 5795 + }, + { + "epoch": 0.4591800356506239, + "grad_norm": 1.8933410343974801, + "learning_rate": 1.1797939026578941e-05, + "loss": 0.2746, + "step": 5796 + }, + { + "epoch": 0.45925925925925926, + "grad_norm": 1.8992641843732025, + "learning_rate": 1.1795414750958265e-05, + "loss": 0.3166, + "step": 5797 + }, + { + "epoch": 0.45933848286789464, + "grad_norm": 1.7989903007294956, + "learning_rate": 1.1792890357118165e-05, + "loss": 0.317, + "step": 5798 + }, + { + "epoch": 0.45941770647653, + "grad_norm": 1.5819186653080919, + "learning_rate": 1.1790365845224866e-05, + "loss": 0.2095, + "step": 5799 + }, + { + "epoch": 0.4594969300851654, + "grad_norm": 1.7439990149458584, + "learning_rate": 1.1787841215444588e-05, + "loss": 0.3441, + "step": 5800 + }, + { + "epoch": 0.4595761536938007, + "grad_norm": 2.035666682458812, + "learning_rate": 1.1785316467943568e-05, + "loss": 0.2876, + "step": 5801 + }, + { + "epoch": 0.4596553773024361, + "grad_norm": 1.6470474336274994, + "learning_rate": 1.1782791602888052e-05, + "loss": 0.2947, + "step": 5802 + }, + { + "epoch": 0.4597346009110715, + "grad_norm": 2.111066612780628, + "learning_rate": 1.1780266620444285e-05, + "loss": 0.1849, + "step": 5803 + }, + { + "epoch": 0.45981382451970687, + "grad_norm": 1.4749315806855938, + "learning_rate": 1.1777741520778529e-05, + "loss": 0.1852, + "step": 5804 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 1.808560130929128, + "learning_rate": 1.1775216304057046e-05, + "loss": 0.3892, + "step": 5805 + }, + { + "epoch": 0.45997227173697763, + "grad_norm": 1.3150678294117604, + "learning_rate": 1.1772690970446113e-05, + "loss": 0.161, + "step": 5806 + }, + { + "epoch": 0.460051495345613, + "grad_norm": 1.5040489890375237, + "learning_rate": 1.177016552011201e-05, + "loss": 0.2207, + "step": 5807 + }, + { + "epoch": 0.46013071895424834, + "grad_norm": 1.3479858677554062, + "learning_rate": 1.176763995322102e-05, + "loss": 0.1882, + "step": 5808 + }, + { + "epoch": 0.4602099425628837, + "grad_norm": 1.2297739170073987, + "learning_rate": 1.1765114269939448e-05, + "loss": 0.2293, + "step": 5809 + }, + { + "epoch": 0.4602891661715191, + "grad_norm": 1.5554855153406992, + "learning_rate": 1.1762588470433593e-05, + "loss": 0.2815, + "step": 5810 + }, + { + "epoch": 0.4603683897801545, + "grad_norm": 1.7346957038526587, + "learning_rate": 1.176006255486977e-05, + "loss": 0.2792, + "step": 5811 + }, + { + "epoch": 0.46044761338878987, + "grad_norm": 1.871493236041805, + "learning_rate": 1.1757536523414297e-05, + "loss": 0.2401, + "step": 5812 + }, + { + "epoch": 0.46052683699742525, + "grad_norm": 1.775567559196208, + "learning_rate": 1.1755010376233498e-05, + "loss": 0.3088, + "step": 5813 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 1.5104027255890675, + "learning_rate": 1.175248411349371e-05, + "loss": 0.1875, + "step": 5814 + }, + { + "epoch": 0.46068528421469596, + "grad_norm": 1.678974569795767, + "learning_rate": 1.1749957735361279e-05, + "loss": 0.2723, + "step": 5815 + }, + { + "epoch": 0.46076450782333134, + "grad_norm": 1.8460194062530695, + "learning_rate": 1.174743124200255e-05, + "loss": 0.2761, + "step": 5816 + }, + { + "epoch": 0.4608437314319667, + "grad_norm": 1.505189384180577, + "learning_rate": 1.1744904633583883e-05, + "loss": 0.1988, + "step": 5817 + }, + { + "epoch": 0.4609229550406021, + "grad_norm": 1.839112409772032, + "learning_rate": 1.1742377910271638e-05, + "loss": 0.2902, + "step": 5818 + }, + { + "epoch": 0.4610021786492375, + "grad_norm": 1.9124512564277765, + "learning_rate": 1.1739851072232195e-05, + "loss": 0.157, + "step": 5819 + }, + { + "epoch": 0.46108140225787286, + "grad_norm": 1.353703020989209, + "learning_rate": 1.1737324119631927e-05, + "loss": 0.1562, + "step": 5820 + }, + { + "epoch": 0.46116062586650824, + "grad_norm": 2.017406980376989, + "learning_rate": 1.173479705263723e-05, + "loss": 0.3418, + "step": 5821 + }, + { + "epoch": 0.46123984947514357, + "grad_norm": 1.3243514393275548, + "learning_rate": 1.1732269871414492e-05, + "loss": 0.1615, + "step": 5822 + }, + { + "epoch": 0.46131907308377895, + "grad_norm": 1.6905790301471357, + "learning_rate": 1.1729742576130119e-05, + "loss": 0.292, + "step": 5823 + }, + { + "epoch": 0.46139829669241433, + "grad_norm": 1.9603246118344408, + "learning_rate": 1.1727215166950519e-05, + "loss": 0.2565, + "step": 5824 + }, + { + "epoch": 0.4614775203010497, + "grad_norm": 1.917623176336931, + "learning_rate": 1.172468764404211e-05, + "loss": 0.2333, + "step": 5825 + }, + { + "epoch": 0.4615567439096851, + "grad_norm": 1.9398954303630949, + "learning_rate": 1.172216000757132e-05, + "loss": 0.2284, + "step": 5826 + }, + { + "epoch": 0.4616359675183205, + "grad_norm": 1.710060950505006, + "learning_rate": 1.1719632257704581e-05, + "loss": 0.2126, + "step": 5827 + }, + { + "epoch": 0.46171519112695586, + "grad_norm": 1.684134186464337, + "learning_rate": 1.171710439460833e-05, + "loss": 0.2009, + "step": 5828 + }, + { + "epoch": 0.4617944147355912, + "grad_norm": 1.7573281326112857, + "learning_rate": 1.1714576418449017e-05, + "loss": 0.2503, + "step": 5829 + }, + { + "epoch": 0.46187363834422657, + "grad_norm": 1.8429373792776358, + "learning_rate": 1.1712048329393097e-05, + "loss": 0.3519, + "step": 5830 + }, + { + "epoch": 0.46195286195286195, + "grad_norm": 1.2035485057140314, + "learning_rate": 1.1709520127607035e-05, + "loss": 0.1374, + "step": 5831 + }, + { + "epoch": 0.46203208556149733, + "grad_norm": 1.4395536329198, + "learning_rate": 1.1706991813257295e-05, + "loss": 0.173, + "step": 5832 + }, + { + "epoch": 0.4621113091701327, + "grad_norm": 1.5777412670779698, + "learning_rate": 1.1704463386510358e-05, + "loss": 0.2446, + "step": 5833 + }, + { + "epoch": 0.4621905327787681, + "grad_norm": 1.2441828370826344, + "learning_rate": 1.170193484753271e-05, + "loss": 0.1502, + "step": 5834 + }, + { + "epoch": 0.4622697563874034, + "grad_norm": 1.8554216741937737, + "learning_rate": 1.169940619649084e-05, + "loss": 0.3251, + "step": 5835 + }, + { + "epoch": 0.4623489799960388, + "grad_norm": 2.1197263971540337, + "learning_rate": 1.1696877433551248e-05, + "loss": 0.3231, + "step": 5836 + }, + { + "epoch": 0.4624282036046742, + "grad_norm": 1.28691621339965, + "learning_rate": 1.1694348558880447e-05, + "loss": 0.1736, + "step": 5837 + }, + { + "epoch": 0.46250742721330956, + "grad_norm": 1.6761553157715703, + "learning_rate": 1.1691819572644941e-05, + "loss": 0.2948, + "step": 5838 + }, + { + "epoch": 0.46258665082194494, + "grad_norm": 1.6913002529398702, + "learning_rate": 1.1689290475011258e-05, + "loss": 0.3126, + "step": 5839 + }, + { + "epoch": 0.4626658744305803, + "grad_norm": 1.6965682391039474, + "learning_rate": 1.1686761266145926e-05, + "loss": 0.2546, + "step": 5840 + }, + { + "epoch": 0.4627450980392157, + "grad_norm": 1.5356746583630596, + "learning_rate": 1.1684231946215478e-05, + "loss": 0.1902, + "step": 5841 + }, + { + "epoch": 0.46282432164785103, + "grad_norm": 1.4642926616693295, + "learning_rate": 1.1681702515386466e-05, + "loss": 0.2615, + "step": 5842 + }, + { + "epoch": 0.4629035452564864, + "grad_norm": 1.463419818752926, + "learning_rate": 1.167917297382543e-05, + "loss": 0.2114, + "step": 5843 + }, + { + "epoch": 0.4629827688651218, + "grad_norm": 1.76563390471955, + "learning_rate": 1.1676643321698934e-05, + "loss": 0.223, + "step": 5844 + }, + { + "epoch": 0.4630619924737572, + "grad_norm": 1.629574698200466, + "learning_rate": 1.1674113559173548e-05, + "loss": 0.2726, + "step": 5845 + }, + { + "epoch": 0.46314121608239256, + "grad_norm": 1.7164989306621083, + "learning_rate": 1.1671583686415833e-05, + "loss": 0.2551, + "step": 5846 + }, + { + "epoch": 0.46322043969102794, + "grad_norm": 2.1183135356475864, + "learning_rate": 1.1669053703592381e-05, + "loss": 0.3137, + "step": 5847 + }, + { + "epoch": 0.4632996632996633, + "grad_norm": 1.8691419432454603, + "learning_rate": 1.1666523610869769e-05, + "loss": 0.2881, + "step": 5848 + }, + { + "epoch": 0.46337888690829865, + "grad_norm": 1.7760078898439706, + "learning_rate": 1.1663993408414597e-05, + "loss": 0.2845, + "step": 5849 + }, + { + "epoch": 0.46345811051693403, + "grad_norm": 1.6125545218339132, + "learning_rate": 1.1661463096393468e-05, + "loss": 0.1401, + "step": 5850 + }, + { + "epoch": 0.4635373341255694, + "grad_norm": 1.549884293010929, + "learning_rate": 1.1658932674972985e-05, + "loss": 0.2693, + "step": 5851 + }, + { + "epoch": 0.4636165577342048, + "grad_norm": 1.6752525928036788, + "learning_rate": 1.1656402144319772e-05, + "loss": 0.1787, + "step": 5852 + }, + { + "epoch": 0.4636957813428402, + "grad_norm": 1.4066489473205002, + "learning_rate": 1.1653871504600445e-05, + "loss": 0.1911, + "step": 5853 + }, + { + "epoch": 0.46377500495147556, + "grad_norm": 2.0172052296744876, + "learning_rate": 1.1651340755981634e-05, + "loss": 0.3093, + "step": 5854 + }, + { + "epoch": 0.46385422856011094, + "grad_norm": 1.7226386063640242, + "learning_rate": 1.1648809898629987e-05, + "loss": 0.2696, + "step": 5855 + }, + { + "epoch": 0.46393345216874626, + "grad_norm": 1.673781646983445, + "learning_rate": 1.1646278932712138e-05, + "loss": 0.311, + "step": 5856 + }, + { + "epoch": 0.46401267577738164, + "grad_norm": 1.5792945001326468, + "learning_rate": 1.1643747858394743e-05, + "loss": 0.2205, + "step": 5857 + }, + { + "epoch": 0.464091899386017, + "grad_norm": 1.3162905433832417, + "learning_rate": 1.1641216675844461e-05, + "loss": 0.1358, + "step": 5858 + }, + { + "epoch": 0.4641711229946524, + "grad_norm": 1.793623245600044, + "learning_rate": 1.1638685385227958e-05, + "loss": 0.2384, + "step": 5859 + }, + { + "epoch": 0.4642503466032878, + "grad_norm": 2.2250733701941896, + "learning_rate": 1.1636153986711906e-05, + "loss": 0.3525, + "step": 5860 + }, + { + "epoch": 0.46432957021192317, + "grad_norm": 1.2837086196666392, + "learning_rate": 1.163362248046299e-05, + "loss": 0.194, + "step": 5861 + }, + { + "epoch": 0.46440879382055855, + "grad_norm": 1.6959796571071342, + "learning_rate": 1.1631090866647891e-05, + "loss": 0.2472, + "step": 5862 + }, + { + "epoch": 0.4644880174291939, + "grad_norm": 1.4252640370044873, + "learning_rate": 1.1628559145433308e-05, + "loss": 0.1967, + "step": 5863 + }, + { + "epoch": 0.46456724103782926, + "grad_norm": 1.4949362407868994, + "learning_rate": 1.1626027316985942e-05, + "loss": 0.2233, + "step": 5864 + }, + { + "epoch": 0.46464646464646464, + "grad_norm": 1.7117151143093574, + "learning_rate": 1.1623495381472499e-05, + "loss": 0.2572, + "step": 5865 + }, + { + "epoch": 0.4647256882551, + "grad_norm": 1.3120864080302828, + "learning_rate": 1.16209633390597e-05, + "loss": 0.1218, + "step": 5866 + }, + { + "epoch": 0.4648049118637354, + "grad_norm": 1.3455892955024618, + "learning_rate": 1.161843118991426e-05, + "loss": 0.1705, + "step": 5867 + }, + { + "epoch": 0.4648841354723708, + "grad_norm": 1.7686439665846643, + "learning_rate": 1.1615898934202917e-05, + "loss": 0.2905, + "step": 5868 + }, + { + "epoch": 0.46496335908100617, + "grad_norm": 1.6428213140958778, + "learning_rate": 1.1613366572092404e-05, + "loss": 0.325, + "step": 5869 + }, + { + "epoch": 0.4650425826896415, + "grad_norm": 1.5073607155567734, + "learning_rate": 1.1610834103749465e-05, + "loss": 0.2388, + "step": 5870 + }, + { + "epoch": 0.4651218062982769, + "grad_norm": 1.654840474182109, + "learning_rate": 1.1608301529340848e-05, + "loss": 0.2729, + "step": 5871 + }, + { + "epoch": 0.46520102990691226, + "grad_norm": 1.3695519826063307, + "learning_rate": 1.1605768849033318e-05, + "loss": 0.2058, + "step": 5872 + }, + { + "epoch": 0.46528025351554764, + "grad_norm": 1.6496276394374094, + "learning_rate": 1.1603236062993635e-05, + "loss": 0.2051, + "step": 5873 + }, + { + "epoch": 0.465359477124183, + "grad_norm": 1.5837955769256125, + "learning_rate": 1.1600703171388572e-05, + "loss": 0.296, + "step": 5874 + }, + { + "epoch": 0.4654387007328184, + "grad_norm": 1.5858161667344535, + "learning_rate": 1.1598170174384907e-05, + "loss": 0.2137, + "step": 5875 + }, + { + "epoch": 0.4655179243414537, + "grad_norm": 2.2044829966044737, + "learning_rate": 1.1595637072149424e-05, + "loss": 0.3454, + "step": 5876 + }, + { + "epoch": 0.4655971479500891, + "grad_norm": 1.5751336853180717, + "learning_rate": 1.159310386484892e-05, + "loss": 0.244, + "step": 5877 + }, + { + "epoch": 0.4656763715587245, + "grad_norm": 1.4961983415381441, + "learning_rate": 1.159057055265019e-05, + "loss": 0.2294, + "step": 5878 + }, + { + "epoch": 0.46575559516735987, + "grad_norm": 2.2327542940296157, + "learning_rate": 1.1588037135720043e-05, + "loss": 0.293, + "step": 5879 + }, + { + "epoch": 0.46583481877599525, + "grad_norm": 1.5540430246777075, + "learning_rate": 1.1585503614225292e-05, + "loss": 0.2275, + "step": 5880 + }, + { + "epoch": 0.46591404238463063, + "grad_norm": 1.7450219670311224, + "learning_rate": 1.1582969988332757e-05, + "loss": 0.1906, + "step": 5881 + }, + { + "epoch": 0.465993265993266, + "grad_norm": 1.5810265163893842, + "learning_rate": 1.1580436258209266e-05, + "loss": 0.2842, + "step": 5882 + }, + { + "epoch": 0.46607248960190134, + "grad_norm": 1.4339155912305483, + "learning_rate": 1.1577902424021653e-05, + "loss": 0.2104, + "step": 5883 + }, + { + "epoch": 0.4661517132105367, + "grad_norm": 1.8259804906940587, + "learning_rate": 1.1575368485936752e-05, + "loss": 0.3006, + "step": 5884 + }, + { + "epoch": 0.4662309368191721, + "grad_norm": 1.7650234823052482, + "learning_rate": 1.1572834444121424e-05, + "loss": 0.318, + "step": 5885 + }, + { + "epoch": 0.4663101604278075, + "grad_norm": 1.6265532601487545, + "learning_rate": 1.157030029874251e-05, + "loss": 0.2348, + "step": 5886 + }, + { + "epoch": 0.46638938403644287, + "grad_norm": 1.858229168855876, + "learning_rate": 1.1567766049966882e-05, + "loss": 0.3115, + "step": 5887 + }, + { + "epoch": 0.46646860764507825, + "grad_norm": 1.6417037865221684, + "learning_rate": 1.1565231697961398e-05, + "loss": 0.2361, + "step": 5888 + }, + { + "epoch": 0.46654783125371363, + "grad_norm": 1.7781177373037809, + "learning_rate": 1.1562697242892939e-05, + "loss": 0.2121, + "step": 5889 + }, + { + "epoch": 0.46662705486234896, + "grad_norm": 1.5478601069683544, + "learning_rate": 1.156016268492839e-05, + "loss": 0.2505, + "step": 5890 + }, + { + "epoch": 0.46670627847098434, + "grad_norm": 1.6655163893944778, + "learning_rate": 1.155762802423463e-05, + "loss": 0.2772, + "step": 5891 + }, + { + "epoch": 0.4667855020796197, + "grad_norm": 1.3441881432412468, + "learning_rate": 1.1555093260978562e-05, + "loss": 0.1345, + "step": 5892 + }, + { + "epoch": 0.4668647256882551, + "grad_norm": 1.6913774385342362, + "learning_rate": 1.1552558395327087e-05, + "loss": 0.2196, + "step": 5893 + }, + { + "epoch": 0.4669439492968905, + "grad_norm": 1.5443723649287289, + "learning_rate": 1.155002342744711e-05, + "loss": 0.1774, + "step": 5894 + }, + { + "epoch": 0.46702317290552586, + "grad_norm": 1.8948337041519354, + "learning_rate": 1.1547488357505549e-05, + "loss": 0.2881, + "step": 5895 + }, + { + "epoch": 0.46710239651416124, + "grad_norm": 1.447661717384583, + "learning_rate": 1.1544953185669327e-05, + "loss": 0.2316, + "step": 5896 + }, + { + "epoch": 0.46718162012279657, + "grad_norm": 1.4724782805804155, + "learning_rate": 1.154241791210537e-05, + "loss": 0.1881, + "step": 5897 + }, + { + "epoch": 0.46726084373143195, + "grad_norm": 1.4247219818288188, + "learning_rate": 1.1539882536980616e-05, + "loss": 0.1581, + "step": 5898 + }, + { + "epoch": 0.46734006734006733, + "grad_norm": 1.9894801439545307, + "learning_rate": 1.1537347060462007e-05, + "loss": 0.2828, + "step": 5899 + }, + { + "epoch": 0.4674192909487027, + "grad_norm": 1.61977154934877, + "learning_rate": 1.1534811482716487e-05, + "loss": 0.224, + "step": 5900 + }, + { + "epoch": 0.4674985145573381, + "grad_norm": 1.7842222259088774, + "learning_rate": 1.1532275803911021e-05, + "loss": 0.2586, + "step": 5901 + }, + { + "epoch": 0.4675777381659735, + "grad_norm": 1.8640470406904812, + "learning_rate": 1.1529740024212566e-05, + "loss": 0.2545, + "step": 5902 + }, + { + "epoch": 0.46765696177460886, + "grad_norm": 1.6857648464660526, + "learning_rate": 1.1527204143788086e-05, + "loss": 0.2761, + "step": 5903 + }, + { + "epoch": 0.4677361853832442, + "grad_norm": 2.12366331228866, + "learning_rate": 1.1524668162804566e-05, + "loss": 0.2602, + "step": 5904 + }, + { + "epoch": 0.46781540899187957, + "grad_norm": 2.053666085077014, + "learning_rate": 1.1522132081428982e-05, + "loss": 0.2526, + "step": 5905 + }, + { + "epoch": 0.46789463260051495, + "grad_norm": 1.7862168777528808, + "learning_rate": 1.1519595899828325e-05, + "loss": 0.3094, + "step": 5906 + }, + { + "epoch": 0.46797385620915033, + "grad_norm": 1.479279108155285, + "learning_rate": 1.151705961816959e-05, + "loss": 0.3455, + "step": 5907 + }, + { + "epoch": 0.4680530798177857, + "grad_norm": 2.078889612531891, + "learning_rate": 1.151452323661978e-05, + "loss": 0.2586, + "step": 5908 + }, + { + "epoch": 0.4681323034264211, + "grad_norm": 1.9682274819897958, + "learning_rate": 1.15119867553459e-05, + "loss": 0.2668, + "step": 5909 + }, + { + "epoch": 0.4682115270350565, + "grad_norm": 1.5883886563625909, + "learning_rate": 1.150945017451497e-05, + "loss": 0.2138, + "step": 5910 + }, + { + "epoch": 0.4682907506436918, + "grad_norm": 1.423357455823298, + "learning_rate": 1.1506913494294005e-05, + "loss": 0.1925, + "step": 5911 + }, + { + "epoch": 0.4683699742523272, + "grad_norm": 1.7328076410115028, + "learning_rate": 1.1504376714850041e-05, + "loss": 0.3198, + "step": 5912 + }, + { + "epoch": 0.46844919786096256, + "grad_norm": 1.7696019392216598, + "learning_rate": 1.1501839836350106e-05, + "loss": 0.2678, + "step": 5913 + }, + { + "epoch": 0.46852842146959794, + "grad_norm": 1.559742855009872, + "learning_rate": 1.1499302858961245e-05, + "loss": 0.2619, + "step": 5914 + }, + { + "epoch": 0.4686076450782333, + "grad_norm": 1.4716090861215645, + "learning_rate": 1.1496765782850507e-05, + "loss": 0.1983, + "step": 5915 + }, + { + "epoch": 0.4686868686868687, + "grad_norm": 1.6597534022761087, + "learning_rate": 1.149422860818494e-05, + "loss": 0.2713, + "step": 5916 + }, + { + "epoch": 0.46876609229550403, + "grad_norm": 3.4453795695685083, + "learning_rate": 1.1491691335131614e-05, + "loss": 0.3166, + "step": 5917 + }, + { + "epoch": 0.4688453159041394, + "grad_norm": 1.5004831478370855, + "learning_rate": 1.148915396385759e-05, + "loss": 0.1918, + "step": 5918 + }, + { + "epoch": 0.4689245395127748, + "grad_norm": 1.6768284148802883, + "learning_rate": 1.1486616494529939e-05, + "loss": 0.1988, + "step": 5919 + }, + { + "epoch": 0.4690037631214102, + "grad_norm": 1.5487896672243708, + "learning_rate": 1.1484078927315749e-05, + "loss": 0.2471, + "step": 5920 + }, + { + "epoch": 0.46908298673004556, + "grad_norm": 1.6786426945015251, + "learning_rate": 1.1481541262382102e-05, + "loss": 0.1906, + "step": 5921 + }, + { + "epoch": 0.46916221033868094, + "grad_norm": 1.3558633459947318, + "learning_rate": 1.1479003499896089e-05, + "loss": 0.1621, + "step": 5922 + }, + { + "epoch": 0.4692414339473163, + "grad_norm": 2.1234782517139617, + "learning_rate": 1.1476465640024814e-05, + "loss": 0.2068, + "step": 5923 + }, + { + "epoch": 0.46932065755595165, + "grad_norm": 1.5551094339599982, + "learning_rate": 1.147392768293538e-05, + "loss": 0.2171, + "step": 5924 + }, + { + "epoch": 0.46939988116458703, + "grad_norm": 1.8937768193968965, + "learning_rate": 1.1471389628794902e-05, + "loss": 0.2821, + "step": 5925 + }, + { + "epoch": 0.4694791047732224, + "grad_norm": 1.9901206275209276, + "learning_rate": 1.1468851477770495e-05, + "loss": 0.2766, + "step": 5926 + }, + { + "epoch": 0.4695583283818578, + "grad_norm": 1.69783825328348, + "learning_rate": 1.1466313230029284e-05, + "loss": 0.2788, + "step": 5927 + }, + { + "epoch": 0.4696375519904932, + "grad_norm": 2.0116566575937473, + "learning_rate": 1.1463774885738408e-05, + "loss": 0.2903, + "step": 5928 + }, + { + "epoch": 0.46971677559912856, + "grad_norm": 1.519640718995892, + "learning_rate": 1.1461236445064993e-05, + "loss": 0.2035, + "step": 5929 + }, + { + "epoch": 0.46979599920776394, + "grad_norm": 1.5073273085434886, + "learning_rate": 1.1458697908176194e-05, + "loss": 0.2594, + "step": 5930 + }, + { + "epoch": 0.46987522281639926, + "grad_norm": 1.5409873585497507, + "learning_rate": 1.1456159275239153e-05, + "loss": 0.217, + "step": 5931 + }, + { + "epoch": 0.46995444642503464, + "grad_norm": 2.2870477656879604, + "learning_rate": 1.1453620546421032e-05, + "loss": 0.2058, + "step": 5932 + }, + { + "epoch": 0.47003367003367, + "grad_norm": 1.7464011723006994, + "learning_rate": 1.1451081721888992e-05, + "loss": 0.2147, + "step": 5933 + }, + { + "epoch": 0.4701128936423054, + "grad_norm": 1.7945781383608712, + "learning_rate": 1.1448542801810203e-05, + "loss": 0.2824, + "step": 5934 + }, + { + "epoch": 0.4701921172509408, + "grad_norm": 2.1291540456483418, + "learning_rate": 1.144600378635184e-05, + "loss": 0.232, + "step": 5935 + }, + { + "epoch": 0.47027134085957617, + "grad_norm": 1.8419887876146865, + "learning_rate": 1.1443464675681089e-05, + "loss": 0.2496, + "step": 5936 + }, + { + "epoch": 0.47035056446821155, + "grad_norm": 1.613601013308607, + "learning_rate": 1.1440925469965129e-05, + "loss": 0.2427, + "step": 5937 + }, + { + "epoch": 0.4704297880768469, + "grad_norm": 2.019869067885617, + "learning_rate": 1.1438386169371164e-05, + "loss": 0.3589, + "step": 5938 + }, + { + "epoch": 0.47050901168548226, + "grad_norm": 1.9286486108054686, + "learning_rate": 1.143584677406639e-05, + "loss": 0.3202, + "step": 5939 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 1.7291428537190483, + "learning_rate": 1.1433307284218014e-05, + "loss": 0.321, + "step": 5940 + }, + { + "epoch": 0.470667458902753, + "grad_norm": 1.3536171989075028, + "learning_rate": 1.1430767699993247e-05, + "loss": 0.1746, + "step": 5941 + }, + { + "epoch": 0.4707466825113884, + "grad_norm": 1.615695800394676, + "learning_rate": 1.1428228021559316e-05, + "loss": 0.2482, + "step": 5942 + }, + { + "epoch": 0.4708259061200238, + "grad_norm": 1.3942045357144677, + "learning_rate": 1.142568824908344e-05, + "loss": 0.1579, + "step": 5943 + }, + { + "epoch": 0.47090512972865917, + "grad_norm": 1.5336238529301847, + "learning_rate": 1.1423148382732854e-05, + "loss": 0.2179, + "step": 5944 + }, + { + "epoch": 0.4709843533372945, + "grad_norm": 1.5990051697855165, + "learning_rate": 1.1420608422674793e-05, + "loss": 0.1587, + "step": 5945 + }, + { + "epoch": 0.4710635769459299, + "grad_norm": 1.6327537852486975, + "learning_rate": 1.1418068369076503e-05, + "loss": 0.1887, + "step": 5946 + }, + { + "epoch": 0.47114280055456526, + "grad_norm": 1.5468378558771658, + "learning_rate": 1.1415528222105237e-05, + "loss": 0.2608, + "step": 5947 + }, + { + "epoch": 0.47122202416320064, + "grad_norm": 1.698229243888137, + "learning_rate": 1.1412987981928245e-05, + "loss": 0.2603, + "step": 5948 + }, + { + "epoch": 0.471301247771836, + "grad_norm": 1.9253088422397535, + "learning_rate": 1.1410447648712795e-05, + "loss": 0.3264, + "step": 5949 + }, + { + "epoch": 0.4713804713804714, + "grad_norm": 1.2722499408306336, + "learning_rate": 1.1407907222626156e-05, + "loss": 0.1986, + "step": 5950 + }, + { + "epoch": 0.4714596949891068, + "grad_norm": 2.007166895575673, + "learning_rate": 1.1405366703835596e-05, + "loss": 0.3448, + "step": 5951 + }, + { + "epoch": 0.4715389185977421, + "grad_norm": 1.5887415609918336, + "learning_rate": 1.1402826092508405e-05, + "loss": 0.2881, + "step": 5952 + }, + { + "epoch": 0.4716181422063775, + "grad_norm": 1.5177232071861904, + "learning_rate": 1.1400285388811862e-05, + "loss": 0.1921, + "step": 5953 + }, + { + "epoch": 0.47169736581501287, + "grad_norm": 1.884934397406955, + "learning_rate": 1.1397744592913268e-05, + "loss": 0.4125, + "step": 5954 + }, + { + "epoch": 0.47177658942364825, + "grad_norm": 1.5871815107255094, + "learning_rate": 1.1395203704979915e-05, + "loss": 0.2222, + "step": 5955 + }, + { + "epoch": 0.47185581303228363, + "grad_norm": 1.6146511416264648, + "learning_rate": 1.1392662725179114e-05, + "loss": 0.215, + "step": 5956 + }, + { + "epoch": 0.471935036640919, + "grad_norm": 1.5308400016523158, + "learning_rate": 1.139012165367817e-05, + "loss": 0.224, + "step": 5957 + }, + { + "epoch": 0.47201426024955434, + "grad_norm": 1.8252632599542646, + "learning_rate": 1.1387580490644408e-05, + "loss": 0.2122, + "step": 5958 + }, + { + "epoch": 0.4720934838581897, + "grad_norm": 1.6595038069429957, + "learning_rate": 1.1385039236245143e-05, + "loss": 0.2207, + "step": 5959 + }, + { + "epoch": 0.4721727074668251, + "grad_norm": 1.722176620058178, + "learning_rate": 1.1382497890647712e-05, + "loss": 0.335, + "step": 5960 + }, + { + "epoch": 0.4722519310754605, + "grad_norm": 1.5006086489666106, + "learning_rate": 1.1379956454019445e-05, + "loss": 0.1928, + "step": 5961 + }, + { + "epoch": 0.47233115468409587, + "grad_norm": 1.4487327488831339, + "learning_rate": 1.1377414926527688e-05, + "loss": 0.2295, + "step": 5962 + }, + { + "epoch": 0.47241037829273125, + "grad_norm": 1.522165827725116, + "learning_rate": 1.1374873308339784e-05, + "loss": 0.2508, + "step": 5963 + }, + { + "epoch": 0.47248960190136663, + "grad_norm": 1.7449739248545317, + "learning_rate": 1.1372331599623088e-05, + "loss": 0.292, + "step": 5964 + }, + { + "epoch": 0.47256882551000196, + "grad_norm": 2.524450810797442, + "learning_rate": 1.136978980054496e-05, + "loss": 0.358, + "step": 5965 + }, + { + "epoch": 0.47264804911863734, + "grad_norm": 1.684912949719457, + "learning_rate": 1.1367247911272765e-05, + "loss": 0.2318, + "step": 5966 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 1.511757310609556, + "learning_rate": 1.1364705931973872e-05, + "loss": 0.2249, + "step": 5967 + }, + { + "epoch": 0.4728064963359081, + "grad_norm": 1.4414956899813705, + "learning_rate": 1.1362163862815663e-05, + "loss": 0.2209, + "step": 5968 + }, + { + "epoch": 0.4728857199445435, + "grad_norm": 1.7214169030508515, + "learning_rate": 1.1359621703965516e-05, + "loss": 0.2885, + "step": 5969 + }, + { + "epoch": 0.47296494355317886, + "grad_norm": 1.9327022011563855, + "learning_rate": 1.135707945559082e-05, + "loss": 0.1959, + "step": 5970 + }, + { + "epoch": 0.47304416716181424, + "grad_norm": 1.8496948642432174, + "learning_rate": 1.1354537117858975e-05, + "loss": 0.2989, + "step": 5971 + }, + { + "epoch": 0.47312339077044957, + "grad_norm": 1.6580687628513424, + "learning_rate": 1.1351994690937377e-05, + "loss": 0.2198, + "step": 5972 + }, + { + "epoch": 0.47320261437908495, + "grad_norm": 1.4220018593357153, + "learning_rate": 1.1349452174993437e-05, + "loss": 0.2145, + "step": 5973 + }, + { + "epoch": 0.47328183798772033, + "grad_norm": 1.8503012229022804, + "learning_rate": 1.1346909570194558e-05, + "loss": 0.2079, + "step": 5974 + }, + { + "epoch": 0.4733610615963557, + "grad_norm": 1.6450962999278003, + "learning_rate": 1.134436687670817e-05, + "loss": 0.2154, + "step": 5975 + }, + { + "epoch": 0.4734402852049911, + "grad_norm": 1.526154492673174, + "learning_rate": 1.134182409470169e-05, + "loss": 0.308, + "step": 5976 + }, + { + "epoch": 0.4735195088136265, + "grad_norm": 1.6582651436337914, + "learning_rate": 1.133928122434255e-05, + "loss": 0.2011, + "step": 5977 + }, + { + "epoch": 0.47359873242226186, + "grad_norm": 1.634210934977372, + "learning_rate": 1.1336738265798187e-05, + "loss": 0.2046, + "step": 5978 + }, + { + "epoch": 0.4736779560308972, + "grad_norm": 1.5620389453324508, + "learning_rate": 1.1334195219236039e-05, + "loss": 0.1813, + "step": 5979 + }, + { + "epoch": 0.47375717963953257, + "grad_norm": 1.6976847037543041, + "learning_rate": 1.1331652084823554e-05, + "loss": 0.208, + "step": 5980 + }, + { + "epoch": 0.47383640324816795, + "grad_norm": 1.5696445005943434, + "learning_rate": 1.1329108862728192e-05, + "loss": 0.2, + "step": 5981 + }, + { + "epoch": 0.47391562685680333, + "grad_norm": 2.1081620043564055, + "learning_rate": 1.1326565553117404e-05, + "loss": 0.3016, + "step": 5982 + }, + { + "epoch": 0.4739948504654387, + "grad_norm": 1.5018030278889556, + "learning_rate": 1.1324022156158654e-05, + "loss": 0.178, + "step": 5983 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 1.5284495533440647, + "learning_rate": 1.132147867201942e-05, + "loss": 0.2049, + "step": 5984 + }, + { + "epoch": 0.4741532976827095, + "grad_norm": 1.5992429866247275, + "learning_rate": 1.1318935100867172e-05, + "loss": 0.2148, + "step": 5985 + }, + { + "epoch": 0.4742325212913448, + "grad_norm": 1.3763802567318808, + "learning_rate": 1.1316391442869394e-05, + "loss": 0.1736, + "step": 5986 + }, + { + "epoch": 0.4743117448999802, + "grad_norm": 1.8334936431812883, + "learning_rate": 1.1313847698193577e-05, + "loss": 0.2462, + "step": 5987 + }, + { + "epoch": 0.47439096850861556, + "grad_norm": 1.6878794250562925, + "learning_rate": 1.1311303867007207e-05, + "loss": 0.2422, + "step": 5988 + }, + { + "epoch": 0.47447019211725094, + "grad_norm": 1.468401306601615, + "learning_rate": 1.1308759949477786e-05, + "loss": 0.2444, + "step": 5989 + }, + { + "epoch": 0.4745494157258863, + "grad_norm": 1.664097701219563, + "learning_rate": 1.1306215945772823e-05, + "loss": 0.2615, + "step": 5990 + }, + { + "epoch": 0.4746286393345217, + "grad_norm": 1.6991171924424213, + "learning_rate": 1.1303671856059824e-05, + "loss": 0.2034, + "step": 5991 + }, + { + "epoch": 0.4747078629431571, + "grad_norm": 1.7967240057784828, + "learning_rate": 1.1301127680506305e-05, + "loss": 0.2052, + "step": 5992 + }, + { + "epoch": 0.4747870865517924, + "grad_norm": 2.017587927012344, + "learning_rate": 1.1298583419279792e-05, + "loss": 0.2841, + "step": 5993 + }, + { + "epoch": 0.4748663101604278, + "grad_norm": 1.7319780197345693, + "learning_rate": 1.1296039072547804e-05, + "loss": 0.2915, + "step": 5994 + }, + { + "epoch": 0.4749455337690632, + "grad_norm": 1.6398799752152888, + "learning_rate": 1.1293494640477885e-05, + "loss": 0.2171, + "step": 5995 + }, + { + "epoch": 0.47502475737769856, + "grad_norm": 1.5886829074291113, + "learning_rate": 1.1290950123237564e-05, + "loss": 0.2214, + "step": 5996 + }, + { + "epoch": 0.47510398098633394, + "grad_norm": 1.8176377926816734, + "learning_rate": 1.128840552099439e-05, + "loss": 0.4143, + "step": 5997 + }, + { + "epoch": 0.4751832045949693, + "grad_norm": 1.3686792953141944, + "learning_rate": 1.1285860833915914e-05, + "loss": 0.1308, + "step": 5998 + }, + { + "epoch": 0.47526242820360465, + "grad_norm": 1.4152549698160461, + "learning_rate": 1.1283316062169685e-05, + "loss": 0.1647, + "step": 5999 + }, + { + "epoch": 0.47534165181224003, + "grad_norm": 2.0255083140573436, + "learning_rate": 1.1280771205923269e-05, + "loss": 0.2668, + "step": 6000 + }, + { + "epoch": 0.4754208754208754, + "grad_norm": 1.9224526679175966, + "learning_rate": 1.1278226265344234e-05, + "loss": 0.2819, + "step": 6001 + }, + { + "epoch": 0.4755000990295108, + "grad_norm": 2.536595853740814, + "learning_rate": 1.127568124060015e-05, + "loss": 0.3714, + "step": 6002 + }, + { + "epoch": 0.4755793226381462, + "grad_norm": 1.5779914578874135, + "learning_rate": 1.1273136131858595e-05, + "loss": 0.1928, + "step": 6003 + }, + { + "epoch": 0.47565854624678156, + "grad_norm": 1.7261912670840855, + "learning_rate": 1.1270590939287149e-05, + "loss": 0.2882, + "step": 6004 + }, + { + "epoch": 0.47573776985541694, + "grad_norm": 1.9174987661928171, + "learning_rate": 1.1268045663053404e-05, + "loss": 0.2894, + "step": 6005 + }, + { + "epoch": 0.47581699346405226, + "grad_norm": 1.3406157997038364, + "learning_rate": 1.1265500303324954e-05, + "loss": 0.0995, + "step": 6006 + }, + { + "epoch": 0.47589621707268764, + "grad_norm": 1.9482578310256058, + "learning_rate": 1.12629548602694e-05, + "loss": 0.2356, + "step": 6007 + }, + { + "epoch": 0.475975440681323, + "grad_norm": 1.6795715555418553, + "learning_rate": 1.1260409334054342e-05, + "loss": 0.3422, + "step": 6008 + }, + { + "epoch": 0.4760546642899584, + "grad_norm": 1.741939231866596, + "learning_rate": 1.1257863724847398e-05, + "loss": 0.1875, + "step": 6009 + }, + { + "epoch": 0.4761338878985938, + "grad_norm": 1.6760448139876805, + "learning_rate": 1.1255318032816175e-05, + "loss": 0.2755, + "step": 6010 + }, + { + "epoch": 0.47621311150722917, + "grad_norm": 1.4716612549658066, + "learning_rate": 1.1252772258128303e-05, + "loss": 0.2064, + "step": 6011 + }, + { + "epoch": 0.47629233511586455, + "grad_norm": 1.239437971740309, + "learning_rate": 1.1250226400951408e-05, + "loss": 0.214, + "step": 6012 + }, + { + "epoch": 0.4763715587244999, + "grad_norm": 2.3771186436242586, + "learning_rate": 1.1247680461453114e-05, + "loss": 0.1483, + "step": 6013 + }, + { + "epoch": 0.47645078233313526, + "grad_norm": 1.5484783692359536, + "learning_rate": 1.1245134439801073e-05, + "loss": 0.205, + "step": 6014 + }, + { + "epoch": 0.47653000594177064, + "grad_norm": 1.4540898940185139, + "learning_rate": 1.1242588336162916e-05, + "loss": 0.2185, + "step": 6015 + }, + { + "epoch": 0.476609229550406, + "grad_norm": 1.9212478192320468, + "learning_rate": 1.1240042150706296e-05, + "loss": 0.2741, + "step": 6016 + }, + { + "epoch": 0.4766884531590414, + "grad_norm": 1.6486229658306872, + "learning_rate": 1.1237495883598868e-05, + "loss": 0.2884, + "step": 6017 + }, + { + "epoch": 0.4767676767676768, + "grad_norm": 1.7329416286713635, + "learning_rate": 1.1234949535008289e-05, + "loss": 0.195, + "step": 6018 + }, + { + "epoch": 0.47684690037631217, + "grad_norm": 2.0867858501158154, + "learning_rate": 1.1232403105102226e-05, + "loss": 0.2635, + "step": 6019 + }, + { + "epoch": 0.4769261239849475, + "grad_norm": 1.7178091288967594, + "learning_rate": 1.122985659404835e-05, + "loss": 0.2619, + "step": 6020 + }, + { + "epoch": 0.4770053475935829, + "grad_norm": 1.4308391139353018, + "learning_rate": 1.1227310002014332e-05, + "loss": 0.1804, + "step": 6021 + }, + { + "epoch": 0.47708457120221825, + "grad_norm": 2.0982399160570777, + "learning_rate": 1.1224763329167859e-05, + "loss": 0.2396, + "step": 6022 + }, + { + "epoch": 0.47716379481085364, + "grad_norm": 1.822052514734417, + "learning_rate": 1.122221657567661e-05, + "loss": 0.2148, + "step": 6023 + }, + { + "epoch": 0.477243018419489, + "grad_norm": 1.5489465808995513, + "learning_rate": 1.1219669741708282e-05, + "loss": 0.1746, + "step": 6024 + }, + { + "epoch": 0.4773222420281244, + "grad_norm": 1.869029439141864, + "learning_rate": 1.121712282743057e-05, + "loss": 0.2004, + "step": 6025 + }, + { + "epoch": 0.4774014656367598, + "grad_norm": 2.5212482111338117, + "learning_rate": 1.1214575833011178e-05, + "loss": 0.3236, + "step": 6026 + }, + { + "epoch": 0.4774806892453951, + "grad_norm": 2.152290271707943, + "learning_rate": 1.121202875861781e-05, + "loss": 0.3529, + "step": 6027 + }, + { + "epoch": 0.4775599128540305, + "grad_norm": 1.4701241056569854, + "learning_rate": 1.1209481604418182e-05, + "loss": 0.2313, + "step": 6028 + }, + { + "epoch": 0.47763913646266587, + "grad_norm": 1.63276959640511, + "learning_rate": 1.1206934370580009e-05, + "loss": 0.1793, + "step": 6029 + }, + { + "epoch": 0.47771836007130125, + "grad_norm": 2.013942105110664, + "learning_rate": 1.1204387057271016e-05, + "loss": 0.3182, + "step": 6030 + }, + { + "epoch": 0.47779758367993663, + "grad_norm": 1.6732255072232611, + "learning_rate": 1.1201839664658929e-05, + "loss": 0.2155, + "step": 6031 + }, + { + "epoch": 0.477876807288572, + "grad_norm": 2.133456711702031, + "learning_rate": 1.1199292192911482e-05, + "loss": 0.2971, + "step": 6032 + }, + { + "epoch": 0.47795603089720734, + "grad_norm": 1.7458659587843834, + "learning_rate": 1.1196744642196417e-05, + "loss": 0.3109, + "step": 6033 + }, + { + "epoch": 0.4780352545058427, + "grad_norm": 2.1189440187055797, + "learning_rate": 1.1194197012681473e-05, + "loss": 0.2344, + "step": 6034 + }, + { + "epoch": 0.4781144781144781, + "grad_norm": 1.4454238642304105, + "learning_rate": 1.1191649304534405e-05, + "loss": 0.241, + "step": 6035 + }, + { + "epoch": 0.4781937017231135, + "grad_norm": 1.3915525569937994, + "learning_rate": 1.1189101517922961e-05, + "loss": 0.2091, + "step": 6036 + }, + { + "epoch": 0.47827292533174887, + "grad_norm": 1.5003487423239634, + "learning_rate": 1.1186553653014906e-05, + "loss": 0.2044, + "step": 6037 + }, + { + "epoch": 0.47835214894038425, + "grad_norm": 1.4418556987633147, + "learning_rate": 1.1184005709978002e-05, + "loss": 0.2062, + "step": 6038 + }, + { + "epoch": 0.47843137254901963, + "grad_norm": 1.4113814976968677, + "learning_rate": 1.118145768898002e-05, + "loss": 0.1691, + "step": 6039 + }, + { + "epoch": 0.47851059615765495, + "grad_norm": 2.358713378776793, + "learning_rate": 1.1178909590188731e-05, + "loss": 0.3484, + "step": 6040 + }, + { + "epoch": 0.47858981976629034, + "grad_norm": 1.7370461053292707, + "learning_rate": 1.117636141377192e-05, + "loss": 0.2604, + "step": 6041 + }, + { + "epoch": 0.4786690433749257, + "grad_norm": 1.7287466122509574, + "learning_rate": 1.117381315989737e-05, + "loss": 0.193, + "step": 6042 + }, + { + "epoch": 0.4787482669835611, + "grad_norm": 1.7814864273110071, + "learning_rate": 1.117126482873287e-05, + "loss": 0.2757, + "step": 6043 + }, + { + "epoch": 0.4788274905921965, + "grad_norm": 1.6614476707629502, + "learning_rate": 1.1168716420446219e-05, + "loss": 0.2218, + "step": 6044 + }, + { + "epoch": 0.47890671420083186, + "grad_norm": 1.782252623660112, + "learning_rate": 1.1166167935205214e-05, + "loss": 0.2748, + "step": 6045 + }, + { + "epoch": 0.47898593780946724, + "grad_norm": 1.716919730758376, + "learning_rate": 1.1163619373177663e-05, + "loss": 0.2348, + "step": 6046 + }, + { + "epoch": 0.47906516141810257, + "grad_norm": 1.6608149656562998, + "learning_rate": 1.1161070734531375e-05, + "loss": 0.2386, + "step": 6047 + }, + { + "epoch": 0.47914438502673795, + "grad_norm": 1.6732170941344755, + "learning_rate": 1.1158522019434163e-05, + "loss": 0.2691, + "step": 6048 + }, + { + "epoch": 0.47922360863537333, + "grad_norm": 1.849098697206598, + "learning_rate": 1.1155973228053854e-05, + "loss": 0.25, + "step": 6049 + }, + { + "epoch": 0.4793028322440087, + "grad_norm": 1.8325764324480083, + "learning_rate": 1.1153424360558268e-05, + "loss": 0.3329, + "step": 6050 + }, + { + "epoch": 0.4793820558526441, + "grad_norm": 1.5785672134828717, + "learning_rate": 1.115087541711524e-05, + "loss": 0.2506, + "step": 6051 + }, + { + "epoch": 0.4794612794612795, + "grad_norm": 1.8442646354038115, + "learning_rate": 1.1148326397892601e-05, + "loss": 0.2594, + "step": 6052 + }, + { + "epoch": 0.47954050306991486, + "grad_norm": 1.667486586375052, + "learning_rate": 1.1145777303058197e-05, + "loss": 0.2036, + "step": 6053 + }, + { + "epoch": 0.4796197266785502, + "grad_norm": 1.026187163872373, + "learning_rate": 1.1143228132779867e-05, + "loss": 0.1148, + "step": 6054 + }, + { + "epoch": 0.47969895028718557, + "grad_norm": 1.7192609302667536, + "learning_rate": 1.1140678887225468e-05, + "loss": 0.2826, + "step": 6055 + }, + { + "epoch": 0.47977817389582095, + "grad_norm": 1.4925997087287428, + "learning_rate": 1.1138129566562853e-05, + "loss": 0.226, + "step": 6056 + }, + { + "epoch": 0.47985739750445633, + "grad_norm": 1.6097119124164594, + "learning_rate": 1.1135580170959881e-05, + "loss": 0.2118, + "step": 6057 + }, + { + "epoch": 0.4799366211130917, + "grad_norm": 1.950613791592059, + "learning_rate": 1.1133030700584419e-05, + "loss": 0.26, + "step": 6058 + }, + { + "epoch": 0.4800158447217271, + "grad_norm": 1.9227954063980182, + "learning_rate": 1.1130481155604336e-05, + "loss": 0.2271, + "step": 6059 + }, + { + "epoch": 0.4800950683303625, + "grad_norm": 2.3867979844179055, + "learning_rate": 1.1127931536187511e-05, + "loss": 0.275, + "step": 6060 + }, + { + "epoch": 0.4801742919389978, + "grad_norm": 1.678605624774557, + "learning_rate": 1.1125381842501819e-05, + "loss": 0.2486, + "step": 6061 + }, + { + "epoch": 0.4802535155476332, + "grad_norm": 2.0743673676492382, + "learning_rate": 1.1122832074715149e-05, + "loss": 0.2294, + "step": 6062 + }, + { + "epoch": 0.48033273915626856, + "grad_norm": 2.457692304184814, + "learning_rate": 1.1120282232995389e-05, + "loss": 0.2926, + "step": 6063 + }, + { + "epoch": 0.48041196276490394, + "grad_norm": 1.9550436873200134, + "learning_rate": 1.1117732317510437e-05, + "loss": 0.3064, + "step": 6064 + }, + { + "epoch": 0.4804911863735393, + "grad_norm": 1.7388410603182607, + "learning_rate": 1.111518232842819e-05, + "loss": 0.2295, + "step": 6065 + }, + { + "epoch": 0.4805704099821747, + "grad_norm": 1.915879146878589, + "learning_rate": 1.1112632265916548e-05, + "loss": 0.2852, + "step": 6066 + }, + { + "epoch": 0.4806496335908101, + "grad_norm": 1.7567653643174015, + "learning_rate": 1.1110082130143427e-05, + "loss": 0.2075, + "step": 6067 + }, + { + "epoch": 0.4807288571994454, + "grad_norm": 2.1064041973573646, + "learning_rate": 1.1107531921276742e-05, + "loss": 0.2111, + "step": 6068 + }, + { + "epoch": 0.4808080808080808, + "grad_norm": 1.527472576445534, + "learning_rate": 1.1104981639484404e-05, + "loss": 0.1828, + "step": 6069 + }, + { + "epoch": 0.4808873044167162, + "grad_norm": 1.589179987319218, + "learning_rate": 1.1102431284934345e-05, + "loss": 0.1624, + "step": 6070 + }, + { + "epoch": 0.48096652802535156, + "grad_norm": 2.2218854768320124, + "learning_rate": 1.1099880857794491e-05, + "loss": 0.3398, + "step": 6071 + }, + { + "epoch": 0.48104575163398694, + "grad_norm": 2.1902785408532455, + "learning_rate": 1.1097330358232775e-05, + "loss": 0.3716, + "step": 6072 + }, + { + "epoch": 0.4811249752426223, + "grad_norm": 1.314182865750346, + "learning_rate": 1.1094779786417133e-05, + "loss": 0.1787, + "step": 6073 + }, + { + "epoch": 0.48120419885125765, + "grad_norm": 1.7772959445541636, + "learning_rate": 1.1092229142515512e-05, + "loss": 0.2461, + "step": 6074 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 1.6795739735856607, + "learning_rate": 1.1089678426695854e-05, + "loss": 0.227, + "step": 6075 + }, + { + "epoch": 0.4813626460685284, + "grad_norm": 1.400188539829857, + "learning_rate": 1.1087127639126118e-05, + "loss": 0.1749, + "step": 6076 + }, + { + "epoch": 0.4814418696771638, + "grad_norm": 1.1970582619896746, + "learning_rate": 1.1084576779974257e-05, + "loss": 0.1604, + "step": 6077 + }, + { + "epoch": 0.4815210932857992, + "grad_norm": 1.5901052988698938, + "learning_rate": 1.1082025849408231e-05, + "loss": 0.2058, + "step": 6078 + }, + { + "epoch": 0.48160031689443455, + "grad_norm": 1.6262617370031065, + "learning_rate": 1.1079474847596014e-05, + "loss": 0.3286, + "step": 6079 + }, + { + "epoch": 0.48167954050306994, + "grad_norm": 2.1410384017366617, + "learning_rate": 1.1076923774705568e-05, + "loss": 0.2211, + "step": 6080 + }, + { + "epoch": 0.48175876411170526, + "grad_norm": 1.4889292409739858, + "learning_rate": 1.1074372630904878e-05, + "loss": 0.2556, + "step": 6081 + }, + { + "epoch": 0.48183798772034064, + "grad_norm": 1.4150729657725276, + "learning_rate": 1.1071821416361917e-05, + "loss": 0.1733, + "step": 6082 + }, + { + "epoch": 0.481917211328976, + "grad_norm": 1.3223431311529998, + "learning_rate": 1.106927013124467e-05, + "loss": 0.1653, + "step": 6083 + }, + { + "epoch": 0.4819964349376114, + "grad_norm": 1.5384518921365442, + "learning_rate": 1.1066718775721135e-05, + "loss": 0.1959, + "step": 6084 + }, + { + "epoch": 0.4820756585462468, + "grad_norm": 1.554693220212927, + "learning_rate": 1.1064167349959299e-05, + "loss": 0.1866, + "step": 6085 + }, + { + "epoch": 0.48215488215488217, + "grad_norm": 1.5100450086196078, + "learning_rate": 1.1061615854127165e-05, + "loss": 0.1854, + "step": 6086 + }, + { + "epoch": 0.48223410576351755, + "grad_norm": 1.704183664910275, + "learning_rate": 1.1059064288392733e-05, + "loss": 0.2573, + "step": 6087 + }, + { + "epoch": 0.4823133293721529, + "grad_norm": 1.5950789238437713, + "learning_rate": 1.1056512652924014e-05, + "loss": 0.1999, + "step": 6088 + }, + { + "epoch": 0.48239255298078826, + "grad_norm": 1.470918572830471, + "learning_rate": 1.1053960947889021e-05, + "loss": 0.2014, + "step": 6089 + }, + { + "epoch": 0.48247177658942364, + "grad_norm": 1.926235075911013, + "learning_rate": 1.1051409173455771e-05, + "loss": 0.2001, + "step": 6090 + }, + { + "epoch": 0.482551000198059, + "grad_norm": 1.6414267216068597, + "learning_rate": 1.1048857329792284e-05, + "loss": 0.1506, + "step": 6091 + }, + { + "epoch": 0.4826302238066944, + "grad_norm": 1.9329758126442254, + "learning_rate": 1.1046305417066594e-05, + "loss": 0.2904, + "step": 6092 + }, + { + "epoch": 0.4827094474153298, + "grad_norm": 2.0943448014296564, + "learning_rate": 1.1043753435446722e-05, + "loss": 0.3623, + "step": 6093 + }, + { + "epoch": 0.48278867102396517, + "grad_norm": 1.5800268648588092, + "learning_rate": 1.104120138510071e-05, + "loss": 0.2229, + "step": 6094 + }, + { + "epoch": 0.4828678946326005, + "grad_norm": 1.6608987084190456, + "learning_rate": 1.1038649266196597e-05, + "loss": 0.2665, + "step": 6095 + }, + { + "epoch": 0.4829471182412359, + "grad_norm": 1.5531043338342743, + "learning_rate": 1.1036097078902428e-05, + "loss": 0.2502, + "step": 6096 + }, + { + "epoch": 0.48302634184987125, + "grad_norm": 1.655272962009836, + "learning_rate": 1.1033544823386248e-05, + "loss": 0.2664, + "step": 6097 + }, + { + "epoch": 0.48310556545850664, + "grad_norm": 1.7081587714660573, + "learning_rate": 1.103099249981612e-05, + "loss": 0.1943, + "step": 6098 + }, + { + "epoch": 0.483184789067142, + "grad_norm": 1.460811831054638, + "learning_rate": 1.1028440108360092e-05, + "loss": 0.2384, + "step": 6099 + }, + { + "epoch": 0.4832640126757774, + "grad_norm": 2.474806249133684, + "learning_rate": 1.1025887649186236e-05, + "loss": 0.2603, + "step": 6100 + }, + { + "epoch": 0.4833432362844128, + "grad_norm": 1.5206067052244827, + "learning_rate": 1.1023335122462611e-05, + "loss": 0.2223, + "step": 6101 + }, + { + "epoch": 0.4834224598930481, + "grad_norm": 1.588879697676813, + "learning_rate": 1.102078252835729e-05, + "loss": 0.2085, + "step": 6102 + }, + { + "epoch": 0.4835016835016835, + "grad_norm": 1.8732091504836876, + "learning_rate": 1.1018229867038358e-05, + "loss": 0.2595, + "step": 6103 + }, + { + "epoch": 0.48358090711031887, + "grad_norm": 2.0376741281047543, + "learning_rate": 1.1015677138673882e-05, + "loss": 0.1894, + "step": 6104 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 1.6908396734478435, + "learning_rate": 1.1013124343431955e-05, + "loss": 0.2631, + "step": 6105 + }, + { + "epoch": 0.48373935432758963, + "grad_norm": 1.6279099511622805, + "learning_rate": 1.1010571481480668e-05, + "loss": 0.2899, + "step": 6106 + }, + { + "epoch": 0.483818577936225, + "grad_norm": 1.549894099346484, + "learning_rate": 1.1008018552988109e-05, + "loss": 0.188, + "step": 6107 + }, + { + "epoch": 0.4838978015448604, + "grad_norm": 1.6707462565369033, + "learning_rate": 1.1005465558122382e-05, + "loss": 0.2479, + "step": 6108 + }, + { + "epoch": 0.4839770251534957, + "grad_norm": 1.6452858939134958, + "learning_rate": 1.1002912497051582e-05, + "loss": 0.1465, + "step": 6109 + }, + { + "epoch": 0.4840562487621311, + "grad_norm": 1.6808560749976296, + "learning_rate": 1.1000359369943818e-05, + "loss": 0.192, + "step": 6110 + }, + { + "epoch": 0.4841354723707665, + "grad_norm": 1.8481182705182186, + "learning_rate": 1.099780617696721e-05, + "loss": 0.2779, + "step": 6111 + }, + { + "epoch": 0.48421469597940187, + "grad_norm": 1.573521682443844, + "learning_rate": 1.099525291828986e-05, + "loss": 0.1774, + "step": 6112 + }, + { + "epoch": 0.48429391958803725, + "grad_norm": 1.425349550144392, + "learning_rate": 1.0992699594079896e-05, + "loss": 0.2797, + "step": 6113 + }, + { + "epoch": 0.48437314319667263, + "grad_norm": 2.009487270160793, + "learning_rate": 1.0990146204505444e-05, + "loss": 0.261, + "step": 6114 + }, + { + "epoch": 0.48445236680530795, + "grad_norm": 2.1488999585702393, + "learning_rate": 1.0987592749734624e-05, + "loss": 0.2596, + "step": 6115 + }, + { + "epoch": 0.48453159041394334, + "grad_norm": 2.3107121820592575, + "learning_rate": 1.0985039229935575e-05, + "loss": 0.2907, + "step": 6116 + }, + { + "epoch": 0.4846108140225787, + "grad_norm": 1.746708747349439, + "learning_rate": 1.098248564527643e-05, + "loss": 0.2346, + "step": 6117 + }, + { + "epoch": 0.4846900376312141, + "grad_norm": 1.3520786326928806, + "learning_rate": 1.0979931995925335e-05, + "loss": 0.163, + "step": 6118 + }, + { + "epoch": 0.4847692612398495, + "grad_norm": 1.703245748902048, + "learning_rate": 1.0977378282050436e-05, + "loss": 0.2197, + "step": 6119 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 1.843487097522145, + "learning_rate": 1.0974824503819877e-05, + "loss": 0.2457, + "step": 6120 + }, + { + "epoch": 0.48492770845712024, + "grad_norm": 1.709515263640577, + "learning_rate": 1.0972270661401812e-05, + "loss": 0.2491, + "step": 6121 + }, + { + "epoch": 0.48500693206575557, + "grad_norm": 1.7716834428574988, + "learning_rate": 1.0969716754964408e-05, + "loss": 0.2244, + "step": 6122 + }, + { + "epoch": 0.48508615567439095, + "grad_norm": 1.8040739729171662, + "learning_rate": 1.0967162784675818e-05, + "loss": 0.1685, + "step": 6123 + }, + { + "epoch": 0.48516537928302633, + "grad_norm": 1.6617700510457143, + "learning_rate": 1.0964608750704215e-05, + "loss": 0.2344, + "step": 6124 + }, + { + "epoch": 0.4852446028916617, + "grad_norm": 1.4949449581981267, + "learning_rate": 1.0962054653217764e-05, + "loss": 0.2284, + "step": 6125 + }, + { + "epoch": 0.4853238265002971, + "grad_norm": 1.5779260366131818, + "learning_rate": 1.0959500492384646e-05, + "loss": 0.2089, + "step": 6126 + }, + { + "epoch": 0.4854030501089325, + "grad_norm": 1.5850483205953467, + "learning_rate": 1.0956946268373034e-05, + "loss": 0.2093, + "step": 6127 + }, + { + "epoch": 0.48548227371756786, + "grad_norm": 1.6595133946745808, + "learning_rate": 1.0954391981351117e-05, + "loss": 0.214, + "step": 6128 + }, + { + "epoch": 0.4855614973262032, + "grad_norm": 1.300716622134671, + "learning_rate": 1.0951837631487081e-05, + "loss": 0.139, + "step": 6129 + }, + { + "epoch": 0.48564072093483857, + "grad_norm": 1.7663575799646405, + "learning_rate": 1.0949283218949117e-05, + "loss": 0.3728, + "step": 6130 + }, + { + "epoch": 0.48571994454347395, + "grad_norm": 1.5893907586242662, + "learning_rate": 1.094672874390542e-05, + "loss": 0.2762, + "step": 6131 + }, + { + "epoch": 0.48579916815210933, + "grad_norm": 1.5263839071998777, + "learning_rate": 1.094417420652419e-05, + "loss": 0.264, + "step": 6132 + }, + { + "epoch": 0.4858783917607447, + "grad_norm": 1.7614765577610025, + "learning_rate": 1.0941619606973633e-05, + "loss": 0.2399, + "step": 6133 + }, + { + "epoch": 0.4859576153693801, + "grad_norm": 1.3299154787209122, + "learning_rate": 1.0939064945421953e-05, + "loss": 0.2019, + "step": 6134 + }, + { + "epoch": 0.4860368389780155, + "grad_norm": 1.69326226197989, + "learning_rate": 1.0936510222037368e-05, + "loss": 0.2284, + "step": 6135 + }, + { + "epoch": 0.4861160625866508, + "grad_norm": 1.3867158307142968, + "learning_rate": 1.0933955436988088e-05, + "loss": 0.1856, + "step": 6136 + }, + { + "epoch": 0.4861952861952862, + "grad_norm": 2.09728385427811, + "learning_rate": 1.0931400590442337e-05, + "loss": 0.2487, + "step": 6137 + }, + { + "epoch": 0.48627450980392156, + "grad_norm": 2.4672236442999984, + "learning_rate": 1.0928845682568344e-05, + "loss": 0.3023, + "step": 6138 + }, + { + "epoch": 0.48635373341255694, + "grad_norm": 1.3444689656720372, + "learning_rate": 1.0926290713534324e-05, + "loss": 0.2568, + "step": 6139 + }, + { + "epoch": 0.4864329570211923, + "grad_norm": 1.778188754536545, + "learning_rate": 1.0923735683508521e-05, + "loss": 0.217, + "step": 6140 + }, + { + "epoch": 0.4865121806298277, + "grad_norm": 1.780770270725503, + "learning_rate": 1.092118059265917e-05, + "loss": 0.2628, + "step": 6141 + }, + { + "epoch": 0.4865914042384631, + "grad_norm": 1.4176869598579112, + "learning_rate": 1.0918625441154508e-05, + "loss": 0.1981, + "step": 6142 + }, + { + "epoch": 0.4866706278470984, + "grad_norm": 1.9258852800950363, + "learning_rate": 1.091607022916278e-05, + "loss": 0.3447, + "step": 6143 + }, + { + "epoch": 0.4867498514557338, + "grad_norm": 1.3135352212781661, + "learning_rate": 1.0913514956852236e-05, + "loss": 0.1858, + "step": 6144 + }, + { + "epoch": 0.4868290750643692, + "grad_norm": 1.5677065770661087, + "learning_rate": 1.0910959624391127e-05, + "loss": 0.2227, + "step": 6145 + }, + { + "epoch": 0.48690829867300456, + "grad_norm": 1.9099063126692972, + "learning_rate": 1.090840423194771e-05, + "loss": 0.2988, + "step": 6146 + }, + { + "epoch": 0.48698752228163994, + "grad_norm": 1.6363094178026687, + "learning_rate": 1.0905848779690246e-05, + "loss": 0.2852, + "step": 6147 + }, + { + "epoch": 0.4870667458902753, + "grad_norm": 1.4844642554778813, + "learning_rate": 1.0903293267786998e-05, + "loss": 0.1707, + "step": 6148 + }, + { + "epoch": 0.4871459694989107, + "grad_norm": 1.8019475876848121, + "learning_rate": 1.0900737696406235e-05, + "loss": 0.2145, + "step": 6149 + }, + { + "epoch": 0.48722519310754603, + "grad_norm": 1.8317413813612966, + "learning_rate": 1.0898182065716227e-05, + "loss": 0.2077, + "step": 6150 + }, + { + "epoch": 0.4873044167161814, + "grad_norm": 1.5146055599726223, + "learning_rate": 1.0895626375885255e-05, + "loss": 0.2438, + "step": 6151 + }, + { + "epoch": 0.4873836403248168, + "grad_norm": 1.5366255100365689, + "learning_rate": 1.0893070627081595e-05, + "loss": 0.1401, + "step": 6152 + }, + { + "epoch": 0.4874628639334522, + "grad_norm": 1.777147234266726, + "learning_rate": 1.089051481947353e-05, + "loss": 0.2652, + "step": 6153 + }, + { + "epoch": 0.48754208754208755, + "grad_norm": 1.8967143317189588, + "learning_rate": 1.0887958953229349e-05, + "loss": 0.2517, + "step": 6154 + }, + { + "epoch": 0.48762131115072294, + "grad_norm": 1.1405015618207417, + "learning_rate": 1.0885403028517345e-05, + "loss": 0.1332, + "step": 6155 + }, + { + "epoch": 0.48770053475935826, + "grad_norm": 1.6299641167288337, + "learning_rate": 1.0882847045505809e-05, + "loss": 0.3274, + "step": 6156 + }, + { + "epoch": 0.48777975836799364, + "grad_norm": 1.4423745116920934, + "learning_rate": 1.0880291004363047e-05, + "loss": 0.223, + "step": 6157 + }, + { + "epoch": 0.487858981976629, + "grad_norm": 1.6379693450780084, + "learning_rate": 1.0877734905257354e-05, + "loss": 0.1759, + "step": 6158 + }, + { + "epoch": 0.4879382055852644, + "grad_norm": 1.2827861539072318, + "learning_rate": 1.0875178748357045e-05, + "loss": 0.1468, + "step": 6159 + }, + { + "epoch": 0.4880174291938998, + "grad_norm": 1.593309498979362, + "learning_rate": 1.0872622533830423e-05, + "loss": 0.2166, + "step": 6160 + }, + { + "epoch": 0.48809665280253517, + "grad_norm": 1.6965804035508694, + "learning_rate": 1.0870066261845807e-05, + "loss": 0.2574, + "step": 6161 + }, + { + "epoch": 0.48817587641117055, + "grad_norm": 1.4087665406499363, + "learning_rate": 1.0867509932571517e-05, + "loss": 0.2055, + "step": 6162 + }, + { + "epoch": 0.4882551000198059, + "grad_norm": 1.6566225045751006, + "learning_rate": 1.0864953546175867e-05, + "loss": 0.2288, + "step": 6163 + }, + { + "epoch": 0.48833432362844126, + "grad_norm": 2.0026294389901156, + "learning_rate": 1.0862397102827189e-05, + "loss": 0.3402, + "step": 6164 + }, + { + "epoch": 0.48841354723707664, + "grad_norm": 1.4476077521092667, + "learning_rate": 1.0859840602693813e-05, + "loss": 0.2276, + "step": 6165 + }, + { + "epoch": 0.488492770845712, + "grad_norm": 1.327480250661797, + "learning_rate": 1.0857284045944071e-05, + "loss": 0.1654, + "step": 6166 + }, + { + "epoch": 0.4885719944543474, + "grad_norm": 1.4607350372368566, + "learning_rate": 1.0854727432746302e-05, + "loss": 0.2383, + "step": 6167 + }, + { + "epoch": 0.4886512180629828, + "grad_norm": 1.5882571358781843, + "learning_rate": 1.0852170763268838e-05, + "loss": 0.1409, + "step": 6168 + }, + { + "epoch": 0.48873044167161817, + "grad_norm": 1.6425249451096904, + "learning_rate": 1.0849614037680032e-05, + "loss": 0.2884, + "step": 6169 + }, + { + "epoch": 0.4888096652802535, + "grad_norm": 1.7509850350459937, + "learning_rate": 1.0847057256148234e-05, + "loss": 0.2006, + "step": 6170 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 1.3753872618805856, + "learning_rate": 1.0844500418841788e-05, + "loss": 0.2015, + "step": 6171 + }, + { + "epoch": 0.48896811249752425, + "grad_norm": 2.269210263720901, + "learning_rate": 1.0841943525929053e-05, + "loss": 0.3382, + "step": 6172 + }, + { + "epoch": 0.48904733610615964, + "grad_norm": 1.7117425630790397, + "learning_rate": 1.0839386577578389e-05, + "loss": 0.2681, + "step": 6173 + }, + { + "epoch": 0.489126559714795, + "grad_norm": 1.5168418983762082, + "learning_rate": 1.0836829573958155e-05, + "loss": 0.1915, + "step": 6174 + }, + { + "epoch": 0.4892057833234304, + "grad_norm": 1.5214665522613349, + "learning_rate": 1.083427251523672e-05, + "loss": 0.2751, + "step": 6175 + }, + { + "epoch": 0.4892850069320658, + "grad_norm": 1.823119457155279, + "learning_rate": 1.0831715401582458e-05, + "loss": 0.193, + "step": 6176 + }, + { + "epoch": 0.4893642305407011, + "grad_norm": 1.4058285661576775, + "learning_rate": 1.0829158233163737e-05, + "loss": 0.204, + "step": 6177 + }, + { + "epoch": 0.4894434541493365, + "grad_norm": 1.6781061166192746, + "learning_rate": 1.0826601010148935e-05, + "loss": 0.3637, + "step": 6178 + }, + { + "epoch": 0.48952267775797187, + "grad_norm": 1.822493057629355, + "learning_rate": 1.0824043732706435e-05, + "loss": 0.2831, + "step": 6179 + }, + { + "epoch": 0.48960190136660725, + "grad_norm": 1.827192957488779, + "learning_rate": 1.0821486401004618e-05, + "loss": 0.2764, + "step": 6180 + }, + { + "epoch": 0.48968112497524263, + "grad_norm": 1.3201956023986317, + "learning_rate": 1.0818929015211877e-05, + "loss": 0.2089, + "step": 6181 + }, + { + "epoch": 0.489760348583878, + "grad_norm": 1.591364500543472, + "learning_rate": 1.0816371575496598e-05, + "loss": 0.1856, + "step": 6182 + }, + { + "epoch": 0.4898395721925134, + "grad_norm": 1.5548064694852972, + "learning_rate": 1.081381408202718e-05, + "loss": 0.2305, + "step": 6183 + }, + { + "epoch": 0.4899187958011487, + "grad_norm": 1.8984068867259387, + "learning_rate": 1.0811256534972024e-05, + "loss": 0.2974, + "step": 6184 + }, + { + "epoch": 0.4899980194097841, + "grad_norm": 1.6690255863176764, + "learning_rate": 1.0808698934499524e-05, + "loss": 0.2095, + "step": 6185 + }, + { + "epoch": 0.4900772430184195, + "grad_norm": 1.6384072389275626, + "learning_rate": 1.0806141280778093e-05, + "loss": 0.2383, + "step": 6186 + }, + { + "epoch": 0.49015646662705487, + "grad_norm": 1.3964898991375696, + "learning_rate": 1.0803583573976137e-05, + "loss": 0.1816, + "step": 6187 + }, + { + "epoch": 0.49023569023569025, + "grad_norm": 1.4869059356215997, + "learning_rate": 1.0801025814262068e-05, + "loss": 0.1526, + "step": 6188 + }, + { + "epoch": 0.49031491384432563, + "grad_norm": 1.6555208196856617, + "learning_rate": 1.0798468001804305e-05, + "loss": 0.2435, + "step": 6189 + }, + { + "epoch": 0.490394137452961, + "grad_norm": 1.707388444778532, + "learning_rate": 1.0795910136771266e-05, + "loss": 0.2595, + "step": 6190 + }, + { + "epoch": 0.49047336106159634, + "grad_norm": 1.7413373290567598, + "learning_rate": 1.0793352219331371e-05, + "loss": 0.2422, + "step": 6191 + }, + { + "epoch": 0.4905525846702317, + "grad_norm": 1.6546228157343745, + "learning_rate": 1.0790794249653056e-05, + "loss": 0.2713, + "step": 6192 + }, + { + "epoch": 0.4906318082788671, + "grad_norm": 2.086706856315569, + "learning_rate": 1.0788236227904738e-05, + "loss": 0.2364, + "step": 6193 + }, + { + "epoch": 0.4907110318875025, + "grad_norm": 1.234681659125135, + "learning_rate": 1.0785678154254865e-05, + "loss": 0.1477, + "step": 6194 + }, + { + "epoch": 0.49079025549613786, + "grad_norm": 1.6177756384041073, + "learning_rate": 1.0783120028871858e-05, + "loss": 0.1776, + "step": 6195 + }, + { + "epoch": 0.49086947910477324, + "grad_norm": 1.3683553128734272, + "learning_rate": 1.0780561851924168e-05, + "loss": 0.1628, + "step": 6196 + }, + { + "epoch": 0.49094870271340857, + "grad_norm": 2.4870664651019707, + "learning_rate": 1.0778003623580237e-05, + "loss": 0.2969, + "step": 6197 + }, + { + "epoch": 0.49102792632204395, + "grad_norm": 1.8429099995761549, + "learning_rate": 1.077544534400851e-05, + "loss": 0.2589, + "step": 6198 + }, + { + "epoch": 0.49110714993067933, + "grad_norm": 1.8614813397662913, + "learning_rate": 1.0772887013377438e-05, + "loss": 0.3398, + "step": 6199 + }, + { + "epoch": 0.4911863735393147, + "grad_norm": 1.8831971955690425, + "learning_rate": 1.0770328631855476e-05, + "loss": 0.3528, + "step": 6200 + }, + { + "epoch": 0.4912655971479501, + "grad_norm": 1.8056050666799466, + "learning_rate": 1.0767770199611078e-05, + "loss": 0.2457, + "step": 6201 + }, + { + "epoch": 0.4913448207565855, + "grad_norm": 1.3202956840548274, + "learning_rate": 1.076521171681271e-05, + "loss": 0.1962, + "step": 6202 + }, + { + "epoch": 0.49142404436522086, + "grad_norm": 1.6439221517095415, + "learning_rate": 1.0762653183628831e-05, + "loss": 0.2307, + "step": 6203 + }, + { + "epoch": 0.4915032679738562, + "grad_norm": 1.6337769615802755, + "learning_rate": 1.0760094600227908e-05, + "loss": 0.3183, + "step": 6204 + }, + { + "epoch": 0.49158249158249157, + "grad_norm": 1.5978413299762448, + "learning_rate": 1.0757535966778416e-05, + "loss": 0.1749, + "step": 6205 + }, + { + "epoch": 0.49166171519112695, + "grad_norm": 1.8764726026369707, + "learning_rate": 1.0754977283448824e-05, + "loss": 0.2052, + "step": 6206 + }, + { + "epoch": 0.49174093879976233, + "grad_norm": 1.9308170808395688, + "learning_rate": 1.0752418550407611e-05, + "loss": 0.365, + "step": 6207 + }, + { + "epoch": 0.4918201624083977, + "grad_norm": 1.7649882877851126, + "learning_rate": 1.0749859767823256e-05, + "loss": 0.3075, + "step": 6208 + }, + { + "epoch": 0.4918993860170331, + "grad_norm": 1.5342336819764126, + "learning_rate": 1.0747300935864245e-05, + "loss": 0.2787, + "step": 6209 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 1.6878091554591756, + "learning_rate": 1.074474205469906e-05, + "loss": 0.2376, + "step": 6210 + }, + { + "epoch": 0.4920578332343038, + "grad_norm": 1.74027552563882, + "learning_rate": 1.0742183124496197e-05, + "loss": 0.2972, + "step": 6211 + }, + { + "epoch": 0.4921370568429392, + "grad_norm": 1.8617598576763665, + "learning_rate": 1.0739624145424146e-05, + "loss": 0.2453, + "step": 6212 + }, + { + "epoch": 0.49221628045157456, + "grad_norm": 2.043202923732088, + "learning_rate": 1.0737065117651404e-05, + "loss": 0.2416, + "step": 6213 + }, + { + "epoch": 0.49229550406020994, + "grad_norm": 1.566968725000622, + "learning_rate": 1.0734506041346468e-05, + "loss": 0.2192, + "step": 6214 + }, + { + "epoch": 0.4923747276688453, + "grad_norm": 1.9960030115157306, + "learning_rate": 1.0731946916677847e-05, + "loss": 0.3274, + "step": 6215 + }, + { + "epoch": 0.4924539512774807, + "grad_norm": 1.5474557160628075, + "learning_rate": 1.0729387743814041e-05, + "loss": 0.1648, + "step": 6216 + }, + { + "epoch": 0.4925331748861161, + "grad_norm": 1.8497404877238057, + "learning_rate": 1.0726828522923563e-05, + "loss": 0.2269, + "step": 6217 + }, + { + "epoch": 0.4926123984947514, + "grad_norm": 1.4643826219838463, + "learning_rate": 1.0724269254174921e-05, + "loss": 0.2405, + "step": 6218 + }, + { + "epoch": 0.4926916221033868, + "grad_norm": 1.917819643524853, + "learning_rate": 1.0721709937736638e-05, + "loss": 0.3066, + "step": 6219 + }, + { + "epoch": 0.4927708457120222, + "grad_norm": 1.5417699427322982, + "learning_rate": 1.0719150573777226e-05, + "loss": 0.1755, + "step": 6220 + }, + { + "epoch": 0.49285006932065756, + "grad_norm": 1.7728099455613477, + "learning_rate": 1.071659116246521e-05, + "loss": 0.2175, + "step": 6221 + }, + { + "epoch": 0.49292929292929294, + "grad_norm": 1.8008396342929096, + "learning_rate": 1.0714031703969112e-05, + "loss": 0.2176, + "step": 6222 + }, + { + "epoch": 0.4930085165379283, + "grad_norm": 1.736444090475803, + "learning_rate": 1.0711472198457462e-05, + "loss": 0.2584, + "step": 6223 + }, + { + "epoch": 0.4930877401465637, + "grad_norm": 2.22861107242774, + "learning_rate": 1.0708912646098795e-05, + "loss": 0.2427, + "step": 6224 + }, + { + "epoch": 0.49316696375519903, + "grad_norm": 1.8247391087058051, + "learning_rate": 1.0706353047061638e-05, + "loss": 0.2421, + "step": 6225 + }, + { + "epoch": 0.4932461873638344, + "grad_norm": 1.8661070820992816, + "learning_rate": 1.070379340151453e-05, + "loss": 0.229, + "step": 6226 + }, + { + "epoch": 0.4933254109724698, + "grad_norm": 1.3987662440318664, + "learning_rate": 1.0701233709626018e-05, + "loss": 0.1682, + "step": 6227 + }, + { + "epoch": 0.4934046345811052, + "grad_norm": 1.5510543220236561, + "learning_rate": 1.0698673971564637e-05, + "loss": 0.1478, + "step": 6228 + }, + { + "epoch": 0.49348385818974055, + "grad_norm": 1.6480913304124614, + "learning_rate": 1.0696114187498938e-05, + "loss": 0.2309, + "step": 6229 + }, + { + "epoch": 0.49356308179837594, + "grad_norm": 1.5842404649955728, + "learning_rate": 1.0693554357597469e-05, + "loss": 0.205, + "step": 6230 + }, + { + "epoch": 0.4936423054070113, + "grad_norm": 2.406755385993806, + "learning_rate": 1.069099448202878e-05, + "loss": 0.1801, + "step": 6231 + }, + { + "epoch": 0.49372152901564664, + "grad_norm": 1.9988435795022914, + "learning_rate": 1.0688434560961434e-05, + "loss": 0.2474, + "step": 6232 + }, + { + "epoch": 0.493800752624282, + "grad_norm": 1.950887030012949, + "learning_rate": 1.068587459456398e-05, + "loss": 0.2217, + "step": 6233 + }, + { + "epoch": 0.4938799762329174, + "grad_norm": 1.6089132951068137, + "learning_rate": 1.0683314583004986e-05, + "loss": 0.1949, + "step": 6234 + }, + { + "epoch": 0.4939591998415528, + "grad_norm": 1.8548905958510753, + "learning_rate": 1.0680754526453017e-05, + "loss": 0.2357, + "step": 6235 + }, + { + "epoch": 0.49403842345018817, + "grad_norm": 1.943167464036002, + "learning_rate": 1.0678194425076633e-05, + "loss": 0.3554, + "step": 6236 + }, + { + "epoch": 0.49411764705882355, + "grad_norm": 1.700182530393419, + "learning_rate": 1.0675634279044416e-05, + "loss": 0.189, + "step": 6237 + }, + { + "epoch": 0.4941968706674589, + "grad_norm": 1.6672715305641543, + "learning_rate": 1.0673074088524926e-05, + "loss": 0.2305, + "step": 6238 + }, + { + "epoch": 0.49427609427609426, + "grad_norm": 1.7138439404596937, + "learning_rate": 1.067051385368675e-05, + "loss": 0.1855, + "step": 6239 + }, + { + "epoch": 0.49435531788472964, + "grad_norm": 2.02775499115976, + "learning_rate": 1.0667953574698461e-05, + "loss": 0.2236, + "step": 6240 + }, + { + "epoch": 0.494434541493365, + "grad_norm": 1.520874435187763, + "learning_rate": 1.0665393251728645e-05, + "loss": 0.1385, + "step": 6241 + }, + { + "epoch": 0.4945137651020004, + "grad_norm": 1.274025128194307, + "learning_rate": 1.0662832884945884e-05, + "loss": 0.1908, + "step": 6242 + }, + { + "epoch": 0.4945929887106358, + "grad_norm": 1.2739953099257084, + "learning_rate": 1.0660272474518767e-05, + "loss": 0.1984, + "step": 6243 + }, + { + "epoch": 0.49467221231927117, + "grad_norm": 1.7728928823269645, + "learning_rate": 1.0657712020615885e-05, + "loss": 0.2525, + "step": 6244 + }, + { + "epoch": 0.4947514359279065, + "grad_norm": 1.7285672362809472, + "learning_rate": 1.0655151523405831e-05, + "loss": 0.2871, + "step": 6245 + }, + { + "epoch": 0.4948306595365419, + "grad_norm": 1.3269595406192067, + "learning_rate": 1.06525909830572e-05, + "loss": 0.1822, + "step": 6246 + }, + { + "epoch": 0.49490988314517725, + "grad_norm": 1.300431502959002, + "learning_rate": 1.0650030399738594e-05, + "loss": 0.1724, + "step": 6247 + }, + { + "epoch": 0.49498910675381264, + "grad_norm": 1.2556205744629074, + "learning_rate": 1.0647469773618617e-05, + "loss": 0.1525, + "step": 6248 + }, + { + "epoch": 0.495068330362448, + "grad_norm": 1.4014797610476046, + "learning_rate": 1.0644909104865869e-05, + "loss": 0.2244, + "step": 6249 + }, + { + "epoch": 0.4951475539710834, + "grad_norm": 1.8979961708367814, + "learning_rate": 1.0642348393648956e-05, + "loss": 0.2852, + "step": 6250 + }, + { + "epoch": 0.4952267775797188, + "grad_norm": 1.6449565023773456, + "learning_rate": 1.0639787640136497e-05, + "loss": 0.2212, + "step": 6251 + }, + { + "epoch": 0.4953060011883541, + "grad_norm": 1.6999320369453943, + "learning_rate": 1.0637226844497096e-05, + "loss": 0.2247, + "step": 6252 + }, + { + "epoch": 0.4953852247969895, + "grad_norm": 1.581359536928781, + "learning_rate": 1.0634666006899375e-05, + "loss": 0.1958, + "step": 6253 + }, + { + "epoch": 0.49546444840562487, + "grad_norm": 1.4700221070014095, + "learning_rate": 1.0632105127511952e-05, + "loss": 0.1691, + "step": 6254 + }, + { + "epoch": 0.49554367201426025, + "grad_norm": 1.7447688111101973, + "learning_rate": 1.0629544206503445e-05, + "loss": 0.3034, + "step": 6255 + }, + { + "epoch": 0.49562289562289563, + "grad_norm": 2.0028160642807524, + "learning_rate": 1.0626983244042486e-05, + "loss": 0.1967, + "step": 6256 + }, + { + "epoch": 0.495702119231531, + "grad_norm": 1.9759915357404259, + "learning_rate": 1.0624422240297694e-05, + "loss": 0.3106, + "step": 6257 + }, + { + "epoch": 0.4957813428401664, + "grad_norm": 2.2473552691511673, + "learning_rate": 1.0621861195437703e-05, + "loss": 0.2636, + "step": 6258 + }, + { + "epoch": 0.4958605664488017, + "grad_norm": 1.7080124839243644, + "learning_rate": 1.0619300109631146e-05, + "loss": 0.2223, + "step": 6259 + }, + { + "epoch": 0.4959397900574371, + "grad_norm": 1.9493290404083532, + "learning_rate": 1.0616738983046652e-05, + "loss": 0.2912, + "step": 6260 + }, + { + "epoch": 0.4960190136660725, + "grad_norm": 1.774180884373201, + "learning_rate": 1.0614177815852866e-05, + "loss": 0.2662, + "step": 6261 + }, + { + "epoch": 0.49609823727470787, + "grad_norm": 2.104038298922806, + "learning_rate": 1.0611616608218429e-05, + "loss": 0.2715, + "step": 6262 + }, + { + "epoch": 0.49617746088334325, + "grad_norm": 2.0133323885762953, + "learning_rate": 1.0609055360311978e-05, + "loss": 0.291, + "step": 6263 + }, + { + "epoch": 0.49625668449197863, + "grad_norm": 1.6995673928053876, + "learning_rate": 1.0606494072302164e-05, + "loss": 0.1957, + "step": 6264 + }, + { + "epoch": 0.496335908100614, + "grad_norm": 1.4371850695635577, + "learning_rate": 1.0603932744357632e-05, + "loss": 0.243, + "step": 6265 + }, + { + "epoch": 0.49641513170924934, + "grad_norm": 1.5461293396673728, + "learning_rate": 1.0601371376647034e-05, + "loss": 0.2754, + "step": 6266 + }, + { + "epoch": 0.4964943553178847, + "grad_norm": 2.272889394052342, + "learning_rate": 1.0598809969339028e-05, + "loss": 0.389, + "step": 6267 + }, + { + "epoch": 0.4965735789265201, + "grad_norm": 1.9938155741621413, + "learning_rate": 1.0596248522602264e-05, + "loss": 0.2671, + "step": 6268 + }, + { + "epoch": 0.4966528025351555, + "grad_norm": 1.5923380031549277, + "learning_rate": 1.0593687036605402e-05, + "loss": 0.161, + "step": 6269 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 1.5391093068331152, + "learning_rate": 1.0591125511517108e-05, + "loss": 0.1889, + "step": 6270 + }, + { + "epoch": 0.49681124975242624, + "grad_norm": 1.6143157315481598, + "learning_rate": 1.0588563947506043e-05, + "loss": 0.2365, + "step": 6271 + }, + { + "epoch": 0.49689047336106157, + "grad_norm": 1.6028930820603855, + "learning_rate": 1.0586002344740875e-05, + "loss": 0.2258, + "step": 6272 + }, + { + "epoch": 0.49696969696969695, + "grad_norm": 1.8658451845775084, + "learning_rate": 1.0583440703390271e-05, + "loss": 0.2944, + "step": 6273 + }, + { + "epoch": 0.49704892057833233, + "grad_norm": 1.8139205198244914, + "learning_rate": 1.0580879023622903e-05, + "loss": 0.2128, + "step": 6274 + }, + { + "epoch": 0.4971281441869677, + "grad_norm": 1.6876639323764948, + "learning_rate": 1.0578317305607451e-05, + "loss": 0.2267, + "step": 6275 + }, + { + "epoch": 0.4972073677956031, + "grad_norm": 1.6145051881163168, + "learning_rate": 1.057575554951258e-05, + "loss": 0.2353, + "step": 6276 + }, + { + "epoch": 0.4972865914042385, + "grad_norm": 1.541671363012306, + "learning_rate": 1.0573193755506982e-05, + "loss": 0.1943, + "step": 6277 + }, + { + "epoch": 0.49736581501287386, + "grad_norm": 1.7805912243183721, + "learning_rate": 1.0570631923759331e-05, + "loss": 0.2255, + "step": 6278 + }, + { + "epoch": 0.4974450386215092, + "grad_norm": 1.6582149202006344, + "learning_rate": 1.0568070054438314e-05, + "loss": 0.2408, + "step": 6279 + }, + { + "epoch": 0.49752426223014456, + "grad_norm": 1.542038882128615, + "learning_rate": 1.0565508147712618e-05, + "loss": 0.2329, + "step": 6280 + }, + { + "epoch": 0.49760348583877995, + "grad_norm": 1.9793328516101398, + "learning_rate": 1.056294620375093e-05, + "loss": 0.3322, + "step": 6281 + }, + { + "epoch": 0.49768270944741533, + "grad_norm": 2.1944197763631634, + "learning_rate": 1.0560384222721943e-05, + "loss": 0.2469, + "step": 6282 + }, + { + "epoch": 0.4977619330560507, + "grad_norm": 1.7035911251232576, + "learning_rate": 1.0557822204794353e-05, + "loss": 0.2668, + "step": 6283 + }, + { + "epoch": 0.4978411566646861, + "grad_norm": 1.4342672728497798, + "learning_rate": 1.0555260150136852e-05, + "loss": 0.2024, + "step": 6284 + }, + { + "epoch": 0.4979203802733215, + "grad_norm": 1.6415210042146333, + "learning_rate": 1.0552698058918146e-05, + "loss": 0.1334, + "step": 6285 + }, + { + "epoch": 0.4979996038819568, + "grad_norm": 1.406911814066835, + "learning_rate": 1.055013593130693e-05, + "loss": 0.2135, + "step": 6286 + }, + { + "epoch": 0.4980788274905922, + "grad_norm": 1.518470094211511, + "learning_rate": 1.0547573767471913e-05, + "loss": 0.2229, + "step": 6287 + }, + { + "epoch": 0.49815805109922756, + "grad_norm": 1.4365778785115295, + "learning_rate": 1.0545011567581794e-05, + "loss": 0.1693, + "step": 6288 + }, + { + "epoch": 0.49823727470786294, + "grad_norm": 1.7249480014507745, + "learning_rate": 1.0542449331805287e-05, + "loss": 0.2155, + "step": 6289 + }, + { + "epoch": 0.4983164983164983, + "grad_norm": 1.6063150206212504, + "learning_rate": 1.05398870603111e-05, + "loss": 0.2411, + "step": 6290 + }, + { + "epoch": 0.4983957219251337, + "grad_norm": 1.5541308965441618, + "learning_rate": 1.0537324753267952e-05, + "loss": 0.1631, + "step": 6291 + }, + { + "epoch": 0.4984749455337691, + "grad_norm": 1.6414487467018528, + "learning_rate": 1.053476241084455e-05, + "loss": 0.2322, + "step": 6292 + }, + { + "epoch": 0.4985541691424044, + "grad_norm": 1.2846477326502914, + "learning_rate": 1.0532200033209618e-05, + "loss": 0.1725, + "step": 6293 + }, + { + "epoch": 0.4986333927510398, + "grad_norm": 1.3955416200753925, + "learning_rate": 1.0529637620531876e-05, + "loss": 0.1839, + "step": 6294 + }, + { + "epoch": 0.4987126163596752, + "grad_norm": 2.0885884034288345, + "learning_rate": 1.0527075172980043e-05, + "loss": 0.2279, + "step": 6295 + }, + { + "epoch": 0.49879183996831056, + "grad_norm": 1.9742302365033042, + "learning_rate": 1.0524512690722848e-05, + "loss": 0.3424, + "step": 6296 + }, + { + "epoch": 0.49887106357694594, + "grad_norm": 1.8781039276787217, + "learning_rate": 1.0521950173929017e-05, + "loss": 0.3458, + "step": 6297 + }, + { + "epoch": 0.4989502871855813, + "grad_norm": 1.717886744370777, + "learning_rate": 1.0519387622767274e-05, + "loss": 0.2295, + "step": 6298 + }, + { + "epoch": 0.4990295107942167, + "grad_norm": 1.5564022869339005, + "learning_rate": 1.051682503740636e-05, + "loss": 0.2135, + "step": 6299 + }, + { + "epoch": 0.49910873440285203, + "grad_norm": 1.6977806638522623, + "learning_rate": 1.0514262418015e-05, + "loss": 0.3145, + "step": 6300 + }, + { + "epoch": 0.4991879580114874, + "grad_norm": 1.7819218782039914, + "learning_rate": 1.0511699764761935e-05, + "loss": 0.2523, + "step": 6301 + }, + { + "epoch": 0.4992671816201228, + "grad_norm": 2.1345146772996775, + "learning_rate": 1.0509137077815906e-05, + "loss": 0.3132, + "step": 6302 + }, + { + "epoch": 0.4993464052287582, + "grad_norm": 1.804156608044116, + "learning_rate": 1.0506574357345647e-05, + "loss": 0.2572, + "step": 6303 + }, + { + "epoch": 0.49942562883739355, + "grad_norm": 1.0471112911205103, + "learning_rate": 1.0504011603519904e-05, + "loss": 0.116, + "step": 6304 + }, + { + "epoch": 0.49950485244602894, + "grad_norm": 2.028595283466379, + "learning_rate": 1.0501448816507425e-05, + "loss": 0.2445, + "step": 6305 + }, + { + "epoch": 0.4995840760546643, + "grad_norm": 1.8165845922227482, + "learning_rate": 1.0498885996476952e-05, + "loss": 0.2396, + "step": 6306 + }, + { + "epoch": 0.49966329966329964, + "grad_norm": 2.015380461207272, + "learning_rate": 1.0496323143597237e-05, + "loss": 0.2768, + "step": 6307 + }, + { + "epoch": 0.499742523271935, + "grad_norm": 1.6462508798604103, + "learning_rate": 1.049376025803703e-05, + "loss": 0.2161, + "step": 6308 + }, + { + "epoch": 0.4998217468805704, + "grad_norm": 1.3749269335494156, + "learning_rate": 1.0491197339965087e-05, + "loss": 0.2323, + "step": 6309 + }, + { + "epoch": 0.4999009704892058, + "grad_norm": 1.3425668212128603, + "learning_rate": 1.0488634389550166e-05, + "loss": 0.1674, + "step": 6310 + }, + { + "epoch": 0.49998019409784117, + "grad_norm": 1.6571866474508175, + "learning_rate": 1.0486071406961017e-05, + "loss": 0.1912, + "step": 6311 + }, + { + "epoch": 0.5000594177064765, + "grad_norm": 1.827655913078546, + "learning_rate": 1.0483508392366404e-05, + "loss": 0.2727, + "step": 6312 + }, + { + "epoch": 0.5001386413151119, + "grad_norm": 1.7156013230434588, + "learning_rate": 1.0480945345935094e-05, + "loss": 0.2837, + "step": 6313 + }, + { + "epoch": 0.5002178649237473, + "grad_norm": 1.846517908626761, + "learning_rate": 1.0478382267835843e-05, + "loss": 0.2715, + "step": 6314 + }, + { + "epoch": 0.5002970885323826, + "grad_norm": 1.5305723207481021, + "learning_rate": 1.0475819158237426e-05, + "loss": 0.2572, + "step": 6315 + }, + { + "epoch": 0.5003763121410181, + "grad_norm": 1.9127396577335278, + "learning_rate": 1.0473256017308601e-05, + "loss": 0.2569, + "step": 6316 + }, + { + "epoch": 0.5004555357496534, + "grad_norm": 1.7553345457736045, + "learning_rate": 1.047069284521815e-05, + "loss": 0.2083, + "step": 6317 + }, + { + "epoch": 0.5005347593582887, + "grad_norm": 1.660106564245848, + "learning_rate": 1.0468129642134837e-05, + "loss": 0.2752, + "step": 6318 + }, + { + "epoch": 0.5006139829669242, + "grad_norm": 2.41784987591571, + "learning_rate": 1.046556640822744e-05, + "loss": 0.2469, + "step": 6319 + }, + { + "epoch": 0.5006932065755595, + "grad_norm": 1.4166745255418107, + "learning_rate": 1.0463003143664734e-05, + "loss": 0.2187, + "step": 6320 + }, + { + "epoch": 0.5007724301841949, + "grad_norm": 1.5265887074114244, + "learning_rate": 1.0460439848615502e-05, + "loss": 0.2423, + "step": 6321 + }, + { + "epoch": 0.5008516537928303, + "grad_norm": 1.3372642398829884, + "learning_rate": 1.0457876523248518e-05, + "loss": 0.1326, + "step": 6322 + }, + { + "epoch": 0.5009308774014657, + "grad_norm": 1.8058364796939765, + "learning_rate": 1.0455313167732573e-05, + "loss": 0.2854, + "step": 6323 + }, + { + "epoch": 0.501010101010101, + "grad_norm": 1.5342150322869643, + "learning_rate": 1.0452749782236443e-05, + "loss": 0.2121, + "step": 6324 + }, + { + "epoch": 0.5010893246187363, + "grad_norm": 1.3407703992101323, + "learning_rate": 1.0450186366928917e-05, + "loss": 0.1819, + "step": 6325 + }, + { + "epoch": 0.5011685482273718, + "grad_norm": 1.3182866296450149, + "learning_rate": 1.044762292197879e-05, + "loss": 0.1994, + "step": 6326 + }, + { + "epoch": 0.5012477718360071, + "grad_norm": 1.7854218945014915, + "learning_rate": 1.0445059447554844e-05, + "loss": 0.2794, + "step": 6327 + }, + { + "epoch": 0.5013269954446425, + "grad_norm": 1.6298876965130429, + "learning_rate": 1.0442495943825874e-05, + "loss": 0.2383, + "step": 6328 + }, + { + "epoch": 0.5014062190532779, + "grad_norm": 1.784303345781785, + "learning_rate": 1.0439932410960678e-05, + "loss": 0.2529, + "step": 6329 + }, + { + "epoch": 0.5014854426619133, + "grad_norm": 1.3647047954385347, + "learning_rate": 1.0437368849128046e-05, + "loss": 0.1602, + "step": 6330 + }, + { + "epoch": 0.5015646662705486, + "grad_norm": 1.6015511226112418, + "learning_rate": 1.043480525849678e-05, + "loss": 0.1831, + "step": 6331 + }, + { + "epoch": 0.501643889879184, + "grad_norm": 1.4893033260344972, + "learning_rate": 1.0432241639235686e-05, + "loss": 0.2519, + "step": 6332 + }, + { + "epoch": 0.5017231134878194, + "grad_norm": 1.5525755437047382, + "learning_rate": 1.0429677991513554e-05, + "loss": 0.1652, + "step": 6333 + }, + { + "epoch": 0.5018023370964547, + "grad_norm": 1.6681135791132164, + "learning_rate": 1.0427114315499196e-05, + "loss": 0.2545, + "step": 6334 + }, + { + "epoch": 0.5018815607050902, + "grad_norm": 2.090941206219267, + "learning_rate": 1.0424550611361412e-05, + "loss": 0.2507, + "step": 6335 + }, + { + "epoch": 0.5019607843137255, + "grad_norm": 1.909480806462905, + "learning_rate": 1.0421986879269017e-05, + "loss": 0.308, + "step": 6336 + }, + { + "epoch": 0.5020400079223608, + "grad_norm": 2.4079728072568067, + "learning_rate": 1.0419423119390815e-05, + "loss": 0.2683, + "step": 6337 + }, + { + "epoch": 0.5021192315309962, + "grad_norm": 1.5609549837679797, + "learning_rate": 1.041685933189562e-05, + "loss": 0.1649, + "step": 6338 + }, + { + "epoch": 0.5021984551396316, + "grad_norm": 1.5757878775776288, + "learning_rate": 1.041429551695224e-05, + "loss": 0.2232, + "step": 6339 + }, + { + "epoch": 0.502277678748267, + "grad_norm": 1.5012421595006589, + "learning_rate": 1.0411731674729497e-05, + "loss": 0.238, + "step": 6340 + }, + { + "epoch": 0.5023569023569023, + "grad_norm": 1.362074463931762, + "learning_rate": 1.0409167805396202e-05, + "loss": 0.1397, + "step": 6341 + }, + { + "epoch": 0.5024361259655378, + "grad_norm": 1.7840272250382005, + "learning_rate": 1.040660390912118e-05, + "loss": 0.3561, + "step": 6342 + }, + { + "epoch": 0.5025153495741731, + "grad_norm": 1.79990790691543, + "learning_rate": 1.0404039986073244e-05, + "loss": 0.2492, + "step": 6343 + }, + { + "epoch": 0.5025945731828084, + "grad_norm": 1.7865429314491876, + "learning_rate": 1.0401476036421219e-05, + "loss": 0.2447, + "step": 6344 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 1.6012233207399875, + "learning_rate": 1.039891206033393e-05, + "loss": 0.171, + "step": 6345 + }, + { + "epoch": 0.5027530204000792, + "grad_norm": 1.7536637726753614, + "learning_rate": 1.0396348057980202e-05, + "loss": 0.2535, + "step": 6346 + }, + { + "epoch": 0.5028322440087146, + "grad_norm": 1.6610435307688316, + "learning_rate": 1.0393784029528858e-05, + "loss": 0.2158, + "step": 6347 + }, + { + "epoch": 0.50291146761735, + "grad_norm": 1.6140842049935724, + "learning_rate": 1.0391219975148734e-05, + "loss": 0.3368, + "step": 6348 + }, + { + "epoch": 0.5029906912259854, + "grad_norm": 1.666984643226144, + "learning_rate": 1.0388655895008654e-05, + "loss": 0.214, + "step": 6349 + }, + { + "epoch": 0.5030699148346207, + "grad_norm": 1.840109797374884, + "learning_rate": 1.0386091789277458e-05, + "loss": 0.2698, + "step": 6350 + }, + { + "epoch": 0.503149138443256, + "grad_norm": 1.7346545437050827, + "learning_rate": 1.038352765812397e-05, + "loss": 0.2612, + "step": 6351 + }, + { + "epoch": 0.5032283620518915, + "grad_norm": 1.3521963542225746, + "learning_rate": 1.0380963501717034e-05, + "loss": 0.2983, + "step": 6352 + }, + { + "epoch": 0.5033075856605268, + "grad_norm": 2.1323357641259952, + "learning_rate": 1.0378399320225486e-05, + "loss": 0.2248, + "step": 6353 + }, + { + "epoch": 0.5033868092691622, + "grad_norm": 1.416993577424726, + "learning_rate": 1.037583511381816e-05, + "loss": 0.1824, + "step": 6354 + }, + { + "epoch": 0.5034660328777976, + "grad_norm": 1.6675615195126514, + "learning_rate": 1.0373270882663899e-05, + "loss": 0.3483, + "step": 6355 + }, + { + "epoch": 0.503545256486433, + "grad_norm": 1.7897076886151495, + "learning_rate": 1.0370706626931553e-05, + "loss": 0.283, + "step": 6356 + }, + { + "epoch": 0.5036244800950683, + "grad_norm": 1.429445325993827, + "learning_rate": 1.0368142346789954e-05, + "loss": 0.216, + "step": 6357 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 1.8603280964120084, + "learning_rate": 1.0365578042407956e-05, + "loss": 0.2864, + "step": 6358 + }, + { + "epoch": 0.5037829273123391, + "grad_norm": 1.7968005399613283, + "learning_rate": 1.03630137139544e-05, + "loss": 0.2399, + "step": 6359 + }, + { + "epoch": 0.5038621509209744, + "grad_norm": 1.7691777966579632, + "learning_rate": 1.0360449361598137e-05, + "loss": 0.2415, + "step": 6360 + }, + { + "epoch": 0.5039413745296099, + "grad_norm": 1.7441744503718601, + "learning_rate": 1.0357884985508022e-05, + "loss": 0.2448, + "step": 6361 + }, + { + "epoch": 0.5040205981382452, + "grad_norm": 2.1249670969669716, + "learning_rate": 1.03553205858529e-05, + "loss": 0.3841, + "step": 6362 + }, + { + "epoch": 0.5040998217468806, + "grad_norm": 2.2016152688141992, + "learning_rate": 1.0352756162801626e-05, + "loss": 0.3419, + "step": 6363 + }, + { + "epoch": 0.5041790453555159, + "grad_norm": 1.5333408621875049, + "learning_rate": 1.035019171652306e-05, + "loss": 0.2508, + "step": 6364 + }, + { + "epoch": 0.5042582689641513, + "grad_norm": 1.247294378243989, + "learning_rate": 1.0347627247186053e-05, + "loss": 0.177, + "step": 6365 + }, + { + "epoch": 0.5043374925727867, + "grad_norm": 1.7516479384347678, + "learning_rate": 1.0345062754959463e-05, + "loss": 0.3175, + "step": 6366 + }, + { + "epoch": 0.504416716181422, + "grad_norm": 1.6041067932546424, + "learning_rate": 1.0342498240012153e-05, + "loss": 0.2357, + "step": 6367 + }, + { + "epoch": 0.5044959397900575, + "grad_norm": 1.3959411332554874, + "learning_rate": 1.0339933702512978e-05, + "loss": 0.2041, + "step": 6368 + }, + { + "epoch": 0.5045751633986928, + "grad_norm": 1.3170003386496107, + "learning_rate": 1.0337369142630808e-05, + "loss": 0.1721, + "step": 6369 + }, + { + "epoch": 0.5046543870073282, + "grad_norm": 1.3809195771000535, + "learning_rate": 1.0334804560534504e-05, + "loss": 0.21, + "step": 6370 + }, + { + "epoch": 0.5047336106159636, + "grad_norm": 1.5716623222593202, + "learning_rate": 1.0332239956392926e-05, + "loss": 0.2344, + "step": 6371 + }, + { + "epoch": 0.5048128342245989, + "grad_norm": 1.7380172335496857, + "learning_rate": 1.032967533037495e-05, + "loss": 0.2536, + "step": 6372 + }, + { + "epoch": 0.5048920578332343, + "grad_norm": 1.4957415004431371, + "learning_rate": 1.0327110682649436e-05, + "loss": 0.2212, + "step": 6373 + }, + { + "epoch": 0.5049712814418696, + "grad_norm": 1.517018916348201, + "learning_rate": 1.0324546013385258e-05, + "loss": 0.1815, + "step": 6374 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 2.2578757051998917, + "learning_rate": 1.0321981322751291e-05, + "loss": 0.2644, + "step": 6375 + }, + { + "epoch": 0.5051297286591404, + "grad_norm": 1.6413500872988736, + "learning_rate": 1.03194166109164e-05, + "loss": 0.2902, + "step": 6376 + }, + { + "epoch": 0.5052089522677758, + "grad_norm": 1.6785424842887002, + "learning_rate": 1.0316851878049465e-05, + "loss": 0.1986, + "step": 6377 + }, + { + "epoch": 0.5052881758764112, + "grad_norm": 1.671676268887419, + "learning_rate": 1.0314287124319353e-05, + "loss": 0.2984, + "step": 6378 + }, + { + "epoch": 0.5053673994850465, + "grad_norm": 1.499333670173624, + "learning_rate": 1.031172234989495e-05, + "loss": 0.2448, + "step": 6379 + }, + { + "epoch": 0.5054466230936819, + "grad_norm": 1.8918815385312764, + "learning_rate": 1.030915755494513e-05, + "loss": 0.3522, + "step": 6380 + }, + { + "epoch": 0.5055258467023173, + "grad_norm": 1.8474202065018004, + "learning_rate": 1.030659273963877e-05, + "loss": 0.3227, + "step": 6381 + }, + { + "epoch": 0.5056050703109527, + "grad_norm": 1.7072562151528647, + "learning_rate": 1.0304027904144756e-05, + "loss": 0.2689, + "step": 6382 + }, + { + "epoch": 0.505684293919588, + "grad_norm": 1.7229229482924233, + "learning_rate": 1.0301463048631968e-05, + "loss": 0.3086, + "step": 6383 + }, + { + "epoch": 0.5057635175282235, + "grad_norm": 1.5392113989401297, + "learning_rate": 1.0298898173269285e-05, + "loss": 0.1874, + "step": 6384 + }, + { + "epoch": 0.5058427411368588, + "grad_norm": 1.7649293517453457, + "learning_rate": 1.0296333278225599e-05, + "loss": 0.1865, + "step": 6385 + }, + { + "epoch": 0.5059219647454941, + "grad_norm": 1.3623992651479446, + "learning_rate": 1.0293768363669791e-05, + "loss": 0.1731, + "step": 6386 + }, + { + "epoch": 0.5060011883541295, + "grad_norm": 2.0554672693123064, + "learning_rate": 1.0291203429770749e-05, + "loss": 0.241, + "step": 6387 + }, + { + "epoch": 0.5060804119627649, + "grad_norm": 1.8977700782921658, + "learning_rate": 1.0288638476697365e-05, + "loss": 0.2363, + "step": 6388 + }, + { + "epoch": 0.5061596355714003, + "grad_norm": 1.8018294158848267, + "learning_rate": 1.0286073504618524e-05, + "loss": 0.2462, + "step": 6389 + }, + { + "epoch": 0.5062388591800356, + "grad_norm": 1.5086785074066884, + "learning_rate": 1.0283508513703118e-05, + "loss": 0.2675, + "step": 6390 + }, + { + "epoch": 0.5063180827886711, + "grad_norm": 1.7316826770681022, + "learning_rate": 1.0280943504120045e-05, + "loss": 0.2275, + "step": 6391 + }, + { + "epoch": 0.5063973063973064, + "grad_norm": 1.6406645468220442, + "learning_rate": 1.027837847603819e-05, + "loss": 0.24, + "step": 6392 + }, + { + "epoch": 0.5064765300059417, + "grad_norm": 1.7896699181154339, + "learning_rate": 1.0275813429626456e-05, + "loss": 0.1906, + "step": 6393 + }, + { + "epoch": 0.5065557536145772, + "grad_norm": 1.6423856572343716, + "learning_rate": 1.027324836505373e-05, + "loss": 0.3236, + "step": 6394 + }, + { + "epoch": 0.5066349772232125, + "grad_norm": 1.7250301833281598, + "learning_rate": 1.0270683282488913e-05, + "loss": 0.2207, + "step": 6395 + }, + { + "epoch": 0.5067142008318479, + "grad_norm": 1.5731810279847769, + "learning_rate": 1.026811818210091e-05, + "loss": 0.1528, + "step": 6396 + }, + { + "epoch": 0.5067934244404833, + "grad_norm": 1.4487655673473139, + "learning_rate": 1.0265553064058612e-05, + "loss": 0.1318, + "step": 6397 + }, + { + "epoch": 0.5068726480491187, + "grad_norm": 1.5064157461612981, + "learning_rate": 1.0262987928530921e-05, + "loss": 0.206, + "step": 6398 + }, + { + "epoch": 0.506951871657754, + "grad_norm": 1.6435504834148675, + "learning_rate": 1.0260422775686743e-05, + "loss": 0.3464, + "step": 6399 + }, + { + "epoch": 0.5070310952663893, + "grad_norm": 1.761862850803852, + "learning_rate": 1.0257857605694976e-05, + "loss": 0.1726, + "step": 6400 + }, + { + "epoch": 0.5071103188750248, + "grad_norm": 1.5048964862319045, + "learning_rate": 1.025529241872453e-05, + "loss": 0.1936, + "step": 6401 + }, + { + "epoch": 0.5071895424836601, + "grad_norm": 1.394518329988283, + "learning_rate": 1.0252727214944302e-05, + "loss": 0.1708, + "step": 6402 + }, + { + "epoch": 0.5072687660922955, + "grad_norm": 1.7545048642906012, + "learning_rate": 1.0250161994523205e-05, + "loss": 0.2023, + "step": 6403 + }, + { + "epoch": 0.5073479897009309, + "grad_norm": 1.5467169383848067, + "learning_rate": 1.0247596757630147e-05, + "loss": 0.1865, + "step": 6404 + }, + { + "epoch": 0.5074272133095663, + "grad_norm": 1.6518271012331747, + "learning_rate": 1.0245031504434032e-05, + "loss": 0.203, + "step": 6405 + }, + { + "epoch": 0.5075064369182016, + "grad_norm": 2.216923039993798, + "learning_rate": 1.024246623510377e-05, + "loss": 0.3118, + "step": 6406 + }, + { + "epoch": 0.507585660526837, + "grad_norm": 2.011569551005349, + "learning_rate": 1.0239900949808274e-05, + "loss": 0.3412, + "step": 6407 + }, + { + "epoch": 0.5076648841354724, + "grad_norm": 1.552065811383536, + "learning_rate": 1.0237335648716456e-05, + "loss": 0.2009, + "step": 6408 + }, + { + "epoch": 0.5077441077441077, + "grad_norm": 2.092905558406769, + "learning_rate": 1.0234770331997224e-05, + "loss": 0.2606, + "step": 6409 + }, + { + "epoch": 0.5078233313527432, + "grad_norm": 1.334211172803728, + "learning_rate": 1.02322049998195e-05, + "loss": 0.2531, + "step": 6410 + }, + { + "epoch": 0.5079025549613785, + "grad_norm": 1.4945791457771147, + "learning_rate": 1.022963965235219e-05, + "loss": 0.1663, + "step": 6411 + }, + { + "epoch": 0.5079817785700138, + "grad_norm": 1.6558749508619934, + "learning_rate": 1.0227074289764216e-05, + "loss": 0.2476, + "step": 6412 + }, + { + "epoch": 0.5080610021786492, + "grad_norm": 1.7620882004001486, + "learning_rate": 1.0224508912224491e-05, + "loss": 0.2317, + "step": 6413 + }, + { + "epoch": 0.5081402257872846, + "grad_norm": 1.7345229090289764, + "learning_rate": 1.0221943519901935e-05, + "loss": 0.2235, + "step": 6414 + }, + { + "epoch": 0.50821944939592, + "grad_norm": 1.6271777181032696, + "learning_rate": 1.0219378112965468e-05, + "loss": 0.2814, + "step": 6415 + }, + { + "epoch": 0.5082986730045553, + "grad_norm": 1.468712072624976, + "learning_rate": 1.0216812691584005e-05, + "loss": 0.1843, + "step": 6416 + }, + { + "epoch": 0.5083778966131908, + "grad_norm": 1.4048311497207364, + "learning_rate": 1.021424725592647e-05, + "loss": 0.1714, + "step": 6417 + }, + { + "epoch": 0.5084571202218261, + "grad_norm": 1.5878285725090504, + "learning_rate": 1.0211681806161787e-05, + "loss": 0.2162, + "step": 6418 + }, + { + "epoch": 0.5085363438304614, + "grad_norm": 2.0226357300503603, + "learning_rate": 1.0209116342458872e-05, + "loss": 0.2872, + "step": 6419 + }, + { + "epoch": 0.5086155674390969, + "grad_norm": 1.6647916303425558, + "learning_rate": 1.0206550864986656e-05, + "loss": 0.2014, + "step": 6420 + }, + { + "epoch": 0.5086947910477322, + "grad_norm": 1.7010751940968944, + "learning_rate": 1.0203985373914056e-05, + "loss": 0.2903, + "step": 6421 + }, + { + "epoch": 0.5087740146563676, + "grad_norm": 1.871187061792545, + "learning_rate": 1.0201419869410001e-05, + "loss": 0.242, + "step": 6422 + }, + { + "epoch": 0.508853238265003, + "grad_norm": 1.558602497284484, + "learning_rate": 1.0198854351643416e-05, + "loss": 0.2132, + "step": 6423 + }, + { + "epoch": 0.5089324618736384, + "grad_norm": 1.8563580399794568, + "learning_rate": 1.0196288820783232e-05, + "loss": 0.2735, + "step": 6424 + }, + { + "epoch": 0.5090116854822737, + "grad_norm": 1.42370795225079, + "learning_rate": 1.0193723276998371e-05, + "loss": 0.2022, + "step": 6425 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 1.9281495104563846, + "learning_rate": 1.0191157720457765e-05, + "loss": 0.292, + "step": 6426 + }, + { + "epoch": 0.5091701326995445, + "grad_norm": 1.5901155472651172, + "learning_rate": 1.0188592151330343e-05, + "loss": 0.1882, + "step": 6427 + }, + { + "epoch": 0.5092493563081798, + "grad_norm": 1.5890069662185857, + "learning_rate": 1.0186026569785037e-05, + "loss": 0.1839, + "step": 6428 + }, + { + "epoch": 0.5093285799168152, + "grad_norm": 1.8886193273457028, + "learning_rate": 1.0183460975990773e-05, + "loss": 0.3836, + "step": 6429 + }, + { + "epoch": 0.5094078035254506, + "grad_norm": 2.281804303827039, + "learning_rate": 1.0180895370116488e-05, + "loss": 0.2251, + "step": 6430 + }, + { + "epoch": 0.509487027134086, + "grad_norm": 1.5691845024170556, + "learning_rate": 1.0178329752331116e-05, + "loss": 0.2219, + "step": 6431 + }, + { + "epoch": 0.5095662507427213, + "grad_norm": 1.508113269730381, + "learning_rate": 1.0175764122803584e-05, + "loss": 0.2213, + "step": 6432 + }, + { + "epoch": 0.5096454743513567, + "grad_norm": 1.3733557420376317, + "learning_rate": 1.017319848170283e-05, + "loss": 0.2069, + "step": 6433 + }, + { + "epoch": 0.5097246979599921, + "grad_norm": 1.6388112085454838, + "learning_rate": 1.0170632829197792e-05, + "loss": 0.282, + "step": 6434 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 1.573158355579066, + "learning_rate": 1.0168067165457403e-05, + "loss": 0.1994, + "step": 6435 + }, + { + "epoch": 0.5098831451772629, + "grad_norm": 1.7446573717928093, + "learning_rate": 1.01655014906506e-05, + "loss": 0.2512, + "step": 6436 + }, + { + "epoch": 0.5099623687858982, + "grad_norm": 1.9753472245369228, + "learning_rate": 1.016293580494632e-05, + "loss": 0.2256, + "step": 6437 + }, + { + "epoch": 0.5100415923945336, + "grad_norm": 1.8120790617315887, + "learning_rate": 1.0160370108513497e-05, + "loss": 0.2942, + "step": 6438 + }, + { + "epoch": 0.5101208160031689, + "grad_norm": 1.5463563927654693, + "learning_rate": 1.015780440152108e-05, + "loss": 0.196, + "step": 6439 + }, + { + "epoch": 0.5102000396118043, + "grad_norm": 1.993879172981582, + "learning_rate": 1.0155238684138e-05, + "loss": 0.2418, + "step": 6440 + }, + { + "epoch": 0.5102792632204397, + "grad_norm": 1.473014134080035, + "learning_rate": 1.0152672956533198e-05, + "loss": 0.1642, + "step": 6441 + }, + { + "epoch": 0.510358486829075, + "grad_norm": 1.5195950134969411, + "learning_rate": 1.015010721887562e-05, + "loss": 0.1991, + "step": 6442 + }, + { + "epoch": 0.5104377104377105, + "grad_norm": 1.4990310244649965, + "learning_rate": 1.0147541471334204e-05, + "loss": 0.1822, + "step": 6443 + }, + { + "epoch": 0.5105169340463458, + "grad_norm": 1.5539345408551732, + "learning_rate": 1.0144975714077889e-05, + "loss": 0.2007, + "step": 6444 + }, + { + "epoch": 0.5105961576549812, + "grad_norm": 1.886014078203292, + "learning_rate": 1.0142409947275621e-05, + "loss": 0.3137, + "step": 6445 + }, + { + "epoch": 0.5106753812636166, + "grad_norm": 1.9082891268777655, + "learning_rate": 1.0139844171096345e-05, + "loss": 0.2721, + "step": 6446 + }, + { + "epoch": 0.5107546048722519, + "grad_norm": 1.7820543694869206, + "learning_rate": 1.0137278385709004e-05, + "loss": 0.2137, + "step": 6447 + }, + { + "epoch": 0.5108338284808873, + "grad_norm": 1.4216125943418878, + "learning_rate": 1.0134712591282539e-05, + "loss": 0.2321, + "step": 6448 + }, + { + "epoch": 0.5109130520895226, + "grad_norm": 1.4828076788256601, + "learning_rate": 1.0132146787985898e-05, + "loss": 0.222, + "step": 6449 + }, + { + "epoch": 0.5109922756981581, + "grad_norm": 1.6226094682086722, + "learning_rate": 1.0129580975988029e-05, + "loss": 0.2279, + "step": 6450 + }, + { + "epoch": 0.5110714993067934, + "grad_norm": 1.4555474390002656, + "learning_rate": 1.0127015155457875e-05, + "loss": 0.2145, + "step": 6451 + }, + { + "epoch": 0.5111507229154288, + "grad_norm": 1.957626993857304, + "learning_rate": 1.0124449326564383e-05, + "loss": 0.2467, + "step": 6452 + }, + { + "epoch": 0.5112299465240642, + "grad_norm": 1.5258791194181638, + "learning_rate": 1.0121883489476505e-05, + "loss": 0.2057, + "step": 6453 + }, + { + "epoch": 0.5113091701326995, + "grad_norm": 2.258191570728336, + "learning_rate": 1.0119317644363182e-05, + "loss": 0.2725, + "step": 6454 + }, + { + "epoch": 0.5113883937413349, + "grad_norm": 1.4826078654861197, + "learning_rate": 1.0116751791393371e-05, + "loss": 0.1997, + "step": 6455 + }, + { + "epoch": 0.5114676173499703, + "grad_norm": 1.927519351867951, + "learning_rate": 1.011418593073601e-05, + "loss": 0.2836, + "step": 6456 + }, + { + "epoch": 0.5115468409586057, + "grad_norm": 1.5548591916658565, + "learning_rate": 1.0111620062560059e-05, + "loss": 0.1915, + "step": 6457 + }, + { + "epoch": 0.511626064567241, + "grad_norm": 1.630621271094443, + "learning_rate": 1.0109054187034463e-05, + "loss": 0.2337, + "step": 6458 + }, + { + "epoch": 0.5117052881758765, + "grad_norm": 1.5688562493583613, + "learning_rate": 1.0106488304328175e-05, + "loss": 0.2321, + "step": 6459 + }, + { + "epoch": 0.5117845117845118, + "grad_norm": 1.5215635271519548, + "learning_rate": 1.010392241461014e-05, + "loss": 0.1617, + "step": 6460 + }, + { + "epoch": 0.5118637353931471, + "grad_norm": 1.6455981623625469, + "learning_rate": 1.010135651804932e-05, + "loss": 0.251, + "step": 6461 + }, + { + "epoch": 0.5119429590017825, + "grad_norm": 1.7908248150126298, + "learning_rate": 1.0098790614814658e-05, + "loss": 0.2544, + "step": 6462 + }, + { + "epoch": 0.5120221826104179, + "grad_norm": 1.6203023694841237, + "learning_rate": 1.009622470507511e-05, + "loss": 0.3074, + "step": 6463 + }, + { + "epoch": 0.5121014062190533, + "grad_norm": 1.748613987489867, + "learning_rate": 1.0093658788999628e-05, + "loss": 0.2823, + "step": 6464 + }, + { + "epoch": 0.5121806298276886, + "grad_norm": 1.7982651670894434, + "learning_rate": 1.0091092866757164e-05, + "loss": 0.2229, + "step": 6465 + }, + { + "epoch": 0.5122598534363241, + "grad_norm": 1.7470309445326715, + "learning_rate": 1.0088526938516676e-05, + "loss": 0.1567, + "step": 6466 + }, + { + "epoch": 0.5123390770449594, + "grad_norm": 1.368737317056197, + "learning_rate": 1.0085961004447114e-05, + "loss": 0.203, + "step": 6467 + }, + { + "epoch": 0.5124183006535947, + "grad_norm": 1.6142680754896392, + "learning_rate": 1.0083395064717429e-05, + "loss": 0.2096, + "step": 6468 + }, + { + "epoch": 0.5124975242622302, + "grad_norm": 1.398851658263601, + "learning_rate": 1.0080829119496587e-05, + "loss": 0.2239, + "step": 6469 + }, + { + "epoch": 0.5125767478708655, + "grad_norm": 1.476244120589003, + "learning_rate": 1.0078263168953532e-05, + "loss": 0.2374, + "step": 6470 + }, + { + "epoch": 0.5126559714795009, + "grad_norm": 1.8834724833440701, + "learning_rate": 1.0075697213257227e-05, + "loss": 0.2132, + "step": 6471 + }, + { + "epoch": 0.5127351950881363, + "grad_norm": 1.5999307775822558, + "learning_rate": 1.0073131252576622e-05, + "loss": 0.201, + "step": 6472 + }, + { + "epoch": 0.5128144186967717, + "grad_norm": 1.6201269162385037, + "learning_rate": 1.0070565287080676e-05, + "loss": 0.2692, + "step": 6473 + }, + { + "epoch": 0.512893642305407, + "grad_norm": 2.179901360762801, + "learning_rate": 1.0067999316938348e-05, + "loss": 0.3759, + "step": 6474 + }, + { + "epoch": 0.5129728659140423, + "grad_norm": 1.6245384222977508, + "learning_rate": 1.006543334231859e-05, + "loss": 0.2696, + "step": 6475 + }, + { + "epoch": 0.5130520895226778, + "grad_norm": 1.6588856658611313, + "learning_rate": 1.0062867363390361e-05, + "loss": 0.2018, + "step": 6476 + }, + { + "epoch": 0.5131313131313131, + "grad_norm": 1.4462668490416088, + "learning_rate": 1.0060301380322622e-05, + "loss": 0.274, + "step": 6477 + }, + { + "epoch": 0.5132105367399485, + "grad_norm": 1.7701758979904199, + "learning_rate": 1.0057735393284322e-05, + "loss": 0.2409, + "step": 6478 + }, + { + "epoch": 0.5132897603485839, + "grad_norm": 1.8788610060010669, + "learning_rate": 1.0055169402444429e-05, + "loss": 0.2684, + "step": 6479 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 1.651947366616138, + "learning_rate": 1.0052603407971892e-05, + "loss": 0.2606, + "step": 6480 + }, + { + "epoch": 0.5134482075658546, + "grad_norm": 1.4559351542456724, + "learning_rate": 1.0050037410035676e-05, + "loss": 0.1859, + "step": 6481 + }, + { + "epoch": 0.51352743117449, + "grad_norm": 1.7119476031427558, + "learning_rate": 1.004747140880474e-05, + "loss": 0.2641, + "step": 6482 + }, + { + "epoch": 0.5136066547831254, + "grad_norm": 1.9305920148387345, + "learning_rate": 1.0044905404448037e-05, + "loss": 0.2798, + "step": 6483 + }, + { + "epoch": 0.5136858783917607, + "grad_norm": 1.438286786375271, + "learning_rate": 1.0042339397134528e-05, + "loss": 0.2061, + "step": 6484 + }, + { + "epoch": 0.5137651020003962, + "grad_norm": 1.6862281209284795, + "learning_rate": 1.0039773387033178e-05, + "loss": 0.2195, + "step": 6485 + }, + { + "epoch": 0.5138443256090315, + "grad_norm": 1.9216246369267767, + "learning_rate": 1.0037207374312936e-05, + "loss": 0.2356, + "step": 6486 + }, + { + "epoch": 0.5139235492176669, + "grad_norm": 2.1234420696718534, + "learning_rate": 1.003464135914277e-05, + "loss": 0.2915, + "step": 6487 + }, + { + "epoch": 0.5140027728263022, + "grad_norm": 1.9906207243768332, + "learning_rate": 1.0032075341691639e-05, + "loss": 0.276, + "step": 6488 + }, + { + "epoch": 0.5140819964349376, + "grad_norm": 1.6081362312171994, + "learning_rate": 1.0029509322128499e-05, + "loss": 0.1747, + "step": 6489 + }, + { + "epoch": 0.514161220043573, + "grad_norm": 2.2938857165541737, + "learning_rate": 1.0026943300622313e-05, + "loss": 0.2342, + "step": 6490 + }, + { + "epoch": 0.5142404436522083, + "grad_norm": 1.3324365747926863, + "learning_rate": 1.0024377277342038e-05, + "loss": 0.1751, + "step": 6491 + }, + { + "epoch": 0.5143196672608438, + "grad_norm": 1.8221635347817944, + "learning_rate": 1.002181125245664e-05, + "loss": 0.3477, + "step": 6492 + }, + { + "epoch": 0.5143988908694791, + "grad_norm": 1.6905355523014922, + "learning_rate": 1.0019245226135075e-05, + "loss": 0.3393, + "step": 6493 + }, + { + "epoch": 0.5144781144781144, + "grad_norm": 1.6086216908061008, + "learning_rate": 1.0016679198546304e-05, + "loss": 0.2739, + "step": 6494 + }, + { + "epoch": 0.5145573380867499, + "grad_norm": 1.648337491237927, + "learning_rate": 1.0014113169859285e-05, + "loss": 0.2466, + "step": 6495 + }, + { + "epoch": 0.5146365616953852, + "grad_norm": 1.9889232952540248, + "learning_rate": 1.0011547140242987e-05, + "loss": 0.2221, + "step": 6496 + }, + { + "epoch": 0.5147157853040206, + "grad_norm": 1.2471124576251928, + "learning_rate": 1.0008981109866363e-05, + "loss": 0.1429, + "step": 6497 + }, + { + "epoch": 0.514795008912656, + "grad_norm": 1.1779611503791618, + "learning_rate": 1.0006415078898377e-05, + "loss": 0.1462, + "step": 6498 + }, + { + "epoch": 0.5148742325212914, + "grad_norm": 1.7394107681046125, + "learning_rate": 1.0003849047507987e-05, + "loss": 0.2431, + "step": 6499 + }, + { + "epoch": 0.5149534561299267, + "grad_norm": 1.6497103485621865, + "learning_rate": 1.0001283015864157e-05, + "loss": 0.2454, + "step": 6500 + }, + { + "epoch": 0.515032679738562, + "grad_norm": 2.154975406255334, + "learning_rate": 9.998716984135847e-06, + "loss": 0.3811, + "step": 6501 + }, + { + "epoch": 0.5151119033471975, + "grad_norm": 1.8248567788152668, + "learning_rate": 9.996150952492018e-06, + "loss": 0.3131, + "step": 6502 + }, + { + "epoch": 0.5151911269558328, + "grad_norm": 1.5164802093096938, + "learning_rate": 9.993584921101628e-06, + "loss": 0.2696, + "step": 6503 + }, + { + "epoch": 0.5152703505644682, + "grad_norm": 1.5755242780437846, + "learning_rate": 9.991018890133642e-06, + "loss": 0.1882, + "step": 6504 + }, + { + "epoch": 0.5153495741731036, + "grad_norm": 1.6795659349338317, + "learning_rate": 9.988452859757017e-06, + "loss": 0.2888, + "step": 6505 + }, + { + "epoch": 0.515428797781739, + "grad_norm": 1.3401355952455904, + "learning_rate": 9.985886830140717e-06, + "loss": 0.1058, + "step": 6506 + }, + { + "epoch": 0.5155080213903743, + "grad_norm": 1.9767542541509284, + "learning_rate": 9.983320801453702e-06, + "loss": 0.2207, + "step": 6507 + }, + { + "epoch": 0.5155872449990097, + "grad_norm": 1.8604908163040235, + "learning_rate": 9.98075477386493e-06, + "loss": 0.2235, + "step": 6508 + }, + { + "epoch": 0.5156664686076451, + "grad_norm": 1.8188640367413023, + "learning_rate": 9.978188747543364e-06, + "loss": 0.2067, + "step": 6509 + }, + { + "epoch": 0.5157456922162804, + "grad_norm": 1.945383034910453, + "learning_rate": 9.975622722657965e-06, + "loss": 0.2682, + "step": 6510 + }, + { + "epoch": 0.5158249158249159, + "grad_norm": 1.6440759120641035, + "learning_rate": 9.973056699377692e-06, + "loss": 0.2008, + "step": 6511 + }, + { + "epoch": 0.5159041394335512, + "grad_norm": 1.5296992790532355, + "learning_rate": 9.970490677871506e-06, + "loss": 0.2041, + "step": 6512 + }, + { + "epoch": 0.5159833630421866, + "grad_norm": 2.2574940211746375, + "learning_rate": 9.967924658308366e-06, + "loss": 0.286, + "step": 6513 + }, + { + "epoch": 0.5160625866508219, + "grad_norm": 1.5644936848412478, + "learning_rate": 9.965358640857231e-06, + "loss": 0.1476, + "step": 6514 + }, + { + "epoch": 0.5161418102594573, + "grad_norm": 1.853619464609191, + "learning_rate": 9.962792625687067e-06, + "loss": 0.2685, + "step": 6515 + }, + { + "epoch": 0.5162210338680927, + "grad_norm": 1.8784718728183394, + "learning_rate": 9.960226612966828e-06, + "loss": 0.2651, + "step": 6516 + }, + { + "epoch": 0.516300257476728, + "grad_norm": 1.584619426456885, + "learning_rate": 9.957660602865477e-06, + "loss": 0.2345, + "step": 6517 + }, + { + "epoch": 0.5163794810853635, + "grad_norm": 1.9929728898305445, + "learning_rate": 9.955094595551968e-06, + "loss": 0.2416, + "step": 6518 + }, + { + "epoch": 0.5164587046939988, + "grad_norm": 1.505456797458076, + "learning_rate": 9.952528591195265e-06, + "loss": 0.1944, + "step": 6519 + }, + { + "epoch": 0.5165379283026342, + "grad_norm": 2.1043779710972372, + "learning_rate": 9.949962589964327e-06, + "loss": 0.2284, + "step": 6520 + }, + { + "epoch": 0.5166171519112696, + "grad_norm": 1.8986433601644745, + "learning_rate": 9.94739659202811e-06, + "loss": 0.2095, + "step": 6521 + }, + { + "epoch": 0.5166963755199049, + "grad_norm": 1.603464216578882, + "learning_rate": 9.944830597555573e-06, + "loss": 0.206, + "step": 6522 + }, + { + "epoch": 0.5167755991285403, + "grad_norm": 1.365060663800126, + "learning_rate": 9.94226460671568e-06, + "loss": 0.1536, + "step": 6523 + }, + { + "epoch": 0.5168548227371756, + "grad_norm": 1.570949045137009, + "learning_rate": 9.939698619677383e-06, + "loss": 0.2344, + "step": 6524 + }, + { + "epoch": 0.5169340463458111, + "grad_norm": 1.58363180586898, + "learning_rate": 9.937132636609642e-06, + "loss": 0.1936, + "step": 6525 + }, + { + "epoch": 0.5170132699544464, + "grad_norm": 1.6625818977250058, + "learning_rate": 9.934566657681412e-06, + "loss": 0.2312, + "step": 6526 + }, + { + "epoch": 0.5170924935630818, + "grad_norm": 1.632393397693878, + "learning_rate": 9.932000683061654e-06, + "loss": 0.2811, + "step": 6527 + }, + { + "epoch": 0.5171717171717172, + "grad_norm": 2.325660313764123, + "learning_rate": 9.929434712919327e-06, + "loss": 0.2905, + "step": 6528 + }, + { + "epoch": 0.5172509407803525, + "grad_norm": 1.9542836586370809, + "learning_rate": 9.926868747423381e-06, + "loss": 0.3113, + "step": 6529 + }, + { + "epoch": 0.5173301643889879, + "grad_norm": 1.4193671361261082, + "learning_rate": 9.924302786742775e-06, + "loss": 0.1913, + "step": 6530 + }, + { + "epoch": 0.5174093879976233, + "grad_norm": 2.0511049777232726, + "learning_rate": 9.92173683104647e-06, + "loss": 0.2859, + "step": 6531 + }, + { + "epoch": 0.5174886116062587, + "grad_norm": 1.2591616892251185, + "learning_rate": 9.919170880503416e-06, + "loss": 0.1377, + "step": 6532 + }, + { + "epoch": 0.517567835214894, + "grad_norm": 1.7607508610627813, + "learning_rate": 9.916604935282573e-06, + "loss": 0.2959, + "step": 6533 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 1.7008477722434605, + "learning_rate": 9.914038995552891e-06, + "loss": 0.2384, + "step": 6534 + }, + { + "epoch": 0.5177262824321648, + "grad_norm": 1.5155639025430583, + "learning_rate": 9.911473061483326e-06, + "loss": 0.1866, + "step": 6535 + }, + { + "epoch": 0.5178055060408001, + "grad_norm": 1.2256016833439958, + "learning_rate": 9.908907133242838e-06, + "loss": 0.1537, + "step": 6536 + }, + { + "epoch": 0.5178847296494355, + "grad_norm": 1.771213535735738, + "learning_rate": 9.906341211000375e-06, + "loss": 0.2421, + "step": 6537 + }, + { + "epoch": 0.5179639532580709, + "grad_norm": 1.7659075221832585, + "learning_rate": 9.903775294924892e-06, + "loss": 0.1875, + "step": 6538 + }, + { + "epoch": 0.5180431768667063, + "grad_norm": 1.3359416558093027, + "learning_rate": 9.901209385185345e-06, + "loss": 0.1674, + "step": 6539 + }, + { + "epoch": 0.5181224004753416, + "grad_norm": 1.9456109291691057, + "learning_rate": 9.898643481950683e-06, + "loss": 0.2839, + "step": 6540 + }, + { + "epoch": 0.5182016240839771, + "grad_norm": 1.526768999538284, + "learning_rate": 9.89607758538986e-06, + "loss": 0.2095, + "step": 6541 + }, + { + "epoch": 0.5182808476926124, + "grad_norm": 1.941398953329438, + "learning_rate": 9.893511695671828e-06, + "loss": 0.2512, + "step": 6542 + }, + { + "epoch": 0.5183600713012477, + "grad_norm": 1.5815811107710815, + "learning_rate": 9.890945812965538e-06, + "loss": 0.1874, + "step": 6543 + }, + { + "epoch": 0.5184392949098832, + "grad_norm": 1.7060226776628804, + "learning_rate": 9.888379937439944e-06, + "loss": 0.2101, + "step": 6544 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 1.5443405070785197, + "learning_rate": 9.885814069263991e-06, + "loss": 0.1507, + "step": 6545 + }, + { + "epoch": 0.5185977421271539, + "grad_norm": 1.823041231258554, + "learning_rate": 9.883248208606632e-06, + "loss": 0.2526, + "step": 6546 + }, + { + "epoch": 0.5186769657357893, + "grad_norm": 1.6248133547875379, + "learning_rate": 9.880682355636821e-06, + "loss": 0.2107, + "step": 6547 + }, + { + "epoch": 0.5187561893444247, + "grad_norm": 1.8220732762870397, + "learning_rate": 9.878116510523498e-06, + "loss": 0.2527, + "step": 6548 + }, + { + "epoch": 0.51883541295306, + "grad_norm": 1.3542736862929772, + "learning_rate": 9.87555067343562e-06, + "loss": 0.1861, + "step": 6549 + }, + { + "epoch": 0.5189146365616953, + "grad_norm": 2.4472864387334843, + "learning_rate": 9.872984844542128e-06, + "loss": 0.2237, + "step": 6550 + }, + { + "epoch": 0.5189938601703308, + "grad_norm": 2.00422345654755, + "learning_rate": 9.870419024011973e-06, + "loss": 0.3057, + "step": 6551 + }, + { + "epoch": 0.5190730837789661, + "grad_norm": 1.71440329717328, + "learning_rate": 9.867853212014104e-06, + "loss": 0.1759, + "step": 6552 + }, + { + "epoch": 0.5191523073876015, + "grad_norm": 1.4883916148667324, + "learning_rate": 9.865287408717464e-06, + "loss": 0.2014, + "step": 6553 + }, + { + "epoch": 0.5192315309962369, + "grad_norm": 1.5453193744707012, + "learning_rate": 9.862721614291e-06, + "loss": 0.2129, + "step": 6554 + }, + { + "epoch": 0.5193107546048723, + "grad_norm": 1.712344328056902, + "learning_rate": 9.860155828903658e-06, + "loss": 0.2344, + "step": 6555 + }, + { + "epoch": 0.5193899782135076, + "grad_norm": 1.7040747603656183, + "learning_rate": 9.85759005272438e-06, + "loss": 0.197, + "step": 6556 + }, + { + "epoch": 0.519469201822143, + "grad_norm": 1.8715707541439077, + "learning_rate": 9.855024285922114e-06, + "loss": 0.2829, + "step": 6557 + }, + { + "epoch": 0.5195484254307784, + "grad_norm": 1.4763358867820018, + "learning_rate": 9.8524585286658e-06, + "loss": 0.2671, + "step": 6558 + }, + { + "epoch": 0.5196276490394137, + "grad_norm": 1.5999989200747373, + "learning_rate": 9.84989278112438e-06, + "loss": 0.1599, + "step": 6559 + }, + { + "epoch": 0.5197068726480492, + "grad_norm": 1.8424211993894914, + "learning_rate": 9.847327043466802e-06, + "loss": 0.1662, + "step": 6560 + }, + { + "epoch": 0.5197860962566845, + "grad_norm": 1.552895134645817, + "learning_rate": 9.844761315862002e-06, + "loss": 0.2166, + "step": 6561 + }, + { + "epoch": 0.5198653198653199, + "grad_norm": 1.4998254529502497, + "learning_rate": 9.842195598478922e-06, + "loss": 0.2063, + "step": 6562 + }, + { + "epoch": 0.5199445434739552, + "grad_norm": 1.7673422779636208, + "learning_rate": 9.839629891486503e-06, + "loss": 0.1904, + "step": 6563 + }, + { + "epoch": 0.5200237670825906, + "grad_norm": 1.4665955444457102, + "learning_rate": 9.83706419505368e-06, + "loss": 0.2428, + "step": 6564 + }, + { + "epoch": 0.520102990691226, + "grad_norm": 1.6208884192042912, + "learning_rate": 9.834498509349402e-06, + "loss": 0.2357, + "step": 6565 + }, + { + "epoch": 0.5201822142998613, + "grad_norm": 1.5932399929774363, + "learning_rate": 9.831932834542598e-06, + "loss": 0.252, + "step": 6566 + }, + { + "epoch": 0.5202614379084968, + "grad_norm": 2.015415844296962, + "learning_rate": 9.829367170802208e-06, + "loss": 0.3385, + "step": 6567 + }, + { + "epoch": 0.5203406615171321, + "grad_norm": 1.6745329730599214, + "learning_rate": 9.82680151829717e-06, + "loss": 0.2413, + "step": 6568 + }, + { + "epoch": 0.5204198851257675, + "grad_norm": 1.8267065809484035, + "learning_rate": 9.824235877196418e-06, + "loss": 0.2205, + "step": 6569 + }, + { + "epoch": 0.5204991087344029, + "grad_norm": 1.2384389776442932, + "learning_rate": 9.821670247668887e-06, + "loss": 0.2188, + "step": 6570 + }, + { + "epoch": 0.5205783323430382, + "grad_norm": 1.8113765211854829, + "learning_rate": 9.819104629883513e-06, + "loss": 0.1699, + "step": 6571 + }, + { + "epoch": 0.5206575559516736, + "grad_norm": 1.672072546281759, + "learning_rate": 9.816539024009227e-06, + "loss": 0.2521, + "step": 6572 + }, + { + "epoch": 0.520736779560309, + "grad_norm": 1.657661842027711, + "learning_rate": 9.813973430214965e-06, + "loss": 0.1934, + "step": 6573 + }, + { + "epoch": 0.5208160031689444, + "grad_norm": 1.8574733120630307, + "learning_rate": 9.811407848669657e-06, + "loss": 0.3483, + "step": 6574 + }, + { + "epoch": 0.5208952267775797, + "grad_norm": 1.7650044155731195, + "learning_rate": 9.808842279542235e-06, + "loss": 0.237, + "step": 6575 + }, + { + "epoch": 0.520974450386215, + "grad_norm": 1.7481276613970684, + "learning_rate": 9.80627672300163e-06, + "loss": 0.1959, + "step": 6576 + }, + { + "epoch": 0.5210536739948505, + "grad_norm": 1.8051628508398276, + "learning_rate": 9.80371117921677e-06, + "loss": 0.2802, + "step": 6577 + }, + { + "epoch": 0.5211328976034858, + "grad_norm": 1.2362706753648025, + "learning_rate": 9.801145648356585e-06, + "loss": 0.1639, + "step": 6578 + }, + { + "epoch": 0.5212121212121212, + "grad_norm": 1.4328823513851627, + "learning_rate": 9.798580130590004e-06, + "loss": 0.2086, + "step": 6579 + }, + { + "epoch": 0.5212913448207566, + "grad_norm": 1.7484169598594526, + "learning_rate": 9.79601462608595e-06, + "loss": 0.2593, + "step": 6580 + }, + { + "epoch": 0.521370568429392, + "grad_norm": 1.5034539086792649, + "learning_rate": 9.79344913501335e-06, + "loss": 0.2579, + "step": 6581 + }, + { + "epoch": 0.5214497920380273, + "grad_norm": 1.7124036603017196, + "learning_rate": 9.790883657541133e-06, + "loss": 0.219, + "step": 6582 + }, + { + "epoch": 0.5215290156466627, + "grad_norm": 1.9862851399739558, + "learning_rate": 9.788318193838218e-06, + "loss": 0.233, + "step": 6583 + }, + { + "epoch": 0.5216082392552981, + "grad_norm": 1.483207523021381, + "learning_rate": 9.785752744073534e-06, + "loss": 0.1491, + "step": 6584 + }, + { + "epoch": 0.5216874628639334, + "grad_norm": 2.2097305945009804, + "learning_rate": 9.783187308416e-06, + "loss": 0.3521, + "step": 6585 + }, + { + "epoch": 0.5217666864725689, + "grad_norm": 1.4592939064313366, + "learning_rate": 9.780621887034537e-06, + "loss": 0.2081, + "step": 6586 + }, + { + "epoch": 0.5218459100812042, + "grad_norm": 1.558753586449951, + "learning_rate": 9.778056480098068e-06, + "loss": 0.2058, + "step": 6587 + }, + { + "epoch": 0.5219251336898396, + "grad_norm": 1.813196324028363, + "learning_rate": 9.775491087775514e-06, + "loss": 0.2254, + "step": 6588 + }, + { + "epoch": 0.5220043572984749, + "grad_norm": 1.6105331121955642, + "learning_rate": 9.772925710235789e-06, + "loss": 0.2324, + "step": 6589 + }, + { + "epoch": 0.5220835809071103, + "grad_norm": 1.9637127379313561, + "learning_rate": 9.770360347647817e-06, + "loss": 0.3305, + "step": 6590 + }, + { + "epoch": 0.5221628045157457, + "grad_norm": 1.5397209821030116, + "learning_rate": 9.767795000180507e-06, + "loss": 0.1261, + "step": 6591 + }, + { + "epoch": 0.522242028124381, + "grad_norm": 1.817566382058787, + "learning_rate": 9.76522966800278e-06, + "loss": 0.2491, + "step": 6592 + }, + { + "epoch": 0.5223212517330165, + "grad_norm": 2.018775193405404, + "learning_rate": 9.76266435128355e-06, + "loss": 0.386, + "step": 6593 + }, + { + "epoch": 0.5224004753416518, + "grad_norm": 1.8274604206698914, + "learning_rate": 9.76009905019173e-06, + "loss": 0.2382, + "step": 6594 + }, + { + "epoch": 0.5224796989502872, + "grad_norm": 1.7260849909373308, + "learning_rate": 9.757533764896235e-06, + "loss": 0.1918, + "step": 6595 + }, + { + "epoch": 0.5225589225589226, + "grad_norm": 1.318506626886228, + "learning_rate": 9.754968495565973e-06, + "loss": 0.1565, + "step": 6596 + }, + { + "epoch": 0.5226381461675579, + "grad_norm": 1.687609626616036, + "learning_rate": 9.752403242369857e-06, + "loss": 0.2252, + "step": 6597 + }, + { + "epoch": 0.5227173697761933, + "grad_norm": 1.6038874822009532, + "learning_rate": 9.749838005476798e-06, + "loss": 0.1796, + "step": 6598 + }, + { + "epoch": 0.5227965933848286, + "grad_norm": 2.264460955607835, + "learning_rate": 9.7472727850557e-06, + "loss": 0.2401, + "step": 6599 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 1.7600127058743176, + "learning_rate": 9.744707581275473e-06, + "loss": 0.2514, + "step": 6600 + }, + { + "epoch": 0.5229550406020994, + "grad_norm": 1.6454397006077333, + "learning_rate": 9.742142394305026e-06, + "loss": 0.2406, + "step": 6601 + }, + { + "epoch": 0.5230342642107348, + "grad_norm": 1.6536626576896352, + "learning_rate": 9.739577224313258e-06, + "loss": 0.2099, + "step": 6602 + }, + { + "epoch": 0.5231134878193702, + "grad_norm": 1.2611062806032343, + "learning_rate": 9.737012071469082e-06, + "loss": 0.1673, + "step": 6603 + }, + { + "epoch": 0.5231927114280055, + "grad_norm": 1.6670450104890164, + "learning_rate": 9.734446935941392e-06, + "loss": 0.1761, + "step": 6604 + }, + { + "epoch": 0.5232719350366409, + "grad_norm": 1.5716250864543366, + "learning_rate": 9.731881817899092e-06, + "loss": 0.1819, + "step": 6605 + }, + { + "epoch": 0.5233511586452763, + "grad_norm": 1.5426166047086236, + "learning_rate": 9.729316717511088e-06, + "loss": 0.2412, + "step": 6606 + }, + { + "epoch": 0.5234303822539117, + "grad_norm": 1.9558353963362065, + "learning_rate": 9.726751634946272e-06, + "loss": 0.2241, + "step": 6607 + }, + { + "epoch": 0.523509605862547, + "grad_norm": 1.834414036964593, + "learning_rate": 9.724186570373548e-06, + "loss": 0.2642, + "step": 6608 + }, + { + "epoch": 0.5235888294711825, + "grad_norm": 1.5297948985591396, + "learning_rate": 9.721621523961812e-06, + "loss": 0.1945, + "step": 6609 + }, + { + "epoch": 0.5236680530798178, + "grad_norm": 1.6316690372842118, + "learning_rate": 9.719056495879958e-06, + "loss": 0.1967, + "step": 6610 + }, + { + "epoch": 0.5237472766884531, + "grad_norm": 1.712998210634295, + "learning_rate": 9.716491486296883e-06, + "loss": 0.2877, + "step": 6611 + }, + { + "epoch": 0.5238265002970885, + "grad_norm": 1.5298057464968875, + "learning_rate": 9.71392649538148e-06, + "loss": 0.2295, + "step": 6612 + }, + { + "epoch": 0.5239057239057239, + "grad_norm": 1.8130814438376366, + "learning_rate": 9.711361523302638e-06, + "loss": 0.2246, + "step": 6613 + }, + { + "epoch": 0.5239849475143593, + "grad_norm": 1.5509073841618803, + "learning_rate": 9.708796570229253e-06, + "loss": 0.186, + "step": 6614 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 1.4937879691190525, + "learning_rate": 9.706231636330212e-06, + "loss": 0.1683, + "step": 6615 + }, + { + "epoch": 0.5241433947316301, + "grad_norm": 1.7531062992220119, + "learning_rate": 9.703666721774403e-06, + "loss": 0.2508, + "step": 6616 + }, + { + "epoch": 0.5242226183402654, + "grad_norm": 1.575594756517319, + "learning_rate": 9.701101826730718e-06, + "loss": 0.1831, + "step": 6617 + }, + { + "epoch": 0.5243018419489007, + "grad_norm": 1.559798698958206, + "learning_rate": 9.698536951368035e-06, + "loss": 0.2373, + "step": 6618 + }, + { + "epoch": 0.5243810655575362, + "grad_norm": 1.2948045633210499, + "learning_rate": 9.695972095855248e-06, + "loss": 0.1585, + "step": 6619 + }, + { + "epoch": 0.5244602891661715, + "grad_norm": 1.5587486981796865, + "learning_rate": 9.693407260361231e-06, + "loss": 0.1669, + "step": 6620 + }, + { + "epoch": 0.5245395127748069, + "grad_norm": 1.9306859801846217, + "learning_rate": 9.690842445054873e-06, + "loss": 0.2645, + "step": 6621 + }, + { + "epoch": 0.5246187363834423, + "grad_norm": 1.489265346961083, + "learning_rate": 9.688277650105053e-06, + "loss": 0.25, + "step": 6622 + }, + { + "epoch": 0.5246979599920777, + "grad_norm": 1.5738148767507962, + "learning_rate": 9.685712875680649e-06, + "loss": 0.1943, + "step": 6623 + }, + { + "epoch": 0.524777183600713, + "grad_norm": 1.6541461425395876, + "learning_rate": 9.683148121950539e-06, + "loss": 0.2048, + "step": 6624 + }, + { + "epoch": 0.5248564072093483, + "grad_norm": 1.2756810027881804, + "learning_rate": 9.680583389083602e-06, + "loss": 0.1904, + "step": 6625 + }, + { + "epoch": 0.5249356308179838, + "grad_norm": 1.6290058645319117, + "learning_rate": 9.67801867724871e-06, + "loss": 0.2947, + "step": 6626 + }, + { + "epoch": 0.5250148544266191, + "grad_norm": 1.5592634810850434, + "learning_rate": 9.675453986614743e-06, + "loss": 0.2482, + "step": 6627 + }, + { + "epoch": 0.5250940780352545, + "grad_norm": 1.7993948906587451, + "learning_rate": 9.672889317350565e-06, + "loss": 0.2261, + "step": 6628 + }, + { + "epoch": 0.5251733016438899, + "grad_norm": 1.5006090749941114, + "learning_rate": 9.670324669625053e-06, + "loss": 0.1606, + "step": 6629 + }, + { + "epoch": 0.5252525252525253, + "grad_norm": 1.675191994429445, + "learning_rate": 9.667760043607077e-06, + "loss": 0.2044, + "step": 6630 + }, + { + "epoch": 0.5253317488611606, + "grad_norm": 1.3882031316943497, + "learning_rate": 9.6651954394655e-06, + "loss": 0.2, + "step": 6631 + }, + { + "epoch": 0.525410972469796, + "grad_norm": 1.750119177385897, + "learning_rate": 9.662630857369194e-06, + "loss": 0.2777, + "step": 6632 + }, + { + "epoch": 0.5254901960784314, + "grad_norm": 1.5913265087833688, + "learning_rate": 9.660066297487024e-06, + "loss": 0.1686, + "step": 6633 + }, + { + "epoch": 0.5255694196870667, + "grad_norm": 1.2583668423759147, + "learning_rate": 9.65750175998785e-06, + "loss": 0.1765, + "step": 6634 + }, + { + "epoch": 0.5256486432957022, + "grad_norm": 1.5942490905504325, + "learning_rate": 9.65493724504054e-06, + "loss": 0.2177, + "step": 6635 + }, + { + "epoch": 0.5257278669043375, + "grad_norm": 1.93664542991259, + "learning_rate": 9.65237275281395e-06, + "loss": 0.2293, + "step": 6636 + }, + { + "epoch": 0.5258070905129729, + "grad_norm": 1.9715943510944687, + "learning_rate": 9.64980828347694e-06, + "loss": 0.2478, + "step": 6637 + }, + { + "epoch": 0.5258863141216082, + "grad_norm": 1.6583933411282301, + "learning_rate": 9.647243837198375e-06, + "loss": 0.2722, + "step": 6638 + }, + { + "epoch": 0.5259655377302436, + "grad_norm": 1.7626576783291303, + "learning_rate": 9.644679414147102e-06, + "loss": 0.231, + "step": 6639 + }, + { + "epoch": 0.526044761338879, + "grad_norm": 1.8629875495914667, + "learning_rate": 9.64211501449198e-06, + "loss": 0.1849, + "step": 6640 + }, + { + "epoch": 0.5261239849475143, + "grad_norm": 1.8247654282660322, + "learning_rate": 9.639550638401863e-06, + "loss": 0.2797, + "step": 6641 + }, + { + "epoch": 0.5262032085561498, + "grad_norm": 1.8397659809916322, + "learning_rate": 9.6369862860456e-06, + "loss": 0.2822, + "step": 6642 + }, + { + "epoch": 0.5262824321647851, + "grad_norm": 1.4548770472068142, + "learning_rate": 9.634421957592048e-06, + "loss": 0.2004, + "step": 6643 + }, + { + "epoch": 0.5263616557734205, + "grad_norm": 2.611513313692533, + "learning_rate": 9.631857653210048e-06, + "loss": 0.2488, + "step": 6644 + }, + { + "epoch": 0.5264408793820559, + "grad_norm": 1.8321557010262473, + "learning_rate": 9.629293373068449e-06, + "loss": 0.252, + "step": 6645 + }, + { + "epoch": 0.5265201029906912, + "grad_norm": 1.709438947012418, + "learning_rate": 9.626729117336101e-06, + "loss": 0.2659, + "step": 6646 + }, + { + "epoch": 0.5265993265993266, + "grad_norm": 1.572182796128045, + "learning_rate": 9.624164886181841e-06, + "loss": 0.1842, + "step": 6647 + }, + { + "epoch": 0.526678550207962, + "grad_norm": 1.3683525626723403, + "learning_rate": 9.621600679774516e-06, + "loss": 0.231, + "step": 6648 + }, + { + "epoch": 0.5267577738165974, + "grad_norm": 1.7854714537760052, + "learning_rate": 9.619036498282968e-06, + "loss": 0.3232, + "step": 6649 + }, + { + "epoch": 0.5268369974252327, + "grad_norm": 1.4651685607183893, + "learning_rate": 9.61647234187603e-06, + "loss": 0.1906, + "step": 6650 + }, + { + "epoch": 0.526916221033868, + "grad_norm": 1.8337084587744106, + "learning_rate": 9.613908210722546e-06, + "loss": 0.2134, + "step": 6651 + }, + { + "epoch": 0.5269954446425035, + "grad_norm": 1.5941261922743575, + "learning_rate": 9.611344104991346e-06, + "loss": 0.2399, + "step": 6652 + }, + { + "epoch": 0.5270746682511388, + "grad_norm": 1.511790159172521, + "learning_rate": 9.608780024851266e-06, + "loss": 0.2017, + "step": 6653 + }, + { + "epoch": 0.5271538918597742, + "grad_norm": 1.5008775012960016, + "learning_rate": 9.606215970471142e-06, + "loss": 0.1346, + "step": 6654 + }, + { + "epoch": 0.5272331154684096, + "grad_norm": 1.7168626710340569, + "learning_rate": 9.6036519420198e-06, + "loss": 0.2133, + "step": 6655 + }, + { + "epoch": 0.527312339077045, + "grad_norm": 1.5667632846405024, + "learning_rate": 9.601087939666071e-06, + "loss": 0.1479, + "step": 6656 + }, + { + "epoch": 0.5273915626856803, + "grad_norm": 1.7926740422311847, + "learning_rate": 9.598523963578785e-06, + "loss": 0.2803, + "step": 6657 + }, + { + "epoch": 0.5274707862943157, + "grad_norm": 1.6059482067695043, + "learning_rate": 9.595960013926761e-06, + "loss": 0.1931, + "step": 6658 + }, + { + "epoch": 0.5275500099029511, + "grad_norm": 1.5127165446548374, + "learning_rate": 9.593396090878823e-06, + "loss": 0.1644, + "step": 6659 + }, + { + "epoch": 0.5276292335115864, + "grad_norm": 1.4892583823446155, + "learning_rate": 9.590832194603801e-06, + "loss": 0.1836, + "step": 6660 + }, + { + "epoch": 0.5277084571202219, + "grad_norm": 2.497602606422469, + "learning_rate": 9.588268325270506e-06, + "loss": 0.3302, + "step": 6661 + }, + { + "epoch": 0.5277876807288572, + "grad_norm": 1.362954333513193, + "learning_rate": 9.585704483047761e-06, + "loss": 0.1366, + "step": 6662 + }, + { + "epoch": 0.5278669043374926, + "grad_norm": 1.6089875736780563, + "learning_rate": 9.583140668104387e-06, + "loss": 0.2361, + "step": 6663 + }, + { + "epoch": 0.5279461279461279, + "grad_norm": 1.773462069836003, + "learning_rate": 9.58057688060919e-06, + "loss": 0.2527, + "step": 6664 + }, + { + "epoch": 0.5280253515547633, + "grad_norm": 1.8801863525211449, + "learning_rate": 9.578013120730987e-06, + "loss": 0.2627, + "step": 6665 + }, + { + "epoch": 0.5281045751633987, + "grad_norm": 1.5152957647957932, + "learning_rate": 9.575449388638592e-06, + "loss": 0.1907, + "step": 6666 + }, + { + "epoch": 0.528183798772034, + "grad_norm": 1.8036361744117901, + "learning_rate": 9.57288568450081e-06, + "loss": 0.3015, + "step": 6667 + }, + { + "epoch": 0.5282630223806695, + "grad_norm": 1.8838291322583762, + "learning_rate": 9.570322008486453e-06, + "loss": 0.2451, + "step": 6668 + }, + { + "epoch": 0.5283422459893048, + "grad_norm": 1.5034239764256554, + "learning_rate": 9.567758360764321e-06, + "loss": 0.2089, + "step": 6669 + }, + { + "epoch": 0.5284214695979402, + "grad_norm": 1.6195194920710483, + "learning_rate": 9.565194741503221e-06, + "loss": 0.2313, + "step": 6670 + }, + { + "epoch": 0.5285006932065756, + "grad_norm": 1.5463357780081222, + "learning_rate": 9.562631150871959e-06, + "loss": 0.2487, + "step": 6671 + }, + { + "epoch": 0.5285799168152109, + "grad_norm": 1.4321370214688554, + "learning_rate": 9.560067589039327e-06, + "loss": 0.1816, + "step": 6672 + }, + { + "epoch": 0.5286591404238463, + "grad_norm": 1.4905937188873857, + "learning_rate": 9.55750405617413e-06, + "loss": 0.1569, + "step": 6673 + }, + { + "epoch": 0.5287383640324816, + "grad_norm": 2.3833644922495125, + "learning_rate": 9.554940552445161e-06, + "loss": 0.2216, + "step": 6674 + }, + { + "epoch": 0.5288175876411171, + "grad_norm": 1.661038410211544, + "learning_rate": 9.552377078021215e-06, + "loss": 0.2827, + "step": 6675 + }, + { + "epoch": 0.5288968112497524, + "grad_norm": 1.3943359011090264, + "learning_rate": 9.549813633071085e-06, + "loss": 0.2179, + "step": 6676 + }, + { + "epoch": 0.5289760348583878, + "grad_norm": 1.7915763129053264, + "learning_rate": 9.54725021776356e-06, + "loss": 0.2427, + "step": 6677 + }, + { + "epoch": 0.5290552584670232, + "grad_norm": 2.2491499294279307, + "learning_rate": 9.54468683226743e-06, + "loss": 0.2505, + "step": 6678 + }, + { + "epoch": 0.5291344820756585, + "grad_norm": 1.4029439571383027, + "learning_rate": 9.542123476751484e-06, + "loss": 0.1428, + "step": 6679 + }, + { + "epoch": 0.5292137056842939, + "grad_norm": 1.632703332472555, + "learning_rate": 9.5395601513845e-06, + "loss": 0.1874, + "step": 6680 + }, + { + "epoch": 0.5292929292929293, + "grad_norm": 2.1049649496709404, + "learning_rate": 9.536996856335269e-06, + "loss": 0.2929, + "step": 6681 + }, + { + "epoch": 0.5293721529015647, + "grad_norm": 1.8296071181548896, + "learning_rate": 9.534433591772562e-06, + "loss": 0.2617, + "step": 6682 + }, + { + "epoch": 0.5294513765102, + "grad_norm": 1.9450150476803743, + "learning_rate": 9.531870357865165e-06, + "loss": 0.2145, + "step": 6683 + }, + { + "epoch": 0.5295306001188355, + "grad_norm": 1.47473320876335, + "learning_rate": 9.529307154781855e-06, + "loss": 0.1979, + "step": 6684 + }, + { + "epoch": 0.5296098237274708, + "grad_norm": 1.338591728638656, + "learning_rate": 9.5267439826914e-06, + "loss": 0.1645, + "step": 6685 + }, + { + "epoch": 0.5296890473361061, + "grad_norm": 1.931847108495125, + "learning_rate": 9.524180841762577e-06, + "loss": 0.2417, + "step": 6686 + }, + { + "epoch": 0.5297682709447415, + "grad_norm": 2.210729370265415, + "learning_rate": 9.52161773216416e-06, + "loss": 0.2576, + "step": 6687 + }, + { + "epoch": 0.5298474945533769, + "grad_norm": 1.6437102160453194, + "learning_rate": 9.519054654064909e-06, + "loss": 0.2029, + "step": 6688 + }, + { + "epoch": 0.5299267181620123, + "grad_norm": 1.5214474428938953, + "learning_rate": 9.5164916076336e-06, + "loss": 0.188, + "step": 6689 + }, + { + "epoch": 0.5300059417706476, + "grad_norm": 1.845370636968282, + "learning_rate": 9.513928593038987e-06, + "loss": 0.2865, + "step": 6690 + }, + { + "epoch": 0.5300851653792831, + "grad_norm": 1.8787258609209871, + "learning_rate": 9.51136561044984e-06, + "loss": 0.2375, + "step": 6691 + }, + { + "epoch": 0.5301643889879184, + "grad_norm": 1.5151714693359617, + "learning_rate": 9.508802660034915e-06, + "loss": 0.1878, + "step": 6692 + }, + { + "epoch": 0.5302436125965537, + "grad_norm": 2.1548326623262173, + "learning_rate": 9.506239741962971e-06, + "loss": 0.3541, + "step": 6693 + }, + { + "epoch": 0.5303228362051892, + "grad_norm": 1.7527924388536071, + "learning_rate": 9.503676856402764e-06, + "loss": 0.2741, + "step": 6694 + }, + { + "epoch": 0.5304020598138245, + "grad_norm": 1.2962090526177372, + "learning_rate": 9.50111400352305e-06, + "loss": 0.1803, + "step": 6695 + }, + { + "epoch": 0.5304812834224599, + "grad_norm": 1.5194447912503397, + "learning_rate": 9.498551183492578e-06, + "loss": 0.1909, + "step": 6696 + }, + { + "epoch": 0.5305605070310953, + "grad_norm": 1.412741052417984, + "learning_rate": 9.495988396480097e-06, + "loss": 0.247, + "step": 6697 + }, + { + "epoch": 0.5306397306397307, + "grad_norm": 1.8234868886306743, + "learning_rate": 9.493425642654356e-06, + "loss": 0.3234, + "step": 6698 + }, + { + "epoch": 0.530718954248366, + "grad_norm": 1.4803751124426618, + "learning_rate": 9.490862922184096e-06, + "loss": 0.2625, + "step": 6699 + }, + { + "epoch": 0.5307981778570013, + "grad_norm": 1.6934213292679765, + "learning_rate": 9.488300235238067e-06, + "loss": 0.2256, + "step": 6700 + }, + { + "epoch": 0.5308774014656368, + "grad_norm": 1.8482265898384123, + "learning_rate": 9.485737581985002e-06, + "loss": 0.1978, + "step": 6701 + }, + { + "epoch": 0.5309566250742721, + "grad_norm": 1.148427836684831, + "learning_rate": 9.483174962593644e-06, + "loss": 0.0733, + "step": 6702 + }, + { + "epoch": 0.5310358486829075, + "grad_norm": 1.5065220042849732, + "learning_rate": 9.480612377232728e-06, + "loss": 0.1749, + "step": 6703 + }, + { + "epoch": 0.5311150722915429, + "grad_norm": 1.3931876475465617, + "learning_rate": 9.478049826070988e-06, + "loss": 0.2005, + "step": 6704 + }, + { + "epoch": 0.5311942959001783, + "grad_norm": 1.547844036626027, + "learning_rate": 9.475487309277156e-06, + "loss": 0.2169, + "step": 6705 + }, + { + "epoch": 0.5312735195088136, + "grad_norm": 1.3564708885872676, + "learning_rate": 9.472924827019959e-06, + "loss": 0.1944, + "step": 6706 + }, + { + "epoch": 0.531352743117449, + "grad_norm": 1.4215847582519736, + "learning_rate": 9.470362379468125e-06, + "loss": 0.1887, + "step": 6707 + }, + { + "epoch": 0.5314319667260844, + "grad_norm": 1.6639037965063888, + "learning_rate": 9.467799966790384e-06, + "loss": 0.1971, + "step": 6708 + }, + { + "epoch": 0.5315111903347197, + "grad_norm": 1.7872959741549863, + "learning_rate": 9.465237589155452e-06, + "loss": 0.2035, + "step": 6709 + }, + { + "epoch": 0.5315904139433552, + "grad_norm": 1.5266494139563354, + "learning_rate": 9.462675246732051e-06, + "loss": 0.1816, + "step": 6710 + }, + { + "epoch": 0.5316696375519905, + "grad_norm": 1.7236073652852932, + "learning_rate": 9.460112939688901e-06, + "loss": 0.2813, + "step": 6711 + }, + { + "epoch": 0.5317488611606259, + "grad_norm": 1.7651528915167407, + "learning_rate": 9.457550668194714e-06, + "loss": 0.2247, + "step": 6712 + }, + { + "epoch": 0.5318280847692612, + "grad_norm": 2.0036443265858663, + "learning_rate": 9.45498843241821e-06, + "loss": 0.2509, + "step": 6713 + }, + { + "epoch": 0.5319073083778966, + "grad_norm": 1.5789654364978454, + "learning_rate": 9.452426232528092e-06, + "loss": 0.2065, + "step": 6714 + }, + { + "epoch": 0.531986531986532, + "grad_norm": 1.4504747767975756, + "learning_rate": 9.449864068693072e-06, + "loss": 0.1802, + "step": 6715 + }, + { + "epoch": 0.5320657555951673, + "grad_norm": 1.6378494980445526, + "learning_rate": 9.447301941081856e-06, + "loss": 0.2352, + "step": 6716 + }, + { + "epoch": 0.5321449792038028, + "grad_norm": 1.6869247085174373, + "learning_rate": 9.444739849863146e-06, + "loss": 0.2835, + "step": 6717 + }, + { + "epoch": 0.5322242028124381, + "grad_norm": 1.4819390163273016, + "learning_rate": 9.442177795205647e-06, + "loss": 0.177, + "step": 6718 + }, + { + "epoch": 0.5323034264210735, + "grad_norm": 1.6452706542873483, + "learning_rate": 9.439615777278059e-06, + "loss": 0.1653, + "step": 6719 + }, + { + "epoch": 0.5323826500297089, + "grad_norm": 1.5842742962511123, + "learning_rate": 9.437053796249071e-06, + "loss": 0.1877, + "step": 6720 + }, + { + "epoch": 0.5324618736383442, + "grad_norm": 1.5325389961654612, + "learning_rate": 9.434491852287385e-06, + "loss": 0.1717, + "step": 6721 + }, + { + "epoch": 0.5325410972469796, + "grad_norm": 1.4712055393041057, + "learning_rate": 9.431929945561688e-06, + "loss": 0.1968, + "step": 6722 + }, + { + "epoch": 0.532620320855615, + "grad_norm": 1.847194933031637, + "learning_rate": 9.429368076240669e-06, + "loss": 0.2141, + "step": 6723 + }, + { + "epoch": 0.5326995444642504, + "grad_norm": 2.010953931471911, + "learning_rate": 9.42680624449302e-06, + "loss": 0.2813, + "step": 6724 + }, + { + "epoch": 0.5327787680728857, + "grad_norm": 1.9905392212703936, + "learning_rate": 9.42424445048742e-06, + "loss": 0.2785, + "step": 6725 + }, + { + "epoch": 0.5328579916815211, + "grad_norm": 1.5519898076389937, + "learning_rate": 9.42168269439255e-06, + "loss": 0.1744, + "step": 6726 + }, + { + "epoch": 0.5329372152901565, + "grad_norm": 1.2489815884023632, + "learning_rate": 9.419120976377098e-06, + "loss": 0.1867, + "step": 6727 + }, + { + "epoch": 0.5330164388987918, + "grad_norm": 1.556647804415713, + "learning_rate": 9.41655929660973e-06, + "loss": 0.2544, + "step": 6728 + }, + { + "epoch": 0.5330956625074272, + "grad_norm": 1.4873696218984018, + "learning_rate": 9.413997655259126e-06, + "loss": 0.2269, + "step": 6729 + }, + { + "epoch": 0.5331748861160626, + "grad_norm": 1.5828225641722773, + "learning_rate": 9.411436052493957e-06, + "loss": 0.2174, + "step": 6730 + }, + { + "epoch": 0.533254109724698, + "grad_norm": 1.438560811196052, + "learning_rate": 9.40887448848289e-06, + "loss": 0.2025, + "step": 6731 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.7749281631140923, + "learning_rate": 9.406312963394598e-06, + "loss": 0.1845, + "step": 6732 + }, + { + "epoch": 0.5334125569419687, + "grad_norm": 1.7563477771745766, + "learning_rate": 9.403751477397738e-06, + "loss": 0.3005, + "step": 6733 + }, + { + "epoch": 0.5334917805506041, + "grad_norm": 1.5707082099560055, + "learning_rate": 9.401190030660975e-06, + "loss": 0.2546, + "step": 6734 + }, + { + "epoch": 0.5335710041592394, + "grad_norm": 1.6766375692726976, + "learning_rate": 9.398628623352969e-06, + "loss": 0.1494, + "step": 6735 + }, + { + "epoch": 0.5336502277678749, + "grad_norm": 1.3285417095349021, + "learning_rate": 9.396067255642373e-06, + "loss": 0.1746, + "step": 6736 + }, + { + "epoch": 0.5337294513765102, + "grad_norm": 1.4059534324614433, + "learning_rate": 9.39350592769784e-06, + "loss": 0.2389, + "step": 6737 + }, + { + "epoch": 0.5338086749851456, + "grad_norm": 1.355034071935565, + "learning_rate": 9.390944639688027e-06, + "loss": 0.1767, + "step": 6738 + }, + { + "epoch": 0.5338878985937809, + "grad_norm": 1.5922845269765298, + "learning_rate": 9.388383391781576e-06, + "loss": 0.1984, + "step": 6739 + }, + { + "epoch": 0.5339671222024163, + "grad_norm": 1.719551087336993, + "learning_rate": 9.385822184147136e-06, + "loss": 0.2512, + "step": 6740 + }, + { + "epoch": 0.5340463458110517, + "grad_norm": 1.7142518922085583, + "learning_rate": 9.383261016953351e-06, + "loss": 0.2326, + "step": 6741 + }, + { + "epoch": 0.534125569419687, + "grad_norm": 1.763326968766568, + "learning_rate": 9.38069989036886e-06, + "loss": 0.2904, + "step": 6742 + }, + { + "epoch": 0.5342047930283225, + "grad_norm": 1.3857171277720168, + "learning_rate": 9.3781388045623e-06, + "loss": 0.1528, + "step": 6743 + }, + { + "epoch": 0.5342840166369578, + "grad_norm": 1.8505391177845503, + "learning_rate": 9.37557775970231e-06, + "loss": 0.2994, + "step": 6744 + }, + { + "epoch": 0.5343632402455932, + "grad_norm": 1.8492972044462455, + "learning_rate": 9.373016755957519e-06, + "loss": 0.2947, + "step": 6745 + }, + { + "epoch": 0.5344424638542286, + "grad_norm": 1.7409432756076915, + "learning_rate": 9.370455793496558e-06, + "loss": 0.2772, + "step": 6746 + }, + { + "epoch": 0.5345216874628639, + "grad_norm": 2.028965044254343, + "learning_rate": 9.367894872488053e-06, + "loss": 0.3436, + "step": 6747 + }, + { + "epoch": 0.5346009110714993, + "grad_norm": 3.0423782411705593, + "learning_rate": 9.365333993100628e-06, + "loss": 0.194, + "step": 6748 + }, + { + "epoch": 0.5346801346801346, + "grad_norm": 1.4699543873430614, + "learning_rate": 9.362773155502909e-06, + "loss": 0.2351, + "step": 6749 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 1.9852768916268493, + "learning_rate": 9.360212359863508e-06, + "loss": 0.318, + "step": 6750 + }, + { + "epoch": 0.5348385818974054, + "grad_norm": 1.265955168851745, + "learning_rate": 9.357651606351047e-06, + "loss": 0.1414, + "step": 6751 + }, + { + "epoch": 0.5349178055060408, + "grad_norm": 1.3497356084849454, + "learning_rate": 9.355090895134138e-06, + "loss": 0.1965, + "step": 6752 + }, + { + "epoch": 0.5349970291146762, + "grad_norm": 1.566259114016372, + "learning_rate": 9.352530226381388e-06, + "loss": 0.2507, + "step": 6753 + }, + { + "epoch": 0.5350762527233115, + "grad_norm": 1.442853681254746, + "learning_rate": 9.349969600261408e-06, + "loss": 0.1829, + "step": 6754 + }, + { + "epoch": 0.5351554763319469, + "grad_norm": 1.6751769314191742, + "learning_rate": 9.347409016942803e-06, + "loss": 0.2513, + "step": 6755 + }, + { + "epoch": 0.5352346999405823, + "grad_norm": 2.0313269456821863, + "learning_rate": 9.344848476594172e-06, + "loss": 0.3085, + "step": 6756 + }, + { + "epoch": 0.5353139235492177, + "grad_norm": 1.5124093873092133, + "learning_rate": 9.342287979384118e-06, + "loss": 0.1808, + "step": 6757 + }, + { + "epoch": 0.535393147157853, + "grad_norm": 1.7329358612448034, + "learning_rate": 9.339727525481234e-06, + "loss": 0.2653, + "step": 6758 + }, + { + "epoch": 0.5354723707664885, + "grad_norm": 1.6803824824168774, + "learning_rate": 9.33716711505412e-06, + "loss": 0.2128, + "step": 6759 + }, + { + "epoch": 0.5355515943751238, + "grad_norm": 1.7964456294819626, + "learning_rate": 9.334606748271357e-06, + "loss": 0.1935, + "step": 6760 + }, + { + "epoch": 0.5356308179837591, + "grad_norm": 1.551025873097992, + "learning_rate": 9.33204642530154e-06, + "loss": 0.193, + "step": 6761 + }, + { + "epoch": 0.5357100415923945, + "grad_norm": 1.4885285664233845, + "learning_rate": 9.329486146313254e-06, + "loss": 0.1571, + "step": 6762 + }, + { + "epoch": 0.5357892652010299, + "grad_norm": 1.7748101804275802, + "learning_rate": 9.326925911475075e-06, + "loss": 0.212, + "step": 6763 + }, + { + "epoch": 0.5358684888096653, + "grad_norm": 1.723281768491179, + "learning_rate": 9.324365720955589e-06, + "loss": 0.2359, + "step": 6764 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 1.516767592161019, + "learning_rate": 9.321805574923369e-06, + "loss": 0.1937, + "step": 6765 + }, + { + "epoch": 0.5360269360269361, + "grad_norm": 1.5222957327946214, + "learning_rate": 9.319245473546987e-06, + "loss": 0.1482, + "step": 6766 + }, + { + "epoch": 0.5361061596355714, + "grad_norm": 1.531027295440473, + "learning_rate": 9.316685416995017e-06, + "loss": 0.2105, + "step": 6767 + }, + { + "epoch": 0.5361853832442067, + "grad_norm": 1.5008528912792312, + "learning_rate": 9.314125405436023e-06, + "loss": 0.1662, + "step": 6768 + }, + { + "epoch": 0.5362646068528422, + "grad_norm": 1.3968376128273732, + "learning_rate": 9.311565439038571e-06, + "loss": 0.1696, + "step": 6769 + }, + { + "epoch": 0.5363438304614775, + "grad_norm": 1.7978002793367236, + "learning_rate": 9.309005517971222e-06, + "loss": 0.2559, + "step": 6770 + }, + { + "epoch": 0.5364230540701129, + "grad_norm": 1.538462798294783, + "learning_rate": 9.306445642402534e-06, + "loss": 0.1735, + "step": 6771 + }, + { + "epoch": 0.5365022776787483, + "grad_norm": 1.8106540947748968, + "learning_rate": 9.303885812501064e-06, + "loss": 0.2323, + "step": 6772 + }, + { + "epoch": 0.5365815012873837, + "grad_norm": 1.4969367501225739, + "learning_rate": 9.301326028435367e-06, + "loss": 0.1896, + "step": 6773 + }, + { + "epoch": 0.536660724896019, + "grad_norm": 2.9130970804528684, + "learning_rate": 9.298766290373986e-06, + "loss": 0.2045, + "step": 6774 + }, + { + "epoch": 0.5367399485046543, + "grad_norm": 1.6689027093276674, + "learning_rate": 9.296206598485471e-06, + "loss": 0.2609, + "step": 6775 + }, + { + "epoch": 0.5368191721132898, + "grad_norm": 1.884687721845564, + "learning_rate": 9.293646952938365e-06, + "loss": 0.2781, + "step": 6776 + }, + { + "epoch": 0.5368983957219251, + "grad_norm": 1.475400807052087, + "learning_rate": 9.291087353901208e-06, + "loss": 0.2238, + "step": 6777 + }, + { + "epoch": 0.5369776193305605, + "grad_norm": 1.6800887026100833, + "learning_rate": 9.28852780154254e-06, + "loss": 0.1718, + "step": 6778 + }, + { + "epoch": 0.5370568429391959, + "grad_norm": 1.594646250244442, + "learning_rate": 9.285968296030891e-06, + "loss": 0.1928, + "step": 6779 + }, + { + "epoch": 0.5371360665478313, + "grad_norm": 1.6684572489481007, + "learning_rate": 9.283408837534793e-06, + "loss": 0.2109, + "step": 6780 + }, + { + "epoch": 0.5372152901564666, + "grad_norm": 1.5809270240124893, + "learning_rate": 9.280849426222778e-06, + "loss": 0.2158, + "step": 6781 + }, + { + "epoch": 0.537294513765102, + "grad_norm": 1.5205443732991, + "learning_rate": 9.278290062263364e-06, + "loss": 0.1733, + "step": 6782 + }, + { + "epoch": 0.5373737373737374, + "grad_norm": 1.3913000406741118, + "learning_rate": 9.27573074582508e-06, + "loss": 0.1599, + "step": 6783 + }, + { + "epoch": 0.5374529609823727, + "grad_norm": 1.368914089211404, + "learning_rate": 9.27317147707644e-06, + "loss": 0.1134, + "step": 6784 + }, + { + "epoch": 0.5375321845910082, + "grad_norm": 1.7023713108481129, + "learning_rate": 9.270612256185962e-06, + "loss": 0.2073, + "step": 6785 + }, + { + "epoch": 0.5376114081996435, + "grad_norm": 1.9357893966901718, + "learning_rate": 9.268053083322157e-06, + "loss": 0.2758, + "step": 6786 + }, + { + "epoch": 0.5376906318082789, + "grad_norm": 1.6010082869327436, + "learning_rate": 9.265493958653533e-06, + "loss": 0.1937, + "step": 6787 + }, + { + "epoch": 0.5377698554169142, + "grad_norm": 1.6361248143081104, + "learning_rate": 9.262934882348599e-06, + "loss": 0.1854, + "step": 6788 + }, + { + "epoch": 0.5378490790255496, + "grad_norm": 1.6057876065624228, + "learning_rate": 9.260375854575857e-06, + "loss": 0.1508, + "step": 6789 + }, + { + "epoch": 0.537928302634185, + "grad_norm": 1.517449055211345, + "learning_rate": 9.257816875503805e-06, + "loss": 0.2046, + "step": 6790 + }, + { + "epoch": 0.5380075262428203, + "grad_norm": 1.780544364691931, + "learning_rate": 9.255257945300941e-06, + "loss": 0.2048, + "step": 6791 + }, + { + "epoch": 0.5380867498514558, + "grad_norm": 1.3008160343959099, + "learning_rate": 9.252699064135759e-06, + "loss": 0.1145, + "step": 6792 + }, + { + "epoch": 0.5381659734600911, + "grad_norm": 1.6095789214980698, + "learning_rate": 9.250140232176746e-06, + "loss": 0.1947, + "step": 6793 + }, + { + "epoch": 0.5382451970687265, + "grad_norm": 1.5748436730638056, + "learning_rate": 9.247581449592392e-06, + "loss": 0.2151, + "step": 6794 + }, + { + "epoch": 0.5383244206773619, + "grad_norm": 1.911311198120432, + "learning_rate": 9.245022716551178e-06, + "loss": 0.2478, + "step": 6795 + }, + { + "epoch": 0.5384036442859972, + "grad_norm": 1.5673164170569804, + "learning_rate": 9.242464033221584e-06, + "loss": 0.2216, + "step": 6796 + }, + { + "epoch": 0.5384828678946326, + "grad_norm": 2.007826362531275, + "learning_rate": 9.239905399772092e-06, + "loss": 0.3116, + "step": 6797 + }, + { + "epoch": 0.538562091503268, + "grad_norm": 2.073217020398983, + "learning_rate": 9.237346816371169e-06, + "loss": 0.2895, + "step": 6798 + }, + { + "epoch": 0.5386413151119034, + "grad_norm": 2.5203239544483353, + "learning_rate": 9.234788283187291e-06, + "loss": 0.2933, + "step": 6799 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 1.4939708121844684, + "learning_rate": 9.23222980038892e-06, + "loss": 0.2006, + "step": 6800 + }, + { + "epoch": 0.5387997623291741, + "grad_norm": 2.242616962081198, + "learning_rate": 9.229671368144524e-06, + "loss": 0.3136, + "step": 6801 + }, + { + "epoch": 0.5388789859378095, + "grad_norm": 1.7522087979966592, + "learning_rate": 9.227112986622562e-06, + "loss": 0.2659, + "step": 6802 + }, + { + "epoch": 0.5389582095464448, + "grad_norm": 1.3821852784587902, + "learning_rate": 9.224554655991492e-06, + "loss": 0.1914, + "step": 6803 + }, + { + "epoch": 0.5390374331550802, + "grad_norm": 2.1661008444848857, + "learning_rate": 9.221996376419763e-06, + "loss": 0.3001, + "step": 6804 + }, + { + "epoch": 0.5391166567637156, + "grad_norm": 1.5375078321927154, + "learning_rate": 9.219438148075834e-06, + "loss": 0.2076, + "step": 6805 + }, + { + "epoch": 0.539195880372351, + "grad_norm": 1.7228165284179944, + "learning_rate": 9.216879971128142e-06, + "loss": 0.2797, + "step": 6806 + }, + { + "epoch": 0.5392751039809863, + "grad_norm": 1.6069938870513174, + "learning_rate": 9.21432184574514e-06, + "loss": 0.1893, + "step": 6807 + }, + { + "epoch": 0.5393543275896218, + "grad_norm": 1.4410638900423316, + "learning_rate": 9.21176377209526e-06, + "loss": 0.2318, + "step": 6808 + }, + { + "epoch": 0.5394335511982571, + "grad_norm": 1.7264312231655994, + "learning_rate": 9.209205750346945e-06, + "loss": 0.2414, + "step": 6809 + }, + { + "epoch": 0.5395127748068924, + "grad_norm": 1.967709664441678, + "learning_rate": 9.206647780668629e-06, + "loss": 0.2487, + "step": 6810 + }, + { + "epoch": 0.5395919984155279, + "grad_norm": 1.5089670342628774, + "learning_rate": 9.204089863228736e-06, + "loss": 0.2166, + "step": 6811 + }, + { + "epoch": 0.5396712220241632, + "grad_norm": 1.4059351759289564, + "learning_rate": 9.201531998195697e-06, + "loss": 0.1653, + "step": 6812 + }, + { + "epoch": 0.5397504456327986, + "grad_norm": 1.542161201926313, + "learning_rate": 9.198974185737934e-06, + "loss": 0.187, + "step": 6813 + }, + { + "epoch": 0.5398296692414339, + "grad_norm": 1.6291370136864782, + "learning_rate": 9.196416426023868e-06, + "loss": 0.2021, + "step": 6814 + }, + { + "epoch": 0.5399088928500693, + "grad_norm": 1.6932647921587172, + "learning_rate": 9.193858719221912e-06, + "loss": 0.2505, + "step": 6815 + }, + { + "epoch": 0.5399881164587047, + "grad_norm": 1.6916988956884456, + "learning_rate": 9.19130106550048e-06, + "loss": 0.1859, + "step": 6816 + }, + { + "epoch": 0.54006734006734, + "grad_norm": 1.39481475664272, + "learning_rate": 9.188743465027981e-06, + "loss": 0.1397, + "step": 6817 + }, + { + "epoch": 0.5401465636759755, + "grad_norm": 1.520532635046863, + "learning_rate": 9.186185917972821e-06, + "loss": 0.1564, + "step": 6818 + }, + { + "epoch": 0.5402257872846108, + "grad_norm": 1.8713052680090447, + "learning_rate": 9.183628424503405e-06, + "loss": 0.2362, + "step": 6819 + }, + { + "epoch": 0.5403050108932462, + "grad_norm": 1.617227033114138, + "learning_rate": 9.181070984788127e-06, + "loss": 0.2031, + "step": 6820 + }, + { + "epoch": 0.5403842345018816, + "grad_norm": 1.5983723180223535, + "learning_rate": 9.178513598995384e-06, + "loss": 0.2095, + "step": 6821 + }, + { + "epoch": 0.5404634581105169, + "grad_norm": 1.6445190312669158, + "learning_rate": 9.17595626729357e-06, + "loss": 0.2308, + "step": 6822 + }, + { + "epoch": 0.5405426817191523, + "grad_norm": 1.6140072254229159, + "learning_rate": 9.17339898985107e-06, + "loss": 0.2284, + "step": 6823 + }, + { + "epoch": 0.5406219053277876, + "grad_norm": 1.8511931150082803, + "learning_rate": 9.170841766836268e-06, + "loss": 0.2776, + "step": 6824 + }, + { + "epoch": 0.5407011289364231, + "grad_norm": 1.8773750824975899, + "learning_rate": 9.168284598417547e-06, + "loss": 0.282, + "step": 6825 + }, + { + "epoch": 0.5407803525450584, + "grad_norm": 1.2955582505278562, + "learning_rate": 9.165727484763283e-06, + "loss": 0.1764, + "step": 6826 + }, + { + "epoch": 0.5408595761536938, + "grad_norm": 1.709261923249498, + "learning_rate": 9.16317042604185e-06, + "loss": 0.312, + "step": 6827 + }, + { + "epoch": 0.5409387997623292, + "grad_norm": 1.5455408348596222, + "learning_rate": 9.160613422421616e-06, + "loss": 0.2232, + "step": 6828 + }, + { + "epoch": 0.5410180233709645, + "grad_norm": 1.683470175310256, + "learning_rate": 9.158056474070952e-06, + "loss": 0.1741, + "step": 6829 + }, + { + "epoch": 0.5410972469795999, + "grad_norm": 1.390657859679486, + "learning_rate": 9.155499581158217e-06, + "loss": 0.1521, + "step": 6830 + }, + { + "epoch": 0.5411764705882353, + "grad_norm": 1.342939265783351, + "learning_rate": 9.152942743851771e-06, + "loss": 0.1481, + "step": 6831 + }, + { + "epoch": 0.5412556941968707, + "grad_norm": 1.5404597635699995, + "learning_rate": 9.15038596231997e-06, + "loss": 0.1657, + "step": 6832 + }, + { + "epoch": 0.541334917805506, + "grad_norm": 1.7452959716462286, + "learning_rate": 9.147829236731164e-06, + "loss": 0.2441, + "step": 6833 + }, + { + "epoch": 0.5414141414141415, + "grad_norm": 1.8456251227756069, + "learning_rate": 9.145272567253703e-06, + "loss": 0.2412, + "step": 6834 + }, + { + "epoch": 0.5414933650227768, + "grad_norm": 1.6205294600626676, + "learning_rate": 9.142715954055932e-06, + "loss": 0.2302, + "step": 6835 + }, + { + "epoch": 0.5415725886314121, + "grad_norm": 1.293886952678985, + "learning_rate": 9.140159397306188e-06, + "loss": 0.1517, + "step": 6836 + }, + { + "epoch": 0.5416518122400475, + "grad_norm": 1.2160053073034036, + "learning_rate": 9.137602897172814e-06, + "loss": 0.1257, + "step": 6837 + }, + { + "epoch": 0.5417310358486829, + "grad_norm": 1.4257844619134687, + "learning_rate": 9.135046453824136e-06, + "loss": 0.1576, + "step": 6838 + }, + { + "epoch": 0.5418102594573183, + "grad_norm": 1.7166193055659036, + "learning_rate": 9.132490067428488e-06, + "loss": 0.2489, + "step": 6839 + }, + { + "epoch": 0.5418894830659536, + "grad_norm": 1.685833293793044, + "learning_rate": 9.129933738154196e-06, + "loss": 0.2176, + "step": 6840 + }, + { + "epoch": 0.5419687066745891, + "grad_norm": 1.8485552936770095, + "learning_rate": 9.12737746616958e-06, + "loss": 0.2867, + "step": 6841 + }, + { + "epoch": 0.5420479302832244, + "grad_norm": 1.4195525431059022, + "learning_rate": 9.124821251642959e-06, + "loss": 0.1862, + "step": 6842 + }, + { + "epoch": 0.5421271538918597, + "grad_norm": 1.7481022330162914, + "learning_rate": 9.122265094742648e-06, + "loss": 0.2348, + "step": 6843 + }, + { + "epoch": 0.5422063775004952, + "grad_norm": 1.520956774038658, + "learning_rate": 9.119708995636957e-06, + "loss": 0.2061, + "step": 6844 + }, + { + "epoch": 0.5422856011091305, + "grad_norm": 1.6501130389696557, + "learning_rate": 9.117152954494195e-06, + "loss": 0.2284, + "step": 6845 + }, + { + "epoch": 0.5423648247177659, + "grad_norm": 1.4134937676704153, + "learning_rate": 9.114596971482658e-06, + "loss": 0.2372, + "step": 6846 + }, + { + "epoch": 0.5424440483264013, + "grad_norm": 1.48107579491038, + "learning_rate": 9.112041046770653e-06, + "loss": 0.2205, + "step": 6847 + }, + { + "epoch": 0.5425232719350367, + "grad_norm": 1.5267428398691307, + "learning_rate": 9.109485180526474e-06, + "loss": 0.2025, + "step": 6848 + }, + { + "epoch": 0.542602495543672, + "grad_norm": 1.5472428285904707, + "learning_rate": 9.106929372918408e-06, + "loss": 0.2239, + "step": 6849 + }, + { + "epoch": 0.5426817191523073, + "grad_norm": 1.6485598906539451, + "learning_rate": 9.104373624114746e-06, + "loss": 0.1905, + "step": 6850 + }, + { + "epoch": 0.5427609427609428, + "grad_norm": 1.7167078014689015, + "learning_rate": 9.101817934283775e-06, + "loss": 0.2033, + "step": 6851 + }, + { + "epoch": 0.5428401663695781, + "grad_norm": 1.7845452277875142, + "learning_rate": 9.099262303593768e-06, + "loss": 0.2484, + "step": 6852 + }, + { + "epoch": 0.5429193899782135, + "grad_norm": 1.7053247342526823, + "learning_rate": 9.096706732213005e-06, + "loss": 0.274, + "step": 6853 + }, + { + "epoch": 0.5429986135868489, + "grad_norm": 1.8035295520221963, + "learning_rate": 9.094151220309757e-06, + "loss": 0.2479, + "step": 6854 + }, + { + "epoch": 0.5430778371954843, + "grad_norm": 2.2689328059763216, + "learning_rate": 9.091595768052291e-06, + "loss": 0.3611, + "step": 6855 + }, + { + "epoch": 0.5431570608041196, + "grad_norm": 1.5834899513868252, + "learning_rate": 9.089040375608876e-06, + "loss": 0.1943, + "step": 6856 + }, + { + "epoch": 0.543236284412755, + "grad_norm": 2.14135252106152, + "learning_rate": 9.086485043147768e-06, + "loss": 0.4022, + "step": 6857 + }, + { + "epoch": 0.5433155080213904, + "grad_norm": 1.7820685706167885, + "learning_rate": 9.083929770837222e-06, + "loss": 0.2008, + "step": 6858 + }, + { + "epoch": 0.5433947316300257, + "grad_norm": 1.6906391788519166, + "learning_rate": 9.081374558845496e-06, + "loss": 0.1718, + "step": 6859 + }, + { + "epoch": 0.5434739552386612, + "grad_norm": 1.4666154262466298, + "learning_rate": 9.078819407340833e-06, + "loss": 0.2146, + "step": 6860 + }, + { + "epoch": 0.5435531788472965, + "grad_norm": 1.9349018814647516, + "learning_rate": 9.07626431649148e-06, + "loss": 0.2326, + "step": 6861 + }, + { + "epoch": 0.5436324024559319, + "grad_norm": 1.9896868511337484, + "learning_rate": 9.073709286465678e-06, + "loss": 0.2692, + "step": 6862 + }, + { + "epoch": 0.5437116260645672, + "grad_norm": 1.9799856737716437, + "learning_rate": 9.071154317431661e-06, + "loss": 0.2285, + "step": 6863 + }, + { + "epoch": 0.5437908496732026, + "grad_norm": 1.6326176647506145, + "learning_rate": 9.068599409557664e-06, + "loss": 0.223, + "step": 6864 + }, + { + "epoch": 0.543870073281838, + "grad_norm": 1.5168782514780899, + "learning_rate": 9.066044563011914e-06, + "loss": 0.2702, + "step": 6865 + }, + { + "epoch": 0.5439492968904733, + "grad_norm": 1.716583370934031, + "learning_rate": 9.063489777962634e-06, + "loss": 0.3145, + "step": 6866 + }, + { + "epoch": 0.5440285204991088, + "grad_norm": 1.287990390476046, + "learning_rate": 9.06093505457805e-06, + "loss": 0.1591, + "step": 6867 + }, + { + "epoch": 0.5441077441077441, + "grad_norm": 1.5966533475058962, + "learning_rate": 9.058380393026369e-06, + "loss": 0.2272, + "step": 6868 + }, + { + "epoch": 0.5441869677163795, + "grad_norm": 1.7230728770952775, + "learning_rate": 9.055825793475814e-06, + "loss": 0.2604, + "step": 6869 + }, + { + "epoch": 0.5442661913250149, + "grad_norm": 2.036746860379108, + "learning_rate": 9.053271256094582e-06, + "loss": 0.2219, + "step": 6870 + }, + { + "epoch": 0.5443454149336502, + "grad_norm": 1.8948099519392991, + "learning_rate": 9.050716781050885e-06, + "loss": 0.2507, + "step": 6871 + }, + { + "epoch": 0.5444246385422856, + "grad_norm": 1.3430327041262002, + "learning_rate": 9.04816236851292e-06, + "loss": 0.1968, + "step": 6872 + }, + { + "epoch": 0.544503862150921, + "grad_norm": 1.621941606237805, + "learning_rate": 9.045608018648884e-06, + "loss": 0.2082, + "step": 6873 + }, + { + "epoch": 0.5445830857595564, + "grad_norm": 1.5529975101368307, + "learning_rate": 9.043053731626964e-06, + "loss": 0.1695, + "step": 6874 + }, + { + "epoch": 0.5446623093681917, + "grad_norm": 1.6020556779178161, + "learning_rate": 9.040499507615356e-06, + "loss": 0.2584, + "step": 6875 + }, + { + "epoch": 0.5447415329768271, + "grad_norm": 1.7088753541123156, + "learning_rate": 9.037945346782236e-06, + "loss": 0.1704, + "step": 6876 + }, + { + "epoch": 0.5448207565854625, + "grad_norm": 1.3635660552877806, + "learning_rate": 9.035391249295788e-06, + "loss": 0.2199, + "step": 6877 + }, + { + "epoch": 0.5448999801940978, + "grad_norm": 1.6698307081465469, + "learning_rate": 9.032837215324183e-06, + "loss": 0.2058, + "step": 6878 + }, + { + "epoch": 0.5449792038027332, + "grad_norm": 1.61733190614819, + "learning_rate": 9.030283245035594e-06, + "loss": 0.274, + "step": 6879 + }, + { + "epoch": 0.5450584274113686, + "grad_norm": 1.4739765052193636, + "learning_rate": 9.027729338598188e-06, + "loss": 0.2064, + "step": 6880 + }, + { + "epoch": 0.545137651020004, + "grad_norm": 1.3168806467659693, + "learning_rate": 9.025175496180125e-06, + "loss": 0.1464, + "step": 6881 + }, + { + "epoch": 0.5452168746286393, + "grad_norm": 1.5843242248814735, + "learning_rate": 9.022621717949566e-06, + "loss": 0.1816, + "step": 6882 + }, + { + "epoch": 0.5452960982372748, + "grad_norm": 1.6291385199715893, + "learning_rate": 9.020068004074665e-06, + "loss": 0.1827, + "step": 6883 + }, + { + "epoch": 0.5453753218459101, + "grad_norm": 2.1919884741023283, + "learning_rate": 9.01751435472357e-06, + "loss": 0.2886, + "step": 6884 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.8100975323825783, + "learning_rate": 9.014960770064429e-06, + "loss": 0.2819, + "step": 6885 + }, + { + "epoch": 0.5455337690631809, + "grad_norm": 1.6905134710212093, + "learning_rate": 9.012407250265377e-06, + "loss": 0.2282, + "step": 6886 + }, + { + "epoch": 0.5456129926718162, + "grad_norm": 2.0570904728909833, + "learning_rate": 9.009853795494558e-06, + "loss": 0.2597, + "step": 6887 + }, + { + "epoch": 0.5456922162804516, + "grad_norm": 1.4600820729356667, + "learning_rate": 9.007300405920105e-06, + "loss": 0.24, + "step": 6888 + }, + { + "epoch": 0.5457714398890869, + "grad_norm": 1.7745493208224687, + "learning_rate": 9.00474708171014e-06, + "loss": 0.2247, + "step": 6889 + }, + { + "epoch": 0.5458506634977223, + "grad_norm": 1.6931209458367236, + "learning_rate": 9.002193823032791e-06, + "loss": 0.1876, + "step": 6890 + }, + { + "epoch": 0.5459298871063577, + "grad_norm": 1.5057381067310711, + "learning_rate": 8.999640630056183e-06, + "loss": 0.2032, + "step": 6891 + }, + { + "epoch": 0.546009110714993, + "grad_norm": 1.7450983974246, + "learning_rate": 8.997087502948423e-06, + "loss": 0.2253, + "step": 6892 + }, + { + "epoch": 0.5460883343236285, + "grad_norm": 1.4917284206479513, + "learning_rate": 8.994534441877625e-06, + "loss": 0.2274, + "step": 6893 + }, + { + "epoch": 0.5461675579322638, + "grad_norm": 2.001259630946284, + "learning_rate": 8.991981447011896e-06, + "loss": 0.2456, + "step": 6894 + }, + { + "epoch": 0.5462467815408992, + "grad_norm": 1.5615594623000488, + "learning_rate": 8.989428518519336e-06, + "loss": 0.2113, + "step": 6895 + }, + { + "epoch": 0.5463260051495346, + "grad_norm": 2.1911577341485944, + "learning_rate": 8.986875656568047e-06, + "loss": 0.2718, + "step": 6896 + }, + { + "epoch": 0.5464052287581699, + "grad_norm": 1.7277958646873435, + "learning_rate": 8.984322861326122e-06, + "loss": 0.2703, + "step": 6897 + }, + { + "epoch": 0.5464844523668053, + "grad_norm": 1.774571443720264, + "learning_rate": 8.981770132961649e-06, + "loss": 0.2203, + "step": 6898 + }, + { + "epoch": 0.5465636759754406, + "grad_norm": 1.5073142137142648, + "learning_rate": 8.979217471642712e-06, + "loss": 0.2412, + "step": 6899 + }, + { + "epoch": 0.5466428995840761, + "grad_norm": 1.3002400252869717, + "learning_rate": 8.976664877537395e-06, + "loss": 0.1795, + "step": 6900 + }, + { + "epoch": 0.5467221231927114, + "grad_norm": 1.8578127862288634, + "learning_rate": 8.974112350813771e-06, + "loss": 0.2882, + "step": 6901 + }, + { + "epoch": 0.5468013468013468, + "grad_norm": 1.4904534882014997, + "learning_rate": 8.971559891639913e-06, + "loss": 0.2295, + "step": 6902 + }, + { + "epoch": 0.5468805704099822, + "grad_norm": 1.3041952674075796, + "learning_rate": 8.969007500183886e-06, + "loss": 0.1451, + "step": 6903 + }, + { + "epoch": 0.5469597940186175, + "grad_norm": 1.8156540611505874, + "learning_rate": 8.966455176613754e-06, + "loss": 0.2618, + "step": 6904 + }, + { + "epoch": 0.5470390176272529, + "grad_norm": 1.2825538006730177, + "learning_rate": 8.963902921097579e-06, + "loss": 0.2486, + "step": 6905 + }, + { + "epoch": 0.5471182412358883, + "grad_norm": 1.8631402156611554, + "learning_rate": 8.961350733803406e-06, + "loss": 0.1946, + "step": 6906 + }, + { + "epoch": 0.5471974648445237, + "grad_norm": 1.6470375724717945, + "learning_rate": 8.958798614899291e-06, + "loss": 0.2371, + "step": 6907 + }, + { + "epoch": 0.547276688453159, + "grad_norm": 1.294652547885192, + "learning_rate": 8.956246564553282e-06, + "loss": 0.2098, + "step": 6908 + }, + { + "epoch": 0.5473559120617945, + "grad_norm": 1.2678980500082684, + "learning_rate": 8.95369458293341e-06, + "loss": 0.1126, + "step": 6909 + }, + { + "epoch": 0.5474351356704298, + "grad_norm": 1.5585670775659421, + "learning_rate": 8.951142670207718e-06, + "loss": 0.2345, + "step": 6910 + }, + { + "epoch": 0.5475143592790651, + "grad_norm": 1.7024952271500027, + "learning_rate": 8.948590826544232e-06, + "loss": 0.2969, + "step": 6911 + }, + { + "epoch": 0.5475935828877005, + "grad_norm": 1.6387397227618559, + "learning_rate": 8.94603905211098e-06, + "loss": 0.1659, + "step": 6912 + }, + { + "epoch": 0.5476728064963359, + "grad_norm": 2.1061774324591918, + "learning_rate": 8.943487347075988e-06, + "loss": 0.314, + "step": 6913 + }, + { + "epoch": 0.5477520301049713, + "grad_norm": 1.6830690489349167, + "learning_rate": 8.94093571160727e-06, + "loss": 0.2152, + "step": 6914 + }, + { + "epoch": 0.5478312537136066, + "grad_norm": 1.5511579142019574, + "learning_rate": 8.938384145872838e-06, + "loss": 0.1727, + "step": 6915 + }, + { + "epoch": 0.5479104773222421, + "grad_norm": 1.911199163525638, + "learning_rate": 8.935832650040703e-06, + "loss": 0.1795, + "step": 6916 + }, + { + "epoch": 0.5479897009308774, + "grad_norm": 1.950852220217877, + "learning_rate": 8.933281224278867e-06, + "loss": 0.2546, + "step": 6917 + }, + { + "epoch": 0.5480689245395127, + "grad_norm": 1.7313546153517896, + "learning_rate": 8.930729868755333e-06, + "loss": 0.2049, + "step": 6918 + }, + { + "epoch": 0.5481481481481482, + "grad_norm": 1.4208556376706853, + "learning_rate": 8.928178583638088e-06, + "loss": 0.1915, + "step": 6919 + }, + { + "epoch": 0.5482273717567835, + "grad_norm": 1.8168622063001627, + "learning_rate": 8.925627369095125e-06, + "loss": 0.2143, + "step": 6920 + }, + { + "epoch": 0.5483065953654189, + "grad_norm": 1.5562270733643324, + "learning_rate": 8.923076225294434e-06, + "loss": 0.2199, + "step": 6921 + }, + { + "epoch": 0.5483858189740543, + "grad_norm": 1.4541212312269738, + "learning_rate": 8.920525152403989e-06, + "loss": 0.1766, + "step": 6922 + }, + { + "epoch": 0.5484650425826897, + "grad_norm": 1.7193693110797912, + "learning_rate": 8.917974150591772e-06, + "loss": 0.3321, + "step": 6923 + }, + { + "epoch": 0.548544266191325, + "grad_norm": 1.6081911826509625, + "learning_rate": 8.915423220025747e-06, + "loss": 0.1312, + "step": 6924 + }, + { + "epoch": 0.5486234897999603, + "grad_norm": 1.5676540063857207, + "learning_rate": 8.912872360873885e-06, + "loss": 0.2028, + "step": 6925 + }, + { + "epoch": 0.5487027134085958, + "grad_norm": 1.5704225650977457, + "learning_rate": 8.91032157330415e-06, + "loss": 0.1777, + "step": 6926 + }, + { + "epoch": 0.5487819370172311, + "grad_norm": 1.745454871079652, + "learning_rate": 8.907770857484493e-06, + "loss": 0.2213, + "step": 6927 + }, + { + "epoch": 0.5488611606258665, + "grad_norm": 1.6703369921751172, + "learning_rate": 8.90522021358287e-06, + "loss": 0.2198, + "step": 6928 + }, + { + "epoch": 0.5489403842345019, + "grad_norm": 2.0965192803406456, + "learning_rate": 8.90266964176723e-06, + "loss": 0.2699, + "step": 6929 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 1.3559635350774237, + "learning_rate": 8.90011914220551e-06, + "loss": 0.2283, + "step": 6930 + }, + { + "epoch": 0.5490988314517726, + "grad_norm": 1.5489031735078498, + "learning_rate": 8.897568715065658e-06, + "loss": 0.1436, + "step": 6931 + }, + { + "epoch": 0.549178055060408, + "grad_norm": 1.852828862896294, + "learning_rate": 8.895018360515597e-06, + "loss": 0.2383, + "step": 6932 + }, + { + "epoch": 0.5492572786690434, + "grad_norm": 1.7942279606738023, + "learning_rate": 8.892468078723262e-06, + "loss": 0.1639, + "step": 6933 + }, + { + "epoch": 0.5493365022776787, + "grad_norm": 2.163698070692759, + "learning_rate": 8.889917869856576e-06, + "loss": 0.3001, + "step": 6934 + }, + { + "epoch": 0.5494157258863142, + "grad_norm": 1.4347570990915213, + "learning_rate": 8.887367734083454e-06, + "loss": 0.1299, + "step": 6935 + }, + { + "epoch": 0.5494949494949495, + "grad_norm": 1.6020948842533544, + "learning_rate": 8.884817671571815e-06, + "loss": 0.2203, + "step": 6936 + }, + { + "epoch": 0.5495741731035849, + "grad_norm": 1.7404160690046266, + "learning_rate": 8.882267682489566e-06, + "loss": 0.2077, + "step": 6937 + }, + { + "epoch": 0.5496533967122202, + "grad_norm": 1.4280478928277809, + "learning_rate": 8.879717767004613e-06, + "loss": 0.1662, + "step": 6938 + }, + { + "epoch": 0.5497326203208556, + "grad_norm": 1.5522585438454959, + "learning_rate": 8.877167925284855e-06, + "loss": 0.1782, + "step": 6939 + }, + { + "epoch": 0.549811843929491, + "grad_norm": 1.625170206623002, + "learning_rate": 8.874618157498183e-06, + "loss": 0.1876, + "step": 6940 + }, + { + "epoch": 0.5498910675381263, + "grad_norm": 1.550319006406342, + "learning_rate": 8.872068463812492e-06, + "loss": 0.2436, + "step": 6941 + }, + { + "epoch": 0.5499702911467618, + "grad_norm": 1.7177067006925795, + "learning_rate": 8.869518844395667e-06, + "loss": 0.2373, + "step": 6942 + }, + { + "epoch": 0.5500495147553971, + "grad_norm": 1.6049615452117618, + "learning_rate": 8.866969299415585e-06, + "loss": 0.2415, + "step": 6943 + }, + { + "epoch": 0.5501287383640325, + "grad_norm": 1.2897454364585075, + "learning_rate": 8.864419829040122e-06, + "loss": 0.1625, + "step": 6944 + }, + { + "epoch": 0.5502079619726679, + "grad_norm": 1.6597303844577715, + "learning_rate": 8.86187043343715e-06, + "loss": 0.1362, + "step": 6945 + }, + { + "epoch": 0.5502871855813032, + "grad_norm": 1.849790792496832, + "learning_rate": 8.859321112774535e-06, + "loss": 0.1893, + "step": 6946 + }, + { + "epoch": 0.5503664091899386, + "grad_norm": 1.58852716633234, + "learning_rate": 8.856771867220135e-06, + "loss": 0.2134, + "step": 6947 + }, + { + "epoch": 0.550445632798574, + "grad_norm": 1.7915293270654693, + "learning_rate": 8.854222696941807e-06, + "loss": 0.2057, + "step": 6948 + }, + { + "epoch": 0.5505248564072094, + "grad_norm": 1.574054080189603, + "learning_rate": 8.8516736021074e-06, + "loss": 0.1613, + "step": 6949 + }, + { + "epoch": 0.5506040800158447, + "grad_norm": 1.8713321733407189, + "learning_rate": 8.849124582884762e-06, + "loss": 0.2005, + "step": 6950 + }, + { + "epoch": 0.5506833036244801, + "grad_norm": 1.7987161318970615, + "learning_rate": 8.846575639441732e-06, + "loss": 0.2935, + "step": 6951 + }, + { + "epoch": 0.5507625272331155, + "grad_norm": 2.041162608096013, + "learning_rate": 8.844026771946148e-06, + "loss": 0.3758, + "step": 6952 + }, + { + "epoch": 0.5508417508417508, + "grad_norm": 1.3627367382965248, + "learning_rate": 8.841477980565838e-06, + "loss": 0.217, + "step": 6953 + }, + { + "epoch": 0.5509209744503862, + "grad_norm": 2.2267755660566944, + "learning_rate": 8.838929265468627e-06, + "loss": 0.2653, + "step": 6954 + }, + { + "epoch": 0.5510001980590216, + "grad_norm": 1.711149475187765, + "learning_rate": 8.836380626822339e-06, + "loss": 0.2022, + "step": 6955 + }, + { + "epoch": 0.551079421667657, + "grad_norm": 1.8631335363885488, + "learning_rate": 8.833832064794787e-06, + "loss": 0.3493, + "step": 6956 + }, + { + "epoch": 0.5511586452762923, + "grad_norm": 1.7089585466617916, + "learning_rate": 8.831283579553781e-06, + "loss": 0.1445, + "step": 6957 + }, + { + "epoch": 0.5512378688849278, + "grad_norm": 1.4259605261245403, + "learning_rate": 8.828735171267131e-06, + "loss": 0.2295, + "step": 6958 + }, + { + "epoch": 0.5513170924935631, + "grad_norm": 1.127324555335012, + "learning_rate": 8.82618684010263e-06, + "loss": 0.1599, + "step": 6959 + }, + { + "epoch": 0.5513963161021984, + "grad_norm": 1.6127379805024942, + "learning_rate": 8.823638586228081e-06, + "loss": 0.2846, + "step": 6960 + }, + { + "epoch": 0.5514755397108339, + "grad_norm": 1.7341644378888477, + "learning_rate": 8.82109040981127e-06, + "loss": 0.2213, + "step": 6961 + }, + { + "epoch": 0.5515547633194692, + "grad_norm": 1.606602164675874, + "learning_rate": 8.818542311019982e-06, + "loss": 0.2139, + "step": 6962 + }, + { + "epoch": 0.5516339869281046, + "grad_norm": 1.751798026497039, + "learning_rate": 8.815994290022e-06, + "loss": 0.2841, + "step": 6963 + }, + { + "epoch": 0.5517132105367399, + "grad_norm": 1.5172786185685385, + "learning_rate": 8.813446346985095e-06, + "loss": 0.1577, + "step": 6964 + }, + { + "epoch": 0.5517924341453754, + "grad_norm": 1.861339583970913, + "learning_rate": 8.810898482077038e-06, + "loss": 0.2756, + "step": 6965 + }, + { + "epoch": 0.5518716577540107, + "grad_norm": 1.5778570150187614, + "learning_rate": 8.808350695465597e-06, + "loss": 0.2155, + "step": 6966 + }, + { + "epoch": 0.551950881362646, + "grad_norm": 1.7950250162561092, + "learning_rate": 8.805802987318527e-06, + "loss": 0.2523, + "step": 6967 + }, + { + "epoch": 0.5520301049712815, + "grad_norm": 1.2794079673728658, + "learning_rate": 8.803255357803584e-06, + "loss": 0.1913, + "step": 6968 + }, + { + "epoch": 0.5521093285799168, + "grad_norm": 1.5682419708075892, + "learning_rate": 8.800707807088521e-06, + "loss": 0.1828, + "step": 6969 + }, + { + "epoch": 0.5521885521885522, + "grad_norm": 2.1114238654780615, + "learning_rate": 8.798160335341078e-06, + "loss": 0.2739, + "step": 6970 + }, + { + "epoch": 0.5522677757971876, + "grad_norm": 1.8964277903416056, + "learning_rate": 8.795612942728989e-06, + "loss": 0.2621, + "step": 6971 + }, + { + "epoch": 0.5523469994058229, + "grad_norm": 1.4650625724696595, + "learning_rate": 8.793065629419996e-06, + "loss": 0.2222, + "step": 6972 + }, + { + "epoch": 0.5524262230144583, + "grad_norm": 1.5752744680573632, + "learning_rate": 8.790518395581823e-06, + "loss": 0.2224, + "step": 6973 + }, + { + "epoch": 0.5525054466230936, + "grad_norm": 2.050986661584186, + "learning_rate": 8.787971241382193e-06, + "loss": 0.2312, + "step": 6974 + }, + { + "epoch": 0.5525846702317291, + "grad_norm": 1.4061768629540126, + "learning_rate": 8.785424166988827e-06, + "loss": 0.1915, + "step": 6975 + }, + { + "epoch": 0.5526638938403644, + "grad_norm": 1.4577425464026734, + "learning_rate": 8.782877172569433e-06, + "loss": 0.1579, + "step": 6976 + }, + { + "epoch": 0.5527431174489998, + "grad_norm": 1.5977697023573567, + "learning_rate": 8.78033025829172e-06, + "loss": 0.2339, + "step": 6977 + }, + { + "epoch": 0.5528223410576352, + "grad_norm": 1.5270255423035342, + "learning_rate": 8.777783424323396e-06, + "loss": 0.1964, + "step": 6978 + }, + { + "epoch": 0.5529015646662705, + "grad_norm": 1.4756424020295873, + "learning_rate": 8.775236670832146e-06, + "loss": 0.1957, + "step": 6979 + }, + { + "epoch": 0.5529807882749059, + "grad_norm": 1.6871241826511918, + "learning_rate": 8.772689997985674e-06, + "loss": 0.2115, + "step": 6980 + }, + { + "epoch": 0.5530600118835413, + "grad_norm": 1.774865193215159, + "learning_rate": 8.770143405951657e-06, + "loss": 0.2942, + "step": 6981 + }, + { + "epoch": 0.5531392354921767, + "grad_norm": 1.2867774873892495, + "learning_rate": 8.76759689489778e-06, + "loss": 0.1798, + "step": 6982 + }, + { + "epoch": 0.553218459100812, + "grad_norm": 1.5293140409656585, + "learning_rate": 8.765050464991716e-06, + "loss": 0.2086, + "step": 6983 + }, + { + "epoch": 0.5532976827094475, + "grad_norm": 1.5471963562459794, + "learning_rate": 8.762504116401137e-06, + "loss": 0.2097, + "step": 6984 + }, + { + "epoch": 0.5533769063180828, + "grad_norm": 1.6418277040399487, + "learning_rate": 8.759957849293707e-06, + "loss": 0.1733, + "step": 6985 + }, + { + "epoch": 0.5534561299267181, + "grad_norm": 1.9486942542674106, + "learning_rate": 8.75741166383709e-06, + "loss": 0.2954, + "step": 6986 + }, + { + "epoch": 0.5535353535353535, + "grad_norm": 1.6323040056592095, + "learning_rate": 8.754865560198932e-06, + "loss": 0.202, + "step": 6987 + }, + { + "epoch": 0.5536145771439889, + "grad_norm": 1.632650043921622, + "learning_rate": 8.752319538546888e-06, + "loss": 0.2318, + "step": 6988 + }, + { + "epoch": 0.5536938007526243, + "grad_norm": 1.9049238118255067, + "learning_rate": 8.749773599048597e-06, + "loss": 0.3178, + "step": 6989 + }, + { + "epoch": 0.5537730243612596, + "grad_norm": 1.2387705762804189, + "learning_rate": 8.747227741871698e-06, + "loss": 0.1208, + "step": 6990 + }, + { + "epoch": 0.5538522479698951, + "grad_norm": 1.4827469680734027, + "learning_rate": 8.744681967183826e-06, + "loss": 0.2283, + "step": 6991 + }, + { + "epoch": 0.5539314715785304, + "grad_norm": 2.5238770672702913, + "learning_rate": 8.742136275152606e-06, + "loss": 0.2698, + "step": 6992 + }, + { + "epoch": 0.5540106951871657, + "grad_norm": 1.458531118939311, + "learning_rate": 8.73959066594566e-06, + "loss": 0.2072, + "step": 6993 + }, + { + "epoch": 0.5540899187958012, + "grad_norm": 1.4685900241815824, + "learning_rate": 8.737045139730605e-06, + "loss": 0.2103, + "step": 6994 + }, + { + "epoch": 0.5541691424044365, + "grad_norm": 1.088024988992133, + "learning_rate": 8.734499696675048e-06, + "loss": 0.1675, + "step": 6995 + }, + { + "epoch": 0.5542483660130719, + "grad_norm": 1.454133301750155, + "learning_rate": 8.731954336946599e-06, + "loss": 0.1447, + "step": 6996 + }, + { + "epoch": 0.5543275896217073, + "grad_norm": 1.6261155433945458, + "learning_rate": 8.729409060712855e-06, + "loss": 0.2502, + "step": 6997 + }, + { + "epoch": 0.5544068132303427, + "grad_norm": 1.5350964635663378, + "learning_rate": 8.726863868141408e-06, + "loss": 0.1963, + "step": 6998 + }, + { + "epoch": 0.554486036838978, + "grad_norm": 1.6261410328562709, + "learning_rate": 8.724318759399853e-06, + "loss": 0.2401, + "step": 6999 + }, + { + "epoch": 0.5545652604476133, + "grad_norm": 1.6105373044716385, + "learning_rate": 8.721773734655768e-06, + "loss": 0.1873, + "step": 7000 + }, + { + "epoch": 0.5546444840562488, + "grad_norm": 1.5434667181935557, + "learning_rate": 8.719228794076733e-06, + "loss": 0.1313, + "step": 7001 + }, + { + "epoch": 0.5547237076648841, + "grad_norm": 1.4115933195861863, + "learning_rate": 8.716683937830318e-06, + "loss": 0.208, + "step": 7002 + }, + { + "epoch": 0.5548029312735195, + "grad_norm": 1.7185539381379835, + "learning_rate": 8.71413916608409e-06, + "loss": 0.2173, + "step": 7003 + }, + { + "epoch": 0.5548821548821549, + "grad_norm": 1.7608151033548984, + "learning_rate": 8.711594479005614e-06, + "loss": 0.1918, + "step": 7004 + }, + { + "epoch": 0.5549613784907903, + "grad_norm": 1.545132372450632, + "learning_rate": 8.709049876762438e-06, + "loss": 0.1531, + "step": 7005 + }, + { + "epoch": 0.5550406020994256, + "grad_norm": 1.8616259398056916, + "learning_rate": 8.706505359522119e-06, + "loss": 0.2884, + "step": 7006 + }, + { + "epoch": 0.555119825708061, + "grad_norm": 1.599147292505235, + "learning_rate": 8.703960927452197e-06, + "loss": 0.1667, + "step": 7007 + }, + { + "epoch": 0.5551990493166964, + "grad_norm": 1.8157891353523499, + "learning_rate": 8.701416580720212e-06, + "loss": 0.3096, + "step": 7008 + }, + { + "epoch": 0.5552782729253317, + "grad_norm": 1.8618291776958673, + "learning_rate": 8.698872319493698e-06, + "loss": 0.2502, + "step": 7009 + }, + { + "epoch": 0.5553574965339672, + "grad_norm": 1.5154651043474558, + "learning_rate": 8.69632814394018e-06, + "loss": 0.1671, + "step": 7010 + }, + { + "epoch": 0.5554367201426025, + "grad_norm": 1.5029504510030596, + "learning_rate": 8.693784054227179e-06, + "loss": 0.2085, + "step": 7011 + }, + { + "epoch": 0.5555159437512379, + "grad_norm": 2.4916637363742713, + "learning_rate": 8.691240050522215e-06, + "loss": 0.3906, + "step": 7012 + }, + { + "epoch": 0.5555951673598732, + "grad_norm": 1.8632398798315006, + "learning_rate": 8.688696132992797e-06, + "loss": 0.251, + "step": 7013 + }, + { + "epoch": 0.5556743909685086, + "grad_norm": 1.668005116987391, + "learning_rate": 8.686152301806427e-06, + "loss": 0.299, + "step": 7014 + }, + { + "epoch": 0.555753614577144, + "grad_norm": 1.5763005914047759, + "learning_rate": 8.683608557130608e-06, + "loss": 0.1862, + "step": 7015 + }, + { + "epoch": 0.5558328381857793, + "grad_norm": 1.8785291200939969, + "learning_rate": 8.681064899132831e-06, + "loss": 0.2573, + "step": 7016 + }, + { + "epoch": 0.5559120617944148, + "grad_norm": 1.1427925614939018, + "learning_rate": 8.678521327980585e-06, + "loss": 0.1651, + "step": 7017 + }, + { + "epoch": 0.5559912854030501, + "grad_norm": 1.3265957407081017, + "learning_rate": 8.675977843841347e-06, + "loss": 0.1765, + "step": 7018 + }, + { + "epoch": 0.5560705090116855, + "grad_norm": 1.532299598003422, + "learning_rate": 8.673434446882601e-06, + "loss": 0.1854, + "step": 7019 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 1.36563762271964, + "learning_rate": 8.670891137271814e-06, + "loss": 0.1601, + "step": 7020 + }, + { + "epoch": 0.5562289562289562, + "grad_norm": 1.5720984646446576, + "learning_rate": 8.668347915176448e-06, + "loss": 0.1959, + "step": 7021 + }, + { + "epoch": 0.5563081798375916, + "grad_norm": 1.647567756585152, + "learning_rate": 8.665804780763963e-06, + "loss": 0.2458, + "step": 7022 + }, + { + "epoch": 0.556387403446227, + "grad_norm": 1.518473675760264, + "learning_rate": 8.663261734201818e-06, + "loss": 0.2018, + "step": 7023 + }, + { + "epoch": 0.5564666270548624, + "grad_norm": 1.6392680624778466, + "learning_rate": 8.660718775657453e-06, + "loss": 0.2453, + "step": 7024 + }, + { + "epoch": 0.5565458506634977, + "grad_norm": 1.626149944744027, + "learning_rate": 8.658175905298314e-06, + "loss": 0.225, + "step": 7025 + }, + { + "epoch": 0.5566250742721331, + "grad_norm": 1.6966603918627359, + "learning_rate": 8.655633123291833e-06, + "loss": 0.1817, + "step": 7026 + }, + { + "epoch": 0.5567042978807685, + "grad_norm": 1.5520737574007002, + "learning_rate": 8.653090429805442e-06, + "loss": 0.235, + "step": 7027 + }, + { + "epoch": 0.5567835214894038, + "grad_norm": 1.505457128164782, + "learning_rate": 8.650547825006568e-06, + "loss": 0.1618, + "step": 7028 + }, + { + "epoch": 0.5568627450980392, + "grad_norm": 1.425360727244507, + "learning_rate": 8.648005309062623e-06, + "loss": 0.2026, + "step": 7029 + }, + { + "epoch": 0.5569419687066746, + "grad_norm": 1.362019184205673, + "learning_rate": 8.645462882141026e-06, + "loss": 0.1895, + "step": 7030 + }, + { + "epoch": 0.55702119231531, + "grad_norm": 1.3744701799224086, + "learning_rate": 8.64292054440918e-06, + "loss": 0.1814, + "step": 7031 + }, + { + "epoch": 0.5571004159239453, + "grad_norm": 1.5469267017965995, + "learning_rate": 8.640378296034486e-06, + "loss": 0.1479, + "step": 7032 + }, + { + "epoch": 0.5571796395325808, + "grad_norm": 1.8600546026971057, + "learning_rate": 8.63783613718434e-06, + "loss": 0.288, + "step": 7033 + }, + { + "epoch": 0.5572588631412161, + "grad_norm": 1.634389970043728, + "learning_rate": 8.63529406802613e-06, + "loss": 0.2219, + "step": 7034 + }, + { + "epoch": 0.5573380867498514, + "grad_norm": 1.8232736260784956, + "learning_rate": 8.632752088727237e-06, + "loss": 0.1966, + "step": 7035 + }, + { + "epoch": 0.5574173103584869, + "grad_norm": 1.2814330212891232, + "learning_rate": 8.63021019945504e-06, + "loss": 0.1842, + "step": 7036 + }, + { + "epoch": 0.5574965339671222, + "grad_norm": 1.6824010340234732, + "learning_rate": 8.627668400376914e-06, + "loss": 0.1764, + "step": 7037 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 2.0168679427511496, + "learning_rate": 8.625126691660216e-06, + "loss": 0.2556, + "step": 7038 + }, + { + "epoch": 0.5576549811843929, + "grad_norm": 1.6774431748932268, + "learning_rate": 8.622585073472314e-06, + "loss": 0.213, + "step": 7039 + }, + { + "epoch": 0.5577342047930284, + "grad_norm": 1.9162775200372715, + "learning_rate": 8.620043545980554e-06, + "loss": 0.3046, + "step": 7040 + }, + { + "epoch": 0.5578134284016637, + "grad_norm": 2.0185677142736975, + "learning_rate": 8.61750210935229e-06, + "loss": 0.2078, + "step": 7041 + }, + { + "epoch": 0.557892652010299, + "grad_norm": 1.7387826411982605, + "learning_rate": 8.614960763754857e-06, + "loss": 0.2617, + "step": 7042 + }, + { + "epoch": 0.5579718756189345, + "grad_norm": 1.5293351639663046, + "learning_rate": 8.612419509355593e-06, + "loss": 0.1471, + "step": 7043 + }, + { + "epoch": 0.5580510992275698, + "grad_norm": 1.5035641910072244, + "learning_rate": 8.60987834632183e-06, + "loss": 0.2019, + "step": 7044 + }, + { + "epoch": 0.5581303228362052, + "grad_norm": 1.7479100345026106, + "learning_rate": 8.607337274820888e-06, + "loss": 0.2621, + "step": 7045 + }, + { + "epoch": 0.5582095464448406, + "grad_norm": 1.5016324035738737, + "learning_rate": 8.604796295020085e-06, + "loss": 0.2455, + "step": 7046 + }, + { + "epoch": 0.558288770053476, + "grad_norm": 1.8322583558460637, + "learning_rate": 8.602255407086736e-06, + "loss": 0.2273, + "step": 7047 + }, + { + "epoch": 0.5583679936621113, + "grad_norm": 2.0342300687960817, + "learning_rate": 8.599714611188141e-06, + "loss": 0.284, + "step": 7048 + }, + { + "epoch": 0.5584472172707466, + "grad_norm": 1.458983267594101, + "learning_rate": 8.5971739074916e-06, + "loss": 0.1706, + "step": 7049 + }, + { + "epoch": 0.5585264408793821, + "grad_norm": 1.7067220816106976, + "learning_rate": 8.594633296164409e-06, + "loss": 0.2222, + "step": 7050 + }, + { + "epoch": 0.5586056644880174, + "grad_norm": 1.5358251518042219, + "learning_rate": 8.59209277737385e-06, + "loss": 0.1735, + "step": 7051 + }, + { + "epoch": 0.5586848880966528, + "grad_norm": 1.726884728030867, + "learning_rate": 8.58955235128721e-06, + "loss": 0.2544, + "step": 7052 + }, + { + "epoch": 0.5587641117052882, + "grad_norm": 1.8539779357253885, + "learning_rate": 8.58701201807176e-06, + "loss": 0.2531, + "step": 7053 + }, + { + "epoch": 0.5588433353139235, + "grad_norm": 1.7504385909576572, + "learning_rate": 8.584471777894768e-06, + "loss": 0.1592, + "step": 7054 + }, + { + "epoch": 0.5589225589225589, + "grad_norm": 1.5961772254361881, + "learning_rate": 8.581931630923499e-06, + "loss": 0.175, + "step": 7055 + }, + { + "epoch": 0.5590017825311943, + "grad_norm": 1.6783653790384048, + "learning_rate": 8.57939157732521e-06, + "loss": 0.1604, + "step": 7056 + }, + { + "epoch": 0.5590810061398297, + "grad_norm": 2.2506319325283575, + "learning_rate": 8.576851617267151e-06, + "loss": 0.2192, + "step": 7057 + }, + { + "epoch": 0.559160229748465, + "grad_norm": 1.5135739549219434, + "learning_rate": 8.574311750916565e-06, + "loss": 0.1792, + "step": 7058 + }, + { + "epoch": 0.5592394533571005, + "grad_norm": 1.5576204767986435, + "learning_rate": 8.571771978440689e-06, + "loss": 0.3164, + "step": 7059 + }, + { + "epoch": 0.5593186769657358, + "grad_norm": 1.5030744712196413, + "learning_rate": 8.569232300006756e-06, + "loss": 0.1651, + "step": 7060 + }, + { + "epoch": 0.5593979005743711, + "grad_norm": 1.6299736578697477, + "learning_rate": 8.566692715781992e-06, + "loss": 0.1602, + "step": 7061 + }, + { + "epoch": 0.5594771241830065, + "grad_norm": 1.6943766912092884, + "learning_rate": 8.564153225933616e-06, + "loss": 0.2802, + "step": 7062 + }, + { + "epoch": 0.5595563477916419, + "grad_norm": 1.279686639516914, + "learning_rate": 8.56161383062884e-06, + "loss": 0.188, + "step": 7063 + }, + { + "epoch": 0.5596355714002773, + "grad_norm": 1.5576386856168152, + "learning_rate": 8.559074530034875e-06, + "loss": 0.1752, + "step": 7064 + }, + { + "epoch": 0.5597147950089126, + "grad_norm": 1.2931759137893757, + "learning_rate": 8.556535324318916e-06, + "loss": 0.1506, + "step": 7065 + }, + { + "epoch": 0.5597940186175481, + "grad_norm": 1.5379778082888391, + "learning_rate": 8.553996213648164e-06, + "loss": 0.1985, + "step": 7066 + }, + { + "epoch": 0.5598732422261834, + "grad_norm": 1.4897685073988978, + "learning_rate": 8.551457198189799e-06, + "loss": 0.1567, + "step": 7067 + }, + { + "epoch": 0.5599524658348187, + "grad_norm": 1.2151206521891214, + "learning_rate": 8.54891827811101e-06, + "loss": 0.0967, + "step": 7068 + }, + { + "epoch": 0.5600316894434542, + "grad_norm": 1.89789025465256, + "learning_rate": 8.546379453578972e-06, + "loss": 0.211, + "step": 7069 + }, + { + "epoch": 0.5601109130520895, + "grad_norm": 2.259174037915149, + "learning_rate": 8.543840724760848e-06, + "loss": 0.2666, + "step": 7070 + }, + { + "epoch": 0.5601901366607249, + "grad_norm": 1.393532075492857, + "learning_rate": 8.541302091823809e-06, + "loss": 0.1897, + "step": 7071 + }, + { + "epoch": 0.5602693602693603, + "grad_norm": 1.7319833635416846, + "learning_rate": 8.538763554935008e-06, + "loss": 0.1904, + "step": 7072 + }, + { + "epoch": 0.5603485838779957, + "grad_norm": 1.3014958372636867, + "learning_rate": 8.536225114261597e-06, + "loss": 0.1715, + "step": 7073 + }, + { + "epoch": 0.560427807486631, + "grad_norm": 1.7671573332060513, + "learning_rate": 8.533686769970717e-06, + "loss": 0.2346, + "step": 7074 + }, + { + "epoch": 0.5605070310952663, + "grad_norm": 1.5706183684872626, + "learning_rate": 8.531148522229509e-06, + "loss": 0.1548, + "step": 7075 + }, + { + "epoch": 0.5605862547039018, + "grad_norm": 1.7312126931837069, + "learning_rate": 8.528610371205102e-06, + "loss": 0.2395, + "step": 7076 + }, + { + "epoch": 0.5606654783125371, + "grad_norm": 2.362450875890349, + "learning_rate": 8.526072317064623e-06, + "loss": 0.289, + "step": 7077 + }, + { + "epoch": 0.5607447019211725, + "grad_norm": 1.6111896914971335, + "learning_rate": 8.52353435997519e-06, + "loss": 0.2691, + "step": 7078 + }, + { + "epoch": 0.5608239255298079, + "grad_norm": 1.6134377068100971, + "learning_rate": 8.520996500103915e-06, + "loss": 0.1893, + "step": 7079 + }, + { + "epoch": 0.5609031491384433, + "grad_norm": 1.9219918345241058, + "learning_rate": 8.518458737617903e-06, + "loss": 0.2737, + "step": 7080 + }, + { + "epoch": 0.5609823727470786, + "grad_norm": 1.538529187539912, + "learning_rate": 8.515921072684255e-06, + "loss": 0.1759, + "step": 7081 + }, + { + "epoch": 0.561061596355714, + "grad_norm": 1.4713101838348777, + "learning_rate": 8.513383505470065e-06, + "loss": 0.2206, + "step": 7082 + }, + { + "epoch": 0.5611408199643494, + "grad_norm": 1.7520911411183, + "learning_rate": 8.510846036142415e-06, + "loss": 0.3054, + "step": 7083 + }, + { + "epoch": 0.5612200435729847, + "grad_norm": 1.1626877692646231, + "learning_rate": 8.50830866486839e-06, + "loss": 0.1594, + "step": 7084 + }, + { + "epoch": 0.5612992671816202, + "grad_norm": 1.5193398768990873, + "learning_rate": 8.505771391815061e-06, + "loss": 0.2715, + "step": 7085 + }, + { + "epoch": 0.5613784907902555, + "grad_norm": 1.5537944908146006, + "learning_rate": 8.503234217149496e-06, + "loss": 0.2298, + "step": 7086 + }, + { + "epoch": 0.5614577143988909, + "grad_norm": 1.7183227787774709, + "learning_rate": 8.500697141038758e-06, + "loss": 0.2214, + "step": 7087 + }, + { + "epoch": 0.5615369380075262, + "grad_norm": 1.3589841961446756, + "learning_rate": 8.498160163649896e-06, + "loss": 0.1803, + "step": 7088 + }, + { + "epoch": 0.5616161616161616, + "grad_norm": 1.3748286419187215, + "learning_rate": 8.495623285149962e-06, + "loss": 0.2183, + "step": 7089 + }, + { + "epoch": 0.561695385224797, + "grad_norm": 1.3126464107669036, + "learning_rate": 8.493086505705998e-06, + "loss": 0.1808, + "step": 7090 + }, + { + "epoch": 0.5617746088334323, + "grad_norm": 1.479596500850297, + "learning_rate": 8.490549825485036e-06, + "loss": 0.262, + "step": 7091 + }, + { + "epoch": 0.5618538324420678, + "grad_norm": 1.6719535508226904, + "learning_rate": 8.488013244654103e-06, + "loss": 0.1625, + "step": 7092 + }, + { + "epoch": 0.5619330560507031, + "grad_norm": 1.6234137226584349, + "learning_rate": 8.485476763380224e-06, + "loss": 0.245, + "step": 7093 + }, + { + "epoch": 0.5620122796593385, + "grad_norm": 1.562111667493653, + "learning_rate": 8.482940381830412e-06, + "loss": 0.1742, + "step": 7094 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 1.6377421872221636, + "learning_rate": 8.480404100171677e-06, + "loss": 0.2491, + "step": 7095 + }, + { + "epoch": 0.5621707268766092, + "grad_norm": 1.3929033722375574, + "learning_rate": 8.47786791857102e-06, + "loss": 0.208, + "step": 7096 + }, + { + "epoch": 0.5622499504852446, + "grad_norm": 1.5094049320700702, + "learning_rate": 8.475331837195435e-06, + "loss": 0.2318, + "step": 7097 + }, + { + "epoch": 0.56232917409388, + "grad_norm": 1.9417872528439555, + "learning_rate": 8.472795856211916e-06, + "loss": 0.2516, + "step": 7098 + }, + { + "epoch": 0.5624083977025154, + "grad_norm": 1.508063606285105, + "learning_rate": 8.470259975787438e-06, + "loss": 0.2251, + "step": 7099 + }, + { + "epoch": 0.5624876213111507, + "grad_norm": 1.7031937074744565, + "learning_rate": 8.46772419608898e-06, + "loss": 0.1813, + "step": 7100 + }, + { + "epoch": 0.5625668449197861, + "grad_norm": 2.1771050434669417, + "learning_rate": 8.465188517283514e-06, + "loss": 0.2041, + "step": 7101 + }, + { + "epoch": 0.5626460685284215, + "grad_norm": 1.7235678854995575, + "learning_rate": 8.462652939537996e-06, + "loss": 0.2245, + "step": 7102 + }, + { + "epoch": 0.5627252921370568, + "grad_norm": 2.0493938854449705, + "learning_rate": 8.460117463019387e-06, + "loss": 0.1878, + "step": 7103 + }, + { + "epoch": 0.5628045157456922, + "grad_norm": 1.454055133373079, + "learning_rate": 8.457582087894631e-06, + "loss": 0.1699, + "step": 7104 + }, + { + "epoch": 0.5628837393543276, + "grad_norm": 1.7948045167220608, + "learning_rate": 8.455046814330674e-06, + "loss": 0.2897, + "step": 7105 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 1.488977244845906, + "learning_rate": 8.452511642494453e-06, + "loss": 0.2535, + "step": 7106 + }, + { + "epoch": 0.5630421865715983, + "grad_norm": 1.7550587822166357, + "learning_rate": 8.449976572552891e-06, + "loss": 0.3068, + "step": 7107 + }, + { + "epoch": 0.5631214101802338, + "grad_norm": 1.1724516498937883, + "learning_rate": 8.447441604672913e-06, + "loss": 0.1294, + "step": 7108 + }, + { + "epoch": 0.5632006337888691, + "grad_norm": 1.35873742081911, + "learning_rate": 8.444906739021438e-06, + "loss": 0.1959, + "step": 7109 + }, + { + "epoch": 0.5632798573975044, + "grad_norm": 1.849551135944485, + "learning_rate": 8.442371975765368e-06, + "loss": 0.1839, + "step": 7110 + }, + { + "epoch": 0.5633590810061399, + "grad_norm": 1.7115123747383312, + "learning_rate": 8.439837315071612e-06, + "loss": 0.238, + "step": 7111 + }, + { + "epoch": 0.5634383046147752, + "grad_norm": 1.5275704121290707, + "learning_rate": 8.43730275710706e-06, + "loss": 0.259, + "step": 7112 + }, + { + "epoch": 0.5635175282234106, + "grad_norm": 1.2485642708862594, + "learning_rate": 8.434768302038602e-06, + "loss": 0.1226, + "step": 7113 + }, + { + "epoch": 0.5635967518320459, + "grad_norm": 1.6221255556295489, + "learning_rate": 8.432233950033122e-06, + "loss": 0.2209, + "step": 7114 + }, + { + "epoch": 0.5636759754406814, + "grad_norm": 1.8749249896524045, + "learning_rate": 8.42969970125749e-06, + "loss": 0.2987, + "step": 7115 + }, + { + "epoch": 0.5637551990493167, + "grad_norm": 1.358266941421217, + "learning_rate": 8.427165555878577e-06, + "loss": 0.1657, + "step": 7116 + }, + { + "epoch": 0.563834422657952, + "grad_norm": 1.6746509953691089, + "learning_rate": 8.424631514063247e-06, + "loss": 0.1594, + "step": 7117 + }, + { + "epoch": 0.5639136462665875, + "grad_norm": 1.8553999194635002, + "learning_rate": 8.422097575978349e-06, + "loss": 0.2838, + "step": 7118 + }, + { + "epoch": 0.5639928698752228, + "grad_norm": 1.1729592551162313, + "learning_rate": 8.419563741790735e-06, + "loss": 0.0975, + "step": 7119 + }, + { + "epoch": 0.5640720934838582, + "grad_norm": 1.469682119112714, + "learning_rate": 8.417030011667241e-06, + "loss": 0.1765, + "step": 7120 + }, + { + "epoch": 0.5641513170924936, + "grad_norm": 1.9009284070085266, + "learning_rate": 8.414496385774706e-06, + "loss": 0.3081, + "step": 7121 + }, + { + "epoch": 0.564230540701129, + "grad_norm": 1.2972915281593267, + "learning_rate": 8.411962864279957e-06, + "loss": 0.1161, + "step": 7122 + }, + { + "epoch": 0.5643097643097643, + "grad_norm": 1.421175009953936, + "learning_rate": 8.409429447349811e-06, + "loss": 0.1959, + "step": 7123 + }, + { + "epoch": 0.5643889879183996, + "grad_norm": 1.5795193011973268, + "learning_rate": 8.406896135151081e-06, + "loss": 0.2066, + "step": 7124 + }, + { + "epoch": 0.5644682115270351, + "grad_norm": 1.7249032655909373, + "learning_rate": 8.40436292785058e-06, + "loss": 0.1953, + "step": 7125 + }, + { + "epoch": 0.5645474351356704, + "grad_norm": 1.6477759025681094, + "learning_rate": 8.401829825615098e-06, + "loss": 0.2457, + "step": 7126 + }, + { + "epoch": 0.5646266587443058, + "grad_norm": 1.703867785688905, + "learning_rate": 8.399296828611433e-06, + "loss": 0.1727, + "step": 7127 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 1.4537347447742368, + "learning_rate": 8.396763937006369e-06, + "loss": 0.1968, + "step": 7128 + }, + { + "epoch": 0.5647851059615765, + "grad_norm": 1.7044491792562788, + "learning_rate": 8.394231150966685e-06, + "loss": 0.2567, + "step": 7129 + }, + { + "epoch": 0.5648643295702119, + "grad_norm": 1.7494152612326808, + "learning_rate": 8.391698470659154e-06, + "loss": 0.239, + "step": 7130 + }, + { + "epoch": 0.5649435531788473, + "grad_norm": 1.7824215198518916, + "learning_rate": 8.38916589625054e-06, + "loss": 0.1976, + "step": 7131 + }, + { + "epoch": 0.5650227767874827, + "grad_norm": 1.6691651293977405, + "learning_rate": 8.3866334279076e-06, + "loss": 0.2897, + "step": 7132 + }, + { + "epoch": 0.565102000396118, + "grad_norm": 1.6804170572525348, + "learning_rate": 8.384101065797087e-06, + "loss": 0.2608, + "step": 7133 + }, + { + "epoch": 0.5651812240047535, + "grad_norm": 1.5717283029091211, + "learning_rate": 8.381568810085745e-06, + "loss": 0.1569, + "step": 7134 + }, + { + "epoch": 0.5652604476133888, + "grad_norm": 1.8698896197112826, + "learning_rate": 8.379036660940306e-06, + "loss": 0.2896, + "step": 7135 + }, + { + "epoch": 0.5653396712220241, + "grad_norm": 1.359905107395929, + "learning_rate": 8.376504618527505e-06, + "loss": 0.1529, + "step": 7136 + }, + { + "epoch": 0.5654188948306595, + "grad_norm": 2.1172315685875507, + "learning_rate": 8.373972683014063e-06, + "loss": 0.2671, + "step": 7137 + }, + { + "epoch": 0.5654981184392949, + "grad_norm": 1.8685949512210573, + "learning_rate": 8.371440854566696e-06, + "loss": 0.1816, + "step": 7138 + }, + { + "epoch": 0.5655773420479303, + "grad_norm": 1.4443374683745343, + "learning_rate": 8.368909133352114e-06, + "loss": 0.2118, + "step": 7139 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 1.8430231806352138, + "learning_rate": 8.366377519537015e-06, + "loss": 0.2176, + "step": 7140 + }, + { + "epoch": 0.5657357892652011, + "grad_norm": 1.448194028148172, + "learning_rate": 8.363846013288096e-06, + "loss": 0.1763, + "step": 7141 + }, + { + "epoch": 0.5658150128738364, + "grad_norm": 1.5906568744506333, + "learning_rate": 8.361314614772047e-06, + "loss": 0.2464, + "step": 7142 + }, + { + "epoch": 0.5658942364824717, + "grad_norm": 1.586304249770699, + "learning_rate": 8.358783324155542e-06, + "loss": 0.1753, + "step": 7143 + }, + { + "epoch": 0.5659734600911072, + "grad_norm": 1.5927055966297055, + "learning_rate": 8.35625214160526e-06, + "loss": 0.202, + "step": 7144 + }, + { + "epoch": 0.5660526836997425, + "grad_norm": 1.8085719363079078, + "learning_rate": 8.353721067287865e-06, + "loss": 0.2004, + "step": 7145 + }, + { + "epoch": 0.5661319073083779, + "grad_norm": 2.121741541407398, + "learning_rate": 8.351190101370016e-06, + "loss": 0.1954, + "step": 7146 + }, + { + "epoch": 0.5662111309170133, + "grad_norm": 1.7456032842756144, + "learning_rate": 8.348659244018367e-06, + "loss": 0.2393, + "step": 7147 + }, + { + "epoch": 0.5662903545256487, + "grad_norm": 2.033597040511318, + "learning_rate": 8.34612849539956e-06, + "loss": 0.2559, + "step": 7148 + }, + { + "epoch": 0.566369578134284, + "grad_norm": 1.6191205875804606, + "learning_rate": 8.343597855680231e-06, + "loss": 0.186, + "step": 7149 + }, + { + "epoch": 0.5664488017429193, + "grad_norm": 1.285906488415901, + "learning_rate": 8.341067325027017e-06, + "loss": 0.1488, + "step": 7150 + }, + { + "epoch": 0.5665280253515548, + "grad_norm": 1.8129363283333348, + "learning_rate": 8.338536903606535e-06, + "loss": 0.1845, + "step": 7151 + }, + { + "epoch": 0.5666072489601901, + "grad_norm": 1.4764238521334778, + "learning_rate": 8.336006591585406e-06, + "loss": 0.1514, + "step": 7152 + }, + { + "epoch": 0.5666864725688255, + "grad_norm": 1.5697217819577614, + "learning_rate": 8.333476389130234e-06, + "loss": 0.2412, + "step": 7153 + }, + { + "epoch": 0.5667656961774609, + "grad_norm": 1.3757906492537588, + "learning_rate": 8.330946296407622e-06, + "loss": 0.17, + "step": 7154 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 1.417879707074937, + "learning_rate": 8.328416313584169e-06, + "loss": 0.1365, + "step": 7155 + }, + { + "epoch": 0.5669241433947316, + "grad_norm": 1.3279243842852761, + "learning_rate": 8.325886440826457e-06, + "loss": 0.1422, + "step": 7156 + }, + { + "epoch": 0.567003367003367, + "grad_norm": 1.5803829358564379, + "learning_rate": 8.323356678301067e-06, + "loss": 0.164, + "step": 7157 + }, + { + "epoch": 0.5670825906120024, + "grad_norm": 1.926349933651766, + "learning_rate": 8.320827026174572e-06, + "loss": 0.2229, + "step": 7158 + }, + { + "epoch": 0.5671618142206377, + "grad_norm": 1.8016315606923126, + "learning_rate": 8.318297484613538e-06, + "loss": 0.2279, + "step": 7159 + }, + { + "epoch": 0.5672410378292732, + "grad_norm": 1.8206023360645793, + "learning_rate": 8.315768053784524e-06, + "loss": 0.2158, + "step": 7160 + }, + { + "epoch": 0.5673202614379085, + "grad_norm": 1.8935170887687283, + "learning_rate": 8.313238733854076e-06, + "loss": 0.2595, + "step": 7161 + }, + { + "epoch": 0.5673994850465439, + "grad_norm": 1.6527108055516293, + "learning_rate": 8.310709524988743e-06, + "loss": 0.1807, + "step": 7162 + }, + { + "epoch": 0.5674787086551792, + "grad_norm": 2.237203453728549, + "learning_rate": 8.308180427355062e-06, + "loss": 0.3659, + "step": 7163 + }, + { + "epoch": 0.5675579322638146, + "grad_norm": 1.9876975880569285, + "learning_rate": 8.305651441119558e-06, + "loss": 0.1707, + "step": 7164 + }, + { + "epoch": 0.56763715587245, + "grad_norm": 2.0343381389879127, + "learning_rate": 8.303122566448754e-06, + "loss": 0.1755, + "step": 7165 + }, + { + "epoch": 0.5677163794810853, + "grad_norm": 1.6880433066022393, + "learning_rate": 8.300593803509163e-06, + "loss": 0.2335, + "step": 7166 + }, + { + "epoch": 0.5677956030897208, + "grad_norm": 1.799216347148959, + "learning_rate": 8.298065152467293e-06, + "loss": 0.2435, + "step": 7167 + }, + { + "epoch": 0.5678748266983561, + "grad_norm": 1.4092505555146648, + "learning_rate": 8.295536613489645e-06, + "loss": 0.192, + "step": 7168 + }, + { + "epoch": 0.5679540503069915, + "grad_norm": 1.567213366180453, + "learning_rate": 8.293008186742708e-06, + "loss": 0.2056, + "step": 7169 + }, + { + "epoch": 0.5680332739156269, + "grad_norm": 1.50515292093294, + "learning_rate": 8.290479872392969e-06, + "loss": 0.1641, + "step": 7170 + }, + { + "epoch": 0.5681124975242622, + "grad_norm": 1.4631926228585892, + "learning_rate": 8.287951670606905e-06, + "loss": 0.1985, + "step": 7171 + }, + { + "epoch": 0.5681917211328976, + "grad_norm": 1.5906471413197742, + "learning_rate": 8.285423581550985e-06, + "loss": 0.2045, + "step": 7172 + }, + { + "epoch": 0.568270944741533, + "grad_norm": 1.481088353499964, + "learning_rate": 8.282895605391674e-06, + "loss": 0.1759, + "step": 7173 + }, + { + "epoch": 0.5683501683501684, + "grad_norm": 1.6193342793107977, + "learning_rate": 8.280367742295424e-06, + "loss": 0.2213, + "step": 7174 + }, + { + "epoch": 0.5684293919588037, + "grad_norm": 1.921044475193414, + "learning_rate": 8.277839992428683e-06, + "loss": 0.2878, + "step": 7175 + }, + { + "epoch": 0.5685086155674391, + "grad_norm": 1.5584366648187773, + "learning_rate": 8.275312355957893e-06, + "loss": 0.21, + "step": 7176 + }, + { + "epoch": 0.5685878391760745, + "grad_norm": 1.5789958123311176, + "learning_rate": 8.272784833049485e-06, + "loss": 0.2029, + "step": 7177 + }, + { + "epoch": 0.5686670627847098, + "grad_norm": 1.637634148940145, + "learning_rate": 8.270257423869885e-06, + "loss": 0.2191, + "step": 7178 + }, + { + "epoch": 0.5687462863933452, + "grad_norm": 1.3934641068793392, + "learning_rate": 8.267730128585511e-06, + "loss": 0.1662, + "step": 7179 + }, + { + "epoch": 0.5688255100019806, + "grad_norm": 2.1443933550161853, + "learning_rate": 8.265202947362772e-06, + "loss": 0.3144, + "step": 7180 + }, + { + "epoch": 0.568904733610616, + "grad_norm": 1.7775579931099792, + "learning_rate": 8.262675880368074e-06, + "loss": 0.226, + "step": 7181 + }, + { + "epoch": 0.5689839572192513, + "grad_norm": 1.7075282905209899, + "learning_rate": 8.260148927767807e-06, + "loss": 0.2915, + "step": 7182 + }, + { + "epoch": 0.5690631808278868, + "grad_norm": 2.0249950980817255, + "learning_rate": 8.257622089728362e-06, + "loss": 0.3307, + "step": 7183 + }, + { + "epoch": 0.5691424044365221, + "grad_norm": 1.7824973409860467, + "learning_rate": 8.255095366416122e-06, + "loss": 0.2789, + "step": 7184 + }, + { + "epoch": 0.5692216280451574, + "grad_norm": 1.6721339940478186, + "learning_rate": 8.25256875799745e-06, + "loss": 0.1847, + "step": 7185 + }, + { + "epoch": 0.5693008516537928, + "grad_norm": 1.5233904389011332, + "learning_rate": 8.250042264638721e-06, + "loss": 0.2198, + "step": 7186 + }, + { + "epoch": 0.5693800752624282, + "grad_norm": 1.5901738527888205, + "learning_rate": 8.24751588650629e-06, + "loss": 0.2453, + "step": 7187 + }, + { + "epoch": 0.5694592988710636, + "grad_norm": 1.4224255506150771, + "learning_rate": 8.244989623766502e-06, + "loss": 0.2052, + "step": 7188 + }, + { + "epoch": 0.5695385224796989, + "grad_norm": 1.7414226383896743, + "learning_rate": 8.242463476585707e-06, + "loss": 0.2237, + "step": 7189 + }, + { + "epoch": 0.5696177460883344, + "grad_norm": 1.713689844712856, + "learning_rate": 8.239937445130232e-06, + "loss": 0.203, + "step": 7190 + }, + { + "epoch": 0.5696969696969697, + "grad_norm": 1.6590205727067242, + "learning_rate": 8.237411529566407e-06, + "loss": 0.2355, + "step": 7191 + }, + { + "epoch": 0.569776193305605, + "grad_norm": 1.8908245667395156, + "learning_rate": 8.234885730060554e-06, + "loss": 0.2464, + "step": 7192 + }, + { + "epoch": 0.5698554169142405, + "grad_norm": 1.6626577760630519, + "learning_rate": 8.232360046778982e-06, + "loss": 0.2008, + "step": 7193 + }, + { + "epoch": 0.5699346405228758, + "grad_norm": 1.525945061799335, + "learning_rate": 8.229834479887992e-06, + "loss": 0.1848, + "step": 7194 + }, + { + "epoch": 0.5700138641315112, + "grad_norm": 1.2094967268299146, + "learning_rate": 8.227309029553889e-06, + "loss": 0.1132, + "step": 7195 + }, + { + "epoch": 0.5700930877401466, + "grad_norm": 1.8439241349566875, + "learning_rate": 8.224783695942954e-06, + "loss": 0.1555, + "step": 7196 + }, + { + "epoch": 0.570172311348782, + "grad_norm": 1.7294648874207226, + "learning_rate": 8.222258479221473e-06, + "loss": 0.218, + "step": 7197 + }, + { + "epoch": 0.5702515349574173, + "grad_norm": 1.7039752622642823, + "learning_rate": 8.219733379555715e-06, + "loss": 0.1981, + "step": 7198 + }, + { + "epoch": 0.5703307585660526, + "grad_norm": 1.4493054380279036, + "learning_rate": 8.217208397111948e-06, + "loss": 0.1712, + "step": 7199 + }, + { + "epoch": 0.5704099821746881, + "grad_norm": 1.6909015430438719, + "learning_rate": 8.21468353205643e-06, + "loss": 0.2229, + "step": 7200 + }, + { + "epoch": 0.5704892057833234, + "grad_norm": 1.692378896432889, + "learning_rate": 8.212158784555412e-06, + "loss": 0.27, + "step": 7201 + }, + { + "epoch": 0.5705684293919588, + "grad_norm": 1.537232110106939, + "learning_rate": 8.209634154775134e-06, + "loss": 0.1495, + "step": 7202 + }, + { + "epoch": 0.5706476530005942, + "grad_norm": 1.7115009443493503, + "learning_rate": 8.207109642881836e-06, + "loss": 0.2137, + "step": 7203 + }, + { + "epoch": 0.5707268766092296, + "grad_norm": 1.7248471314916907, + "learning_rate": 8.20458524904174e-06, + "loss": 0.2512, + "step": 7204 + }, + { + "epoch": 0.5708061002178649, + "grad_norm": 1.5700271970662605, + "learning_rate": 8.202060973421064e-06, + "loss": 0.1893, + "step": 7205 + }, + { + "epoch": 0.5708853238265003, + "grad_norm": 1.846018527733847, + "learning_rate": 8.199536816186025e-06, + "loss": 0.2002, + "step": 7206 + }, + { + "epoch": 0.5709645474351357, + "grad_norm": 1.785997674820738, + "learning_rate": 8.197012777502819e-06, + "loss": 0.2979, + "step": 7207 + }, + { + "epoch": 0.571043771043771, + "grad_norm": 1.466090700295504, + "learning_rate": 8.194488857537646e-06, + "loss": 0.1688, + "step": 7208 + }, + { + "epoch": 0.5711229946524065, + "grad_norm": 1.6848150936657436, + "learning_rate": 8.191965056456699e-06, + "loss": 0.2262, + "step": 7209 + }, + { + "epoch": 0.5712022182610418, + "grad_norm": 1.6258597244150454, + "learning_rate": 8.18944137442615e-06, + "loss": 0.2622, + "step": 7210 + }, + { + "epoch": 0.5712814418696771, + "grad_norm": 2.0162694466437845, + "learning_rate": 8.186917811612173e-06, + "loss": 0.3038, + "step": 7211 + }, + { + "epoch": 0.5713606654783125, + "grad_norm": 1.4975929344426002, + "learning_rate": 8.184394368180937e-06, + "loss": 0.1557, + "step": 7212 + }, + { + "epoch": 0.5714398890869479, + "grad_norm": 1.3128093497968392, + "learning_rate": 8.181871044298594e-06, + "loss": 0.1773, + "step": 7213 + }, + { + "epoch": 0.5715191126955833, + "grad_norm": 1.3339295785788323, + "learning_rate": 8.179347840131297e-06, + "loss": 0.1427, + "step": 7214 + }, + { + "epoch": 0.5715983363042186, + "grad_norm": 1.704253023293362, + "learning_rate": 8.176824755845183e-06, + "loss": 0.2402, + "step": 7215 + }, + { + "epoch": 0.5716775599128541, + "grad_norm": 1.5998406704208354, + "learning_rate": 8.174301791606384e-06, + "loss": 0.2053, + "step": 7216 + }, + { + "epoch": 0.5717567835214894, + "grad_norm": 1.579364527988466, + "learning_rate": 8.171778947581032e-06, + "loss": 0.2245, + "step": 7217 + }, + { + "epoch": 0.5718360071301247, + "grad_norm": 1.5894443403673484, + "learning_rate": 8.169256223935236e-06, + "loss": 0.2011, + "step": 7218 + }, + { + "epoch": 0.5719152307387602, + "grad_norm": 1.6457889995542956, + "learning_rate": 8.166733620835107e-06, + "loss": 0.2605, + "step": 7219 + }, + { + "epoch": 0.5719944543473955, + "grad_norm": 1.4879413333941212, + "learning_rate": 8.164211138446753e-06, + "loss": 0.2863, + "step": 7220 + }, + { + "epoch": 0.5720736779560309, + "grad_norm": 1.620795512615661, + "learning_rate": 8.161688776936259e-06, + "loss": 0.2255, + "step": 7221 + }, + { + "epoch": 0.5721529015646662, + "grad_norm": 1.5600584576401233, + "learning_rate": 8.159166536469717e-06, + "loss": 0.2198, + "step": 7222 + }, + { + "epoch": 0.5722321251733017, + "grad_norm": 1.6844428399419666, + "learning_rate": 8.156644417213196e-06, + "loss": 0.2155, + "step": 7223 + }, + { + "epoch": 0.572311348781937, + "grad_norm": 1.1161114589068044, + "learning_rate": 8.154122419332772e-06, + "loss": 0.1417, + "step": 7224 + }, + { + "epoch": 0.5723905723905723, + "grad_norm": 1.5656457317724726, + "learning_rate": 8.151600542994506e-06, + "loss": 0.2177, + "step": 7225 + }, + { + "epoch": 0.5724697959992078, + "grad_norm": 1.5475943800810086, + "learning_rate": 8.149078788364451e-06, + "loss": 0.188, + "step": 7226 + }, + { + "epoch": 0.5725490196078431, + "grad_norm": 1.854962748245374, + "learning_rate": 8.14655715560865e-06, + "loss": 0.1981, + "step": 7227 + }, + { + "epoch": 0.5726282432164785, + "grad_norm": 1.5940628017538636, + "learning_rate": 8.144035644893143e-06, + "loss": 0.2541, + "step": 7228 + }, + { + "epoch": 0.5727074668251139, + "grad_norm": 1.2999784417169014, + "learning_rate": 8.141514256383957e-06, + "loss": 0.1764, + "step": 7229 + }, + { + "epoch": 0.5727866904337493, + "grad_norm": 1.381332867582748, + "learning_rate": 8.138992990247119e-06, + "loss": 0.1961, + "step": 7230 + }, + { + "epoch": 0.5728659140423846, + "grad_norm": 1.697555672404761, + "learning_rate": 8.136471846648633e-06, + "loss": 0.2601, + "step": 7231 + }, + { + "epoch": 0.57294513765102, + "grad_norm": 1.7537931243838112, + "learning_rate": 8.133950825754511e-06, + "loss": 0.2834, + "step": 7232 + }, + { + "epoch": 0.5730243612596554, + "grad_norm": 1.77155229334891, + "learning_rate": 8.13142992773075e-06, + "loss": 0.2091, + "step": 7233 + }, + { + "epoch": 0.5731035848682907, + "grad_norm": 1.6048050727374819, + "learning_rate": 8.128909152743334e-06, + "loss": 0.2545, + "step": 7234 + }, + { + "epoch": 0.5731828084769262, + "grad_norm": 1.5544839678344629, + "learning_rate": 8.12638850095825e-06, + "loss": 0.1799, + "step": 7235 + }, + { + "epoch": 0.5732620320855615, + "grad_norm": 1.8944010759220822, + "learning_rate": 8.123867972541466e-06, + "loss": 0.2255, + "step": 7236 + }, + { + "epoch": 0.5733412556941969, + "grad_norm": 2.0141425187757465, + "learning_rate": 8.12134756765895e-06, + "loss": 0.2448, + "step": 7237 + }, + { + "epoch": 0.5734204793028322, + "grad_norm": 1.553231612125875, + "learning_rate": 8.118827286476658e-06, + "loss": 0.2796, + "step": 7238 + }, + { + "epoch": 0.5734997029114676, + "grad_norm": 1.8101664552954098, + "learning_rate": 8.116307129160535e-06, + "loss": 0.2773, + "step": 7239 + }, + { + "epoch": 0.573578926520103, + "grad_norm": 1.5021324965085898, + "learning_rate": 8.113787095876525e-06, + "loss": 0.2124, + "step": 7240 + }, + { + "epoch": 0.5736581501287383, + "grad_norm": 1.4706068828036591, + "learning_rate": 8.11126718679056e-06, + "loss": 0.2022, + "step": 7241 + }, + { + "epoch": 0.5737373737373738, + "grad_norm": 1.4912690632029677, + "learning_rate": 8.10874740206856e-06, + "loss": 0.2498, + "step": 7242 + }, + { + "epoch": 0.5738165973460091, + "grad_norm": 1.675693886890341, + "learning_rate": 8.106227741876447e-06, + "loss": 0.3023, + "step": 7243 + }, + { + "epoch": 0.5738958209546445, + "grad_norm": 1.7662578223830618, + "learning_rate": 8.103708206380123e-06, + "loss": 0.2428, + "step": 7244 + }, + { + "epoch": 0.5739750445632799, + "grad_norm": 1.6891028802295858, + "learning_rate": 8.101188795745489e-06, + "loss": 0.1655, + "step": 7245 + }, + { + "epoch": 0.5740542681719152, + "grad_norm": 1.470241860262505, + "learning_rate": 8.098669510138438e-06, + "loss": 0.2048, + "step": 7246 + }, + { + "epoch": 0.5741334917805506, + "grad_norm": 1.3763211298664193, + "learning_rate": 8.09615034972485e-06, + "loss": 0.2379, + "step": 7247 + }, + { + "epoch": 0.574212715389186, + "grad_norm": 1.9837225704042876, + "learning_rate": 8.093631314670598e-06, + "loss": 0.2625, + "step": 7248 + }, + { + "epoch": 0.5742919389978214, + "grad_norm": 1.528437224140775, + "learning_rate": 8.091112405141555e-06, + "loss": 0.1805, + "step": 7249 + }, + { + "epoch": 0.5743711626064567, + "grad_norm": 1.7958254538166991, + "learning_rate": 8.088593621303573e-06, + "loss": 0.2255, + "step": 7250 + }, + { + "epoch": 0.5744503862150921, + "grad_norm": 1.3239041738970985, + "learning_rate": 8.086074963322505e-06, + "loss": 0.1891, + "step": 7251 + }, + { + "epoch": 0.5745296098237275, + "grad_norm": 1.6670753342873152, + "learning_rate": 8.083556431364191e-06, + "loss": 0.2595, + "step": 7252 + }, + { + "epoch": 0.5746088334323628, + "grad_norm": 1.4312438991694492, + "learning_rate": 8.081038025594464e-06, + "loss": 0.2067, + "step": 7253 + }, + { + "epoch": 0.5746880570409982, + "grad_norm": 1.5986057118444013, + "learning_rate": 8.078519746179153e-06, + "loss": 0.2013, + "step": 7254 + }, + { + "epoch": 0.5747672806496336, + "grad_norm": 1.3820754267029534, + "learning_rate": 8.076001593284066e-06, + "loss": 0.1804, + "step": 7255 + }, + { + "epoch": 0.574846504258269, + "grad_norm": 1.6083103562027041, + "learning_rate": 8.073483567075018e-06, + "loss": 0.226, + "step": 7256 + }, + { + "epoch": 0.5749257278669043, + "grad_norm": 1.3260904503125175, + "learning_rate": 8.070965667717809e-06, + "loss": 0.1593, + "step": 7257 + }, + { + "epoch": 0.5750049514755398, + "grad_norm": 1.6781814899787393, + "learning_rate": 8.06844789537823e-06, + "loss": 0.255, + "step": 7258 + }, + { + "epoch": 0.5750841750841751, + "grad_norm": 1.7037600283909466, + "learning_rate": 8.065930250222061e-06, + "loss": 0.1972, + "step": 7259 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 1.2477962924815693, + "learning_rate": 8.063412732415077e-06, + "loss": 0.1675, + "step": 7260 + }, + { + "epoch": 0.5752426223014458, + "grad_norm": 1.644909653105973, + "learning_rate": 8.060895342123049e-06, + "loss": 0.2057, + "step": 7261 + }, + { + "epoch": 0.5753218459100812, + "grad_norm": 1.7224419201316707, + "learning_rate": 8.058378079511732e-06, + "loss": 0.2513, + "step": 7262 + }, + { + "epoch": 0.5754010695187166, + "grad_norm": 1.3664927723470475, + "learning_rate": 8.055860944746876e-06, + "loss": 0.2052, + "step": 7263 + }, + { + "epoch": 0.5754802931273519, + "grad_norm": 1.670790160406618, + "learning_rate": 8.05334393799422e-06, + "loss": 0.24, + "step": 7264 + }, + { + "epoch": 0.5755595167359874, + "grad_norm": 1.2908889373837937, + "learning_rate": 8.050827059419502e-06, + "loss": 0.1549, + "step": 7265 + }, + { + "epoch": 0.5756387403446227, + "grad_norm": 2.09136560361488, + "learning_rate": 8.04831030918844e-06, + "loss": 0.3494, + "step": 7266 + }, + { + "epoch": 0.575717963953258, + "grad_norm": 1.5644870970355587, + "learning_rate": 8.045793687466757e-06, + "loss": 0.198, + "step": 7267 + }, + { + "epoch": 0.5757971875618935, + "grad_norm": 1.4583632507235909, + "learning_rate": 8.043277194420155e-06, + "loss": 0.1843, + "step": 7268 + }, + { + "epoch": 0.5758764111705288, + "grad_norm": 1.8484708684703344, + "learning_rate": 8.040760830214334e-06, + "loss": 0.2101, + "step": 7269 + }, + { + "epoch": 0.5759556347791642, + "grad_norm": 1.2917664096857764, + "learning_rate": 8.038244595014986e-06, + "loss": 0.1735, + "step": 7270 + }, + { + "epoch": 0.5760348583877996, + "grad_norm": 1.5158154177678604, + "learning_rate": 8.03572848898779e-06, + "loss": 0.1895, + "step": 7271 + }, + { + "epoch": 0.576114081996435, + "grad_norm": 1.4357997985443058, + "learning_rate": 8.033212512298422e-06, + "loss": 0.2056, + "step": 7272 + }, + { + "epoch": 0.5761933056050703, + "grad_norm": 1.9207206252555802, + "learning_rate": 8.03069666511255e-06, + "loss": 0.2706, + "step": 7273 + }, + { + "epoch": 0.5762725292137056, + "grad_norm": 1.7526368059601876, + "learning_rate": 8.028180947595823e-06, + "loss": 0.1974, + "step": 7274 + }, + { + "epoch": 0.5763517528223411, + "grad_norm": 1.6521947712187373, + "learning_rate": 8.025665359913897e-06, + "loss": 0.179, + "step": 7275 + }, + { + "epoch": 0.5764309764309764, + "grad_norm": 1.9111428246261795, + "learning_rate": 8.023149902232404e-06, + "loss": 0.2338, + "step": 7276 + }, + { + "epoch": 0.5765102000396118, + "grad_norm": 1.3963015069276818, + "learning_rate": 8.020634574716976e-06, + "loss": 0.1491, + "step": 7277 + }, + { + "epoch": 0.5765894236482472, + "grad_norm": 1.4852121021266091, + "learning_rate": 8.018119377533243e-06, + "loss": 0.1889, + "step": 7278 + }, + { + "epoch": 0.5766686472568826, + "grad_norm": 1.6877131783706179, + "learning_rate": 8.015604310846807e-06, + "loss": 0.2071, + "step": 7279 + }, + { + "epoch": 0.5767478708655179, + "grad_norm": 1.6177034124873833, + "learning_rate": 8.013089374823281e-06, + "loss": 0.1991, + "step": 7280 + }, + { + "epoch": 0.5768270944741533, + "grad_norm": 2.064247297204755, + "learning_rate": 8.010574569628263e-06, + "loss": 0.2441, + "step": 7281 + }, + { + "epoch": 0.5769063180827887, + "grad_norm": 1.3653454892402546, + "learning_rate": 8.008059895427334e-06, + "loss": 0.1704, + "step": 7282 + }, + { + "epoch": 0.576985541691424, + "grad_norm": 1.3672169177523648, + "learning_rate": 8.005545352386077e-06, + "loss": 0.1614, + "step": 7283 + }, + { + "epoch": 0.5770647653000595, + "grad_norm": 1.4658636744979527, + "learning_rate": 8.003030940670061e-06, + "loss": 0.1417, + "step": 7284 + }, + { + "epoch": 0.5771439889086948, + "grad_norm": 1.5060100916104144, + "learning_rate": 8.000516660444848e-06, + "loss": 0.243, + "step": 7285 + }, + { + "epoch": 0.5772232125173302, + "grad_norm": 1.6372504794503895, + "learning_rate": 7.99800251187599e-06, + "loss": 0.1427, + "step": 7286 + }, + { + "epoch": 0.5773024361259655, + "grad_norm": 2.0943432662912187, + "learning_rate": 7.995488495129039e-06, + "loss": 0.2679, + "step": 7287 + }, + { + "epoch": 0.5773816597346009, + "grad_norm": 1.9847692729449438, + "learning_rate": 7.992974610369521e-06, + "loss": 0.2482, + "step": 7288 + }, + { + "epoch": 0.5774608833432363, + "grad_norm": 1.5031415487500104, + "learning_rate": 7.990460857762969e-06, + "loss": 0.1762, + "step": 7289 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 1.1995105282650207, + "learning_rate": 7.987947237474903e-06, + "loss": 0.2128, + "step": 7290 + }, + { + "epoch": 0.5776193305605071, + "grad_norm": 1.679879937596472, + "learning_rate": 7.985433749670825e-06, + "loss": 0.2252, + "step": 7291 + }, + { + "epoch": 0.5776985541691424, + "grad_norm": 1.6089326408370244, + "learning_rate": 7.982920394516247e-06, + "loss": 0.2108, + "step": 7292 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 1.8065095948417769, + "learning_rate": 7.98040717217665e-06, + "loss": 0.2501, + "step": 7293 + }, + { + "epoch": 0.5778570013864132, + "grad_norm": 1.5732673240721722, + "learning_rate": 7.977894082817524e-06, + "loss": 0.2463, + "step": 7294 + }, + { + "epoch": 0.5779362249950485, + "grad_norm": 1.506557784267785, + "learning_rate": 7.975381126604346e-06, + "loss": 0.1568, + "step": 7295 + }, + { + "epoch": 0.5780154486036839, + "grad_norm": 1.4889185041643265, + "learning_rate": 7.972868303702576e-06, + "loss": 0.1961, + "step": 7296 + }, + { + "epoch": 0.5780946722123192, + "grad_norm": 1.6347677387781698, + "learning_rate": 7.970355614277674e-06, + "loss": 0.2082, + "step": 7297 + }, + { + "epoch": 0.5781738958209547, + "grad_norm": 2.649268010522181, + "learning_rate": 7.967843058495092e-06, + "loss": 0.2781, + "step": 7298 + }, + { + "epoch": 0.57825311942959, + "grad_norm": 1.5538571208514451, + "learning_rate": 7.965330636520262e-06, + "loss": 0.2095, + "step": 7299 + }, + { + "epoch": 0.5783323430382253, + "grad_norm": 1.4633856760820738, + "learning_rate": 7.962818348518623e-06, + "loss": 0.257, + "step": 7300 + }, + { + "epoch": 0.5784115666468608, + "grad_norm": 1.43828529775794, + "learning_rate": 7.960306194655593e-06, + "loss": 0.1819, + "step": 7301 + }, + { + "epoch": 0.5784907902554961, + "grad_norm": 1.6101635965560042, + "learning_rate": 7.957794175096585e-06, + "loss": 0.2171, + "step": 7302 + }, + { + "epoch": 0.5785700138641315, + "grad_norm": 2.0250001290962496, + "learning_rate": 7.955282290007006e-06, + "loss": 0.2732, + "step": 7303 + }, + { + "epoch": 0.5786492374727669, + "grad_norm": 1.4420376384620313, + "learning_rate": 7.952770539552246e-06, + "loss": 0.1931, + "step": 7304 + }, + { + "epoch": 0.5787284610814023, + "grad_norm": 1.3618466857212324, + "learning_rate": 7.950258923897695e-06, + "loss": 0.1772, + "step": 7305 + }, + { + "epoch": 0.5788076846900376, + "grad_norm": 1.551858486834539, + "learning_rate": 7.947747443208735e-06, + "loss": 0.219, + "step": 7306 + }, + { + "epoch": 0.578886908298673, + "grad_norm": 1.5917687245250431, + "learning_rate": 7.945236097650729e-06, + "loss": 0.1581, + "step": 7307 + }, + { + "epoch": 0.5789661319073084, + "grad_norm": 1.5082789832828323, + "learning_rate": 7.942724887389041e-06, + "loss": 0.1771, + "step": 7308 + }, + { + "epoch": 0.5790453555159437, + "grad_norm": 1.677984045977772, + "learning_rate": 7.940213812589018e-06, + "loss": 0.2054, + "step": 7309 + }, + { + "epoch": 0.5791245791245792, + "grad_norm": 1.5881251260978109, + "learning_rate": 7.937702873416005e-06, + "loss": 0.2177, + "step": 7310 + }, + { + "epoch": 0.5792038027332145, + "grad_norm": 1.6570658660141848, + "learning_rate": 7.935192070035335e-06, + "loss": 0.1636, + "step": 7311 + }, + { + "epoch": 0.5792830263418499, + "grad_norm": 2.450106985192867, + "learning_rate": 7.932681402612332e-06, + "loss": 0.4511, + "step": 7312 + }, + { + "epoch": 0.5793622499504852, + "grad_norm": 1.5040911320701662, + "learning_rate": 7.93017087131231e-06, + "loss": 0.1476, + "step": 7313 + }, + { + "epoch": 0.5794414735591206, + "grad_norm": 1.6720726936121675, + "learning_rate": 7.927660476300578e-06, + "loss": 0.1799, + "step": 7314 + }, + { + "epoch": 0.579520697167756, + "grad_norm": 1.4983948771718287, + "learning_rate": 7.925150217742431e-06, + "loss": 0.2251, + "step": 7315 + }, + { + "epoch": 0.5795999207763913, + "grad_norm": 1.8777163764540121, + "learning_rate": 7.92264009580316e-06, + "loss": 0.2383, + "step": 7316 + }, + { + "epoch": 0.5796791443850268, + "grad_norm": 1.9503488963376103, + "learning_rate": 7.920130110648044e-06, + "loss": 0.2333, + "step": 7317 + }, + { + "epoch": 0.5797583679936621, + "grad_norm": 1.396599470150119, + "learning_rate": 7.917620262442349e-06, + "loss": 0.16, + "step": 7318 + }, + { + "epoch": 0.5798375916022975, + "grad_norm": 1.3316024969176476, + "learning_rate": 7.915110551351344e-06, + "loss": 0.1445, + "step": 7319 + }, + { + "epoch": 0.5799168152109329, + "grad_norm": 1.8475021004353738, + "learning_rate": 7.912600977540275e-06, + "loss": 0.2572, + "step": 7320 + }, + { + "epoch": 0.5799960388195682, + "grad_norm": 1.8854731577534638, + "learning_rate": 7.910091541174388e-06, + "loss": 0.3087, + "step": 7321 + }, + { + "epoch": 0.5800752624282036, + "grad_norm": 2.035559672891875, + "learning_rate": 7.907582242418916e-06, + "loss": 0.2637, + "step": 7322 + }, + { + "epoch": 0.580154486036839, + "grad_norm": 1.7967319312949754, + "learning_rate": 7.905073081439087e-06, + "loss": 0.2327, + "step": 7323 + }, + { + "epoch": 0.5802337096454744, + "grad_norm": 1.3853013185902623, + "learning_rate": 7.902564058400116e-06, + "loss": 0.1705, + "step": 7324 + }, + { + "epoch": 0.5803129332541097, + "grad_norm": 1.6040940919908941, + "learning_rate": 7.900055173467207e-06, + "loss": 0.2192, + "step": 7325 + }, + { + "epoch": 0.5803921568627451, + "grad_norm": 1.6915223518830884, + "learning_rate": 7.897546426805561e-06, + "loss": 0.2627, + "step": 7326 + }, + { + "epoch": 0.5804713804713805, + "grad_norm": 1.4365202102715322, + "learning_rate": 7.89503781858037e-06, + "loss": 0.189, + "step": 7327 + }, + { + "epoch": 0.5805506040800158, + "grad_norm": 1.656623708831888, + "learning_rate": 7.892529348956805e-06, + "loss": 0.2493, + "step": 7328 + }, + { + "epoch": 0.5806298276886512, + "grad_norm": 0.9988737429943114, + "learning_rate": 7.890021018100045e-06, + "loss": 0.1021, + "step": 7329 + }, + { + "epoch": 0.5807090512972866, + "grad_norm": 1.3282268789970066, + "learning_rate": 7.887512826175247e-06, + "loss": 0.2134, + "step": 7330 + }, + { + "epoch": 0.580788274905922, + "grad_norm": 1.4715313871158777, + "learning_rate": 7.885004773347565e-06, + "loss": 0.1625, + "step": 7331 + }, + { + "epoch": 0.5808674985145573, + "grad_norm": 2.3555701996219423, + "learning_rate": 7.882496859782145e-06, + "loss": 0.2184, + "step": 7332 + }, + { + "epoch": 0.5809467221231928, + "grad_norm": 1.2667015407256914, + "learning_rate": 7.879989085644114e-06, + "loss": 0.1296, + "step": 7333 + }, + { + "epoch": 0.5810259457318281, + "grad_norm": 1.7741974740350812, + "learning_rate": 7.877481451098602e-06, + "loss": 0.2255, + "step": 7334 + }, + { + "epoch": 0.5811051693404634, + "grad_norm": 1.3740913128849466, + "learning_rate": 7.874973956310726e-06, + "loss": 0.1703, + "step": 7335 + }, + { + "epoch": 0.5811843929490988, + "grad_norm": 1.9566208543741046, + "learning_rate": 7.872466601445587e-06, + "loss": 0.3091, + "step": 7336 + }, + { + "epoch": 0.5812636165577342, + "grad_norm": 1.3321241556772043, + "learning_rate": 7.869959386668286e-06, + "loss": 0.1637, + "step": 7337 + }, + { + "epoch": 0.5813428401663696, + "grad_norm": 1.6632698510891268, + "learning_rate": 7.86745231214391e-06, + "loss": 0.176, + "step": 7338 + }, + { + "epoch": 0.5814220637750049, + "grad_norm": 1.6438140441934932, + "learning_rate": 7.864945378037538e-06, + "loss": 0.2706, + "step": 7339 + }, + { + "epoch": 0.5815012873836404, + "grad_norm": 1.5787218788023105, + "learning_rate": 7.862438584514242e-06, + "loss": 0.1937, + "step": 7340 + }, + { + "epoch": 0.5815805109922757, + "grad_norm": 1.3080732064679614, + "learning_rate": 7.859931931739077e-06, + "loss": 0.2035, + "step": 7341 + }, + { + "epoch": 0.581659734600911, + "grad_norm": 1.486490302479561, + "learning_rate": 7.857425419877097e-06, + "loss": 0.1695, + "step": 7342 + }, + { + "epoch": 0.5817389582095465, + "grad_norm": 1.4044690756786709, + "learning_rate": 7.854919049093345e-06, + "loss": 0.1458, + "step": 7343 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 1.45693372678055, + "learning_rate": 7.852412819552853e-06, + "loss": 0.1476, + "step": 7344 + }, + { + "epoch": 0.5818974054268172, + "grad_norm": 1.4199161809622265, + "learning_rate": 7.849906731420642e-06, + "loss": 0.1673, + "step": 7345 + }, + { + "epoch": 0.5819766290354526, + "grad_norm": 1.0830407164113578, + "learning_rate": 7.847400784861727e-06, + "loss": 0.102, + "step": 7346 + }, + { + "epoch": 0.582055852644088, + "grad_norm": 1.2206867799938337, + "learning_rate": 7.844894980041112e-06, + "loss": 0.1164, + "step": 7347 + }, + { + "epoch": 0.5821350762527233, + "grad_norm": 1.347524174502622, + "learning_rate": 7.842389317123795e-06, + "loss": 0.1571, + "step": 7348 + }, + { + "epoch": 0.5822142998613586, + "grad_norm": 1.4373286737459365, + "learning_rate": 7.839883796274758e-06, + "loss": 0.1447, + "step": 7349 + }, + { + "epoch": 0.5822935234699941, + "grad_norm": 1.5678046119059164, + "learning_rate": 7.83737841765898e-06, + "loss": 0.2645, + "step": 7350 + }, + { + "epoch": 0.5823727470786294, + "grad_norm": 1.8735561538804082, + "learning_rate": 7.834873181441426e-06, + "loss": 0.2379, + "step": 7351 + }, + { + "epoch": 0.5824519706872648, + "grad_norm": 1.4910033748164775, + "learning_rate": 7.832368087787056e-06, + "loss": 0.1843, + "step": 7352 + }, + { + "epoch": 0.5825311942959002, + "grad_norm": 1.4822589257996637, + "learning_rate": 7.82986313686082e-06, + "loss": 0.1491, + "step": 7353 + }, + { + "epoch": 0.5826104179045356, + "grad_norm": 1.4546380221478576, + "learning_rate": 7.82735832882765e-06, + "loss": 0.1845, + "step": 7354 + }, + { + "epoch": 0.5826896415131709, + "grad_norm": 2.228102250388726, + "learning_rate": 7.824853663852482e-06, + "loss": 0.2356, + "step": 7355 + }, + { + "epoch": 0.5827688651218063, + "grad_norm": 1.74278369204629, + "learning_rate": 7.822349142100236e-06, + "loss": 0.1763, + "step": 7356 + }, + { + "epoch": 0.5828480887304417, + "grad_norm": 1.2361151491785414, + "learning_rate": 7.819844763735818e-06, + "loss": 0.1507, + "step": 7357 + }, + { + "epoch": 0.582927312339077, + "grad_norm": 1.6643136497569972, + "learning_rate": 7.817340528924132e-06, + "loss": 0.2407, + "step": 7358 + }, + { + "epoch": 0.5830065359477125, + "grad_norm": 1.21829976363329, + "learning_rate": 7.814836437830074e-06, + "loss": 0.1637, + "step": 7359 + }, + { + "epoch": 0.5830857595563478, + "grad_norm": 1.597439023932669, + "learning_rate": 7.812332490618521e-06, + "loss": 0.1991, + "step": 7360 + }, + { + "epoch": 0.5831649831649832, + "grad_norm": 1.4610659664827095, + "learning_rate": 7.809828687454343e-06, + "loss": 0.1949, + "step": 7361 + }, + { + "epoch": 0.5832442067736185, + "grad_norm": 1.2423700761020409, + "learning_rate": 7.807325028502412e-06, + "loss": 0.1424, + "step": 7362 + }, + { + "epoch": 0.5833234303822539, + "grad_norm": 1.808372202672393, + "learning_rate": 7.804821513927574e-06, + "loss": 0.2757, + "step": 7363 + }, + { + "epoch": 0.5834026539908893, + "grad_norm": 1.6933673668947338, + "learning_rate": 7.802318143894678e-06, + "loss": 0.208, + "step": 7364 + }, + { + "epoch": 0.5834818775995246, + "grad_norm": 1.2728386526262256, + "learning_rate": 7.799814918568559e-06, + "loss": 0.1439, + "step": 7365 + }, + { + "epoch": 0.5835611012081601, + "grad_norm": 1.4067736456268327, + "learning_rate": 7.797311838114038e-06, + "loss": 0.1763, + "step": 7366 + }, + { + "epoch": 0.5836403248167954, + "grad_norm": 1.4137022613901862, + "learning_rate": 7.794808902695935e-06, + "loss": 0.1583, + "step": 7367 + }, + { + "epoch": 0.5837195484254307, + "grad_norm": 1.5527576114466286, + "learning_rate": 7.792306112479055e-06, + "loss": 0.2426, + "step": 7368 + }, + { + "epoch": 0.5837987720340662, + "grad_norm": 1.4868732528214235, + "learning_rate": 7.789803467628196e-06, + "loss": 0.2171, + "step": 7369 + }, + { + "epoch": 0.5838779956427015, + "grad_norm": 1.2531848604775941, + "learning_rate": 7.787300968308144e-06, + "loss": 0.1809, + "step": 7370 + }, + { + "epoch": 0.5839572192513369, + "grad_norm": 1.6035137846042486, + "learning_rate": 7.784798614683675e-06, + "loss": 0.1858, + "step": 7371 + }, + { + "epoch": 0.5840364428599722, + "grad_norm": 1.5782136919648733, + "learning_rate": 7.782296406919557e-06, + "loss": 0.1892, + "step": 7372 + }, + { + "epoch": 0.5841156664686077, + "grad_norm": 1.2940119101465892, + "learning_rate": 7.779794345180552e-06, + "loss": 0.1582, + "step": 7373 + }, + { + "epoch": 0.584194890077243, + "grad_norm": 1.7977074156018713, + "learning_rate": 7.777292429631405e-06, + "loss": 0.1343, + "step": 7374 + }, + { + "epoch": 0.5842741136858783, + "grad_norm": 1.8275288543711155, + "learning_rate": 7.774790660436857e-06, + "loss": 0.3028, + "step": 7375 + }, + { + "epoch": 0.5843533372945138, + "grad_norm": 1.4315002334367006, + "learning_rate": 7.772289037761639e-06, + "loss": 0.2209, + "step": 7376 + }, + { + "epoch": 0.5844325609031491, + "grad_norm": 1.5079418253540777, + "learning_rate": 7.769787561770466e-06, + "loss": 0.1877, + "step": 7377 + }, + { + "epoch": 0.5845117845117845, + "grad_norm": 2.078985270683973, + "learning_rate": 7.767286232628054e-06, + "loss": 0.3406, + "step": 7378 + }, + { + "epoch": 0.5845910081204199, + "grad_norm": 1.7108565953951784, + "learning_rate": 7.764785050499098e-06, + "loss": 0.1984, + "step": 7379 + }, + { + "epoch": 0.5846702317290553, + "grad_norm": 1.4422617027044293, + "learning_rate": 7.76228401554829e-06, + "loss": 0.1533, + "step": 7380 + }, + { + "epoch": 0.5847494553376906, + "grad_norm": 1.521485936719373, + "learning_rate": 7.759783127940315e-06, + "loss": 0.1655, + "step": 7381 + }, + { + "epoch": 0.584828678946326, + "grad_norm": 1.444930411295526, + "learning_rate": 7.757282387839842e-06, + "loss": 0.2408, + "step": 7382 + }, + { + "epoch": 0.5849079025549614, + "grad_norm": 1.5006625285954482, + "learning_rate": 7.75478179541153e-06, + "loss": 0.2169, + "step": 7383 + }, + { + "epoch": 0.5849871261635967, + "grad_norm": 1.6704140909628131, + "learning_rate": 7.752281350820037e-06, + "loss": 0.2663, + "step": 7384 + }, + { + "epoch": 0.5850663497722322, + "grad_norm": 1.8925580776486521, + "learning_rate": 7.749781054229998e-06, + "loss": 0.2998, + "step": 7385 + }, + { + "epoch": 0.5851455733808675, + "grad_norm": 1.6760707482436805, + "learning_rate": 7.747280905806051e-06, + "loss": 0.2296, + "step": 7386 + }, + { + "epoch": 0.5852247969895029, + "grad_norm": 1.575618260689309, + "learning_rate": 7.744780905712818e-06, + "loss": 0.1792, + "step": 7387 + }, + { + "epoch": 0.5853040205981382, + "grad_norm": 1.4684360621138008, + "learning_rate": 7.742281054114909e-06, + "loss": 0.218, + "step": 7388 + }, + { + "epoch": 0.5853832442067736, + "grad_norm": 1.813178613724032, + "learning_rate": 7.73978135117693e-06, + "loss": 0.2412, + "step": 7389 + }, + { + "epoch": 0.585462467815409, + "grad_norm": 1.7942912971991012, + "learning_rate": 7.737281797063473e-06, + "loss": 0.1901, + "step": 7390 + }, + { + "epoch": 0.5855416914240443, + "grad_norm": 1.5534491332276268, + "learning_rate": 7.734782391939123e-06, + "loss": 0.1962, + "step": 7391 + }, + { + "epoch": 0.5856209150326798, + "grad_norm": 1.2370068229019497, + "learning_rate": 7.732283135968452e-06, + "loss": 0.1165, + "step": 7392 + }, + { + "epoch": 0.5857001386413151, + "grad_norm": 1.38219487024793, + "learning_rate": 7.729784029316025e-06, + "loss": 0.1801, + "step": 7393 + }, + { + "epoch": 0.5857793622499505, + "grad_norm": 1.6395266423136008, + "learning_rate": 7.7272850721464e-06, + "loss": 0.2612, + "step": 7394 + }, + { + "epoch": 0.5858585858585859, + "grad_norm": 2.569825377769722, + "learning_rate": 7.724786264624112e-06, + "loss": 0.3263, + "step": 7395 + }, + { + "epoch": 0.5859378094672212, + "grad_norm": 1.5540901626600006, + "learning_rate": 7.722287606913703e-06, + "loss": 0.2244, + "step": 7396 + }, + { + "epoch": 0.5860170330758566, + "grad_norm": 1.2766898903574357, + "learning_rate": 7.719789099179696e-06, + "loss": 0.1477, + "step": 7397 + }, + { + "epoch": 0.586096256684492, + "grad_norm": 1.7034877399301822, + "learning_rate": 7.717290741586602e-06, + "loss": 0.2134, + "step": 7398 + }, + { + "epoch": 0.5861754802931274, + "grad_norm": 1.5668080866324068, + "learning_rate": 7.714792534298934e-06, + "loss": 0.2433, + "step": 7399 + }, + { + "epoch": 0.5862547039017627, + "grad_norm": 2.1071109200674845, + "learning_rate": 7.712294477481177e-06, + "loss": 0.3131, + "step": 7400 + }, + { + "epoch": 0.5863339275103981, + "grad_norm": 1.6374636452367801, + "learning_rate": 7.709796571297823e-06, + "loss": 0.1872, + "step": 7401 + }, + { + "epoch": 0.5864131511190335, + "grad_norm": 1.752874226437224, + "learning_rate": 7.707298815913346e-06, + "loss": 0.2336, + "step": 7402 + }, + { + "epoch": 0.5864923747276688, + "grad_norm": 1.5787076973281444, + "learning_rate": 7.70480121149221e-06, + "loss": 0.2516, + "step": 7403 + }, + { + "epoch": 0.5865715983363042, + "grad_norm": 1.2778506845834356, + "learning_rate": 7.702303758198868e-06, + "loss": 0.1574, + "step": 7404 + }, + { + "epoch": 0.5866508219449396, + "grad_norm": 1.263129459817484, + "learning_rate": 7.699806456197771e-06, + "loss": 0.1393, + "step": 7405 + }, + { + "epoch": 0.586730045553575, + "grad_norm": 2.0536170410407393, + "learning_rate": 7.697309305653348e-06, + "loss": 0.2962, + "step": 7406 + }, + { + "epoch": 0.5868092691622103, + "grad_norm": 1.6066563887601932, + "learning_rate": 7.694812306730031e-06, + "loss": 0.2089, + "step": 7407 + }, + { + "epoch": 0.5868884927708458, + "grad_norm": 1.724127569304111, + "learning_rate": 7.69231545959223e-06, + "loss": 0.273, + "step": 7408 + }, + { + "epoch": 0.5869677163794811, + "grad_norm": 1.772746835436174, + "learning_rate": 7.689818764404351e-06, + "loss": 0.2648, + "step": 7409 + }, + { + "epoch": 0.5870469399881164, + "grad_norm": 1.400817299921641, + "learning_rate": 7.687322221330794e-06, + "loss": 0.1265, + "step": 7410 + }, + { + "epoch": 0.5871261635967518, + "grad_norm": 1.7113843309178138, + "learning_rate": 7.684825830535935e-06, + "loss": 0.1714, + "step": 7411 + }, + { + "epoch": 0.5872053872053872, + "grad_norm": 1.7658430467096589, + "learning_rate": 7.682329592184158e-06, + "loss": 0.205, + "step": 7412 + }, + { + "epoch": 0.5872846108140226, + "grad_norm": 1.61182488260892, + "learning_rate": 7.679833506439826e-06, + "loss": 0.2136, + "step": 7413 + }, + { + "epoch": 0.5873638344226579, + "grad_norm": 1.92473727839742, + "learning_rate": 7.677337573467294e-06, + "loss": 0.2135, + "step": 7414 + }, + { + "epoch": 0.5874430580312934, + "grad_norm": 1.927164374383882, + "learning_rate": 7.674841793430907e-06, + "loss": 0.279, + "step": 7415 + }, + { + "epoch": 0.5875222816399287, + "grad_norm": 2.1716251837227083, + "learning_rate": 7.672346166494999e-06, + "loss": 0.307, + "step": 7416 + }, + { + "epoch": 0.587601505248564, + "grad_norm": 1.709594680815539, + "learning_rate": 7.669850692823895e-06, + "loss": 0.2193, + "step": 7417 + }, + { + "epoch": 0.5876807288571995, + "grad_norm": 1.8776004149572332, + "learning_rate": 7.667355372581913e-06, + "loss": 0.2316, + "step": 7418 + }, + { + "epoch": 0.5877599524658348, + "grad_norm": 2.0678547868731956, + "learning_rate": 7.664860205933356e-06, + "loss": 0.241, + "step": 7419 + }, + { + "epoch": 0.5878391760744702, + "grad_norm": 1.8401528725309737, + "learning_rate": 7.662365193042516e-06, + "loss": 0.1509, + "step": 7420 + }, + { + "epoch": 0.5879183996831056, + "grad_norm": 1.5633666386116454, + "learning_rate": 7.659870334073683e-06, + "loss": 0.1895, + "step": 7421 + }, + { + "epoch": 0.587997623291741, + "grad_norm": 1.3746797454269115, + "learning_rate": 7.657375629191126e-06, + "loss": 0.1762, + "step": 7422 + }, + { + "epoch": 0.5880768469003763, + "grad_norm": 1.771937331025057, + "learning_rate": 7.654881078559112e-06, + "loss": 0.2074, + "step": 7423 + }, + { + "epoch": 0.5881560705090116, + "grad_norm": 1.5697221652671167, + "learning_rate": 7.652386682341895e-06, + "loss": 0.1974, + "step": 7424 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.6771166483152264, + "learning_rate": 7.64989244070372e-06, + "loss": 0.2224, + "step": 7425 + }, + { + "epoch": 0.5883145177262824, + "grad_norm": 1.5833939615961354, + "learning_rate": 7.647398353808822e-06, + "loss": 0.165, + "step": 7426 + }, + { + "epoch": 0.5883937413349178, + "grad_norm": 1.3801981553068066, + "learning_rate": 7.644904421821418e-06, + "loss": 0.1534, + "step": 7427 + }, + { + "epoch": 0.5884729649435532, + "grad_norm": 1.4833152072980584, + "learning_rate": 7.642410644905726e-06, + "loss": 0.1964, + "step": 7428 + }, + { + "epoch": 0.5885521885521886, + "grad_norm": 1.1463566073384621, + "learning_rate": 7.639917023225953e-06, + "loss": 0.1464, + "step": 7429 + }, + { + "epoch": 0.5886314121608239, + "grad_norm": 1.8298440492273966, + "learning_rate": 7.637423556946284e-06, + "loss": 0.1884, + "step": 7430 + }, + { + "epoch": 0.5887106357694593, + "grad_norm": 1.482646746607733, + "learning_rate": 7.63493024623091e-06, + "loss": 0.1772, + "step": 7431 + }, + { + "epoch": 0.5887898593780947, + "grad_norm": 1.5964567960701876, + "learning_rate": 7.632437091243996e-06, + "loss": 0.2114, + "step": 7432 + }, + { + "epoch": 0.58886908298673, + "grad_norm": 1.4102131564329259, + "learning_rate": 7.629944092149707e-06, + "loss": 0.2064, + "step": 7433 + }, + { + "epoch": 0.5889483065953655, + "grad_norm": 1.3930073216985237, + "learning_rate": 7.627451249112199e-06, + "loss": 0.1364, + "step": 7434 + }, + { + "epoch": 0.5890275302040008, + "grad_norm": 1.395804725903595, + "learning_rate": 7.624958562295607e-06, + "loss": 0.1558, + "step": 7435 + }, + { + "epoch": 0.5891067538126362, + "grad_norm": 1.4641264576087305, + "learning_rate": 7.622466031864066e-06, + "loss": 0.1363, + "step": 7436 + }, + { + "epoch": 0.5891859774212715, + "grad_norm": 1.240265737552144, + "learning_rate": 7.6199736579817005e-06, + "loss": 0.1672, + "step": 7437 + }, + { + "epoch": 0.5892652010299069, + "grad_norm": 1.7226526469263839, + "learning_rate": 7.617481440812617e-06, + "loss": 0.2194, + "step": 7438 + }, + { + "epoch": 0.5893444246385423, + "grad_norm": 1.8455411006227738, + "learning_rate": 7.614989380520914e-06, + "loss": 0.2776, + "step": 7439 + }, + { + "epoch": 0.5894236482471776, + "grad_norm": 1.907638351279368, + "learning_rate": 7.612497477270686e-06, + "loss": 0.1984, + "step": 7440 + }, + { + "epoch": 0.5895028718558131, + "grad_norm": 1.6683352005687586, + "learning_rate": 7.610005731226009e-06, + "loss": 0.2036, + "step": 7441 + }, + { + "epoch": 0.5895820954644484, + "grad_norm": 1.7231457261407273, + "learning_rate": 7.607514142550955e-06, + "loss": 0.3299, + "step": 7442 + }, + { + "epoch": 0.5896613190730838, + "grad_norm": 1.6616231689487826, + "learning_rate": 7.605022711409585e-06, + "loss": 0.1898, + "step": 7443 + }, + { + "epoch": 0.5897405426817192, + "grad_norm": 1.5384747991129377, + "learning_rate": 7.602531437965943e-06, + "loss": 0.1878, + "step": 7444 + }, + { + "epoch": 0.5898197662903545, + "grad_norm": 1.7914997127146324, + "learning_rate": 7.6000403223840714e-06, + "loss": 0.2259, + "step": 7445 + }, + { + "epoch": 0.5898989898989899, + "grad_norm": 1.675488354038786, + "learning_rate": 7.597549364827997e-06, + "loss": 0.2158, + "step": 7446 + }, + { + "epoch": 0.5899782135076252, + "grad_norm": 1.7693188811033118, + "learning_rate": 7.595058565461736e-06, + "loss": 0.2245, + "step": 7447 + }, + { + "epoch": 0.5900574371162607, + "grad_norm": 1.8404426526266984, + "learning_rate": 7.5925679244492985e-06, + "loss": 0.2779, + "step": 7448 + }, + { + "epoch": 0.590136660724896, + "grad_norm": 1.6301558992273812, + "learning_rate": 7.5900774419546775e-06, + "loss": 0.247, + "step": 7449 + }, + { + "epoch": 0.5902158843335313, + "grad_norm": 1.491175797758845, + "learning_rate": 7.58758711814186e-06, + "loss": 0.1344, + "step": 7450 + }, + { + "epoch": 0.5902951079421668, + "grad_norm": 1.696416510881321, + "learning_rate": 7.585096953174827e-06, + "loss": 0.2807, + "step": 7451 + }, + { + "epoch": 0.5903743315508021, + "grad_norm": 1.6755142588541274, + "learning_rate": 7.582606947217537e-06, + "loss": 0.1924, + "step": 7452 + }, + { + "epoch": 0.5904535551594375, + "grad_norm": 1.7556816542604936, + "learning_rate": 7.580117100433947e-06, + "loss": 0.3028, + "step": 7453 + }, + { + "epoch": 0.5905327787680729, + "grad_norm": 1.4336986850350117, + "learning_rate": 7.577627412988005e-06, + "loss": 0.1626, + "step": 7454 + }, + { + "epoch": 0.5906120023767083, + "grad_norm": 1.4900141317463638, + "learning_rate": 7.57513788504364e-06, + "loss": 0.245, + "step": 7455 + }, + { + "epoch": 0.5906912259853436, + "grad_norm": 2.2121266232198917, + "learning_rate": 7.572648516764778e-06, + "loss": 0.223, + "step": 7456 + }, + { + "epoch": 0.590770449593979, + "grad_norm": 1.3869070717614518, + "learning_rate": 7.570159308315331e-06, + "loss": 0.1779, + "step": 7457 + }, + { + "epoch": 0.5908496732026144, + "grad_norm": 1.6095382062402306, + "learning_rate": 7.5676702598592025e-06, + "loss": 0.2495, + "step": 7458 + }, + { + "epoch": 0.5909288968112497, + "grad_norm": 1.6965519774818403, + "learning_rate": 7.5651813715602855e-06, + "loss": 0.2093, + "step": 7459 + }, + { + "epoch": 0.5910081204198852, + "grad_norm": 1.4744062962455062, + "learning_rate": 7.562692643582456e-06, + "loss": 0.1582, + "step": 7460 + }, + { + "epoch": 0.5910873440285205, + "grad_norm": 1.6782265245942305, + "learning_rate": 7.56020407608959e-06, + "loss": 0.245, + "step": 7461 + }, + { + "epoch": 0.5911665676371559, + "grad_norm": 2.030536778917779, + "learning_rate": 7.557715669245547e-06, + "loss": 0.2352, + "step": 7462 + }, + { + "epoch": 0.5912457912457912, + "grad_norm": 1.8017533290790013, + "learning_rate": 7.555227423214174e-06, + "loss": 0.1653, + "step": 7463 + }, + { + "epoch": 0.5913250148544266, + "grad_norm": 1.6817661670520199, + "learning_rate": 7.552739338159314e-06, + "loss": 0.1248, + "step": 7464 + }, + { + "epoch": 0.591404238463062, + "grad_norm": 1.410988868893279, + "learning_rate": 7.550251414244791e-06, + "loss": 0.1793, + "step": 7465 + }, + { + "epoch": 0.5914834620716973, + "grad_norm": 2.194095789629852, + "learning_rate": 7.5477636516344255e-06, + "loss": 0.2322, + "step": 7466 + }, + { + "epoch": 0.5915626856803328, + "grad_norm": 1.4728707036576527, + "learning_rate": 7.545276050492025e-06, + "loss": 0.2031, + "step": 7467 + }, + { + "epoch": 0.5916419092889681, + "grad_norm": 1.814372102446906, + "learning_rate": 7.542788610981384e-06, + "loss": 0.2969, + "step": 7468 + }, + { + "epoch": 0.5917211328976035, + "grad_norm": 1.8870305552345494, + "learning_rate": 7.540301333266289e-06, + "loss": 0.2752, + "step": 7469 + }, + { + "epoch": 0.5918003565062389, + "grad_norm": 1.2628947142486275, + "learning_rate": 7.537814217510518e-06, + "loss": 0.1571, + "step": 7470 + }, + { + "epoch": 0.5918795801148742, + "grad_norm": 1.4571467798044746, + "learning_rate": 7.535327263877832e-06, + "loss": 0.214, + "step": 7471 + }, + { + "epoch": 0.5919588037235096, + "grad_norm": 1.3676288149330387, + "learning_rate": 7.532840472531988e-06, + "loss": 0.1923, + "step": 7472 + }, + { + "epoch": 0.592038027332145, + "grad_norm": 1.8942295958228004, + "learning_rate": 7.530353843636726e-06, + "loss": 0.2364, + "step": 7473 + }, + { + "epoch": 0.5921172509407804, + "grad_norm": 1.47835187108296, + "learning_rate": 7.52786737735578e-06, + "loss": 0.2148, + "step": 7474 + }, + { + "epoch": 0.5921964745494157, + "grad_norm": 1.4904041274472746, + "learning_rate": 7.525381073852874e-06, + "loss": 0.1552, + "step": 7475 + }, + { + "epoch": 0.5922756981580511, + "grad_norm": 1.6395740358422257, + "learning_rate": 7.522894933291715e-06, + "loss": 0.2664, + "step": 7476 + }, + { + "epoch": 0.5923549217666865, + "grad_norm": 1.4791070926762029, + "learning_rate": 7.5204089558360076e-06, + "loss": 0.2207, + "step": 7477 + }, + { + "epoch": 0.5924341453753218, + "grad_norm": 1.4312737917623066, + "learning_rate": 7.517923141649439e-06, + "loss": 0.2194, + "step": 7478 + }, + { + "epoch": 0.5925133689839572, + "grad_norm": 1.6109664535306307, + "learning_rate": 7.515437490895688e-06, + "loss": 0.2778, + "step": 7479 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 1.3401836430719771, + "learning_rate": 7.5129520037384225e-06, + "loss": 0.2119, + "step": 7480 + }, + { + "epoch": 0.592671816201228, + "grad_norm": 1.7954808505865363, + "learning_rate": 7.5104666803413015e-06, + "loss": 0.2537, + "step": 7481 + }, + { + "epoch": 0.5927510398098633, + "grad_norm": 1.2378367435262168, + "learning_rate": 7.50798152086797e-06, + "loss": 0.1458, + "step": 7482 + }, + { + "epoch": 0.5928302634184988, + "grad_norm": 1.325387131806812, + "learning_rate": 7.505496525482066e-06, + "loss": 0.183, + "step": 7483 + }, + { + "epoch": 0.5929094870271341, + "grad_norm": 2.895468924061322, + "learning_rate": 7.503011694347212e-06, + "loss": 0.2108, + "step": 7484 + }, + { + "epoch": 0.5929887106357694, + "grad_norm": 1.9100198884730346, + "learning_rate": 7.500527027627025e-06, + "loss": 0.2398, + "step": 7485 + }, + { + "epoch": 0.5930679342444048, + "grad_norm": 1.6497273421773464, + "learning_rate": 7.4980425254851034e-06, + "loss": 0.2276, + "step": 7486 + }, + { + "epoch": 0.5931471578530402, + "grad_norm": 1.573713696957266, + "learning_rate": 7.495558188085044e-06, + "loss": 0.1782, + "step": 7487 + }, + { + "epoch": 0.5932263814616756, + "grad_norm": 1.5540595415324925, + "learning_rate": 7.493074015590429e-06, + "loss": 0.2056, + "step": 7488 + }, + { + "epoch": 0.5933056050703109, + "grad_norm": 1.3437221880306618, + "learning_rate": 7.490590008164824e-06, + "loss": 0.1472, + "step": 7489 + }, + { + "epoch": 0.5933848286789464, + "grad_norm": 1.437666800499826, + "learning_rate": 7.488106165971795e-06, + "loss": 0.2073, + "step": 7490 + }, + { + "epoch": 0.5934640522875817, + "grad_norm": 1.902829932381785, + "learning_rate": 7.485622489174888e-06, + "loss": 0.1442, + "step": 7491 + }, + { + "epoch": 0.593543275896217, + "grad_norm": 1.4263930247751762, + "learning_rate": 7.483138977937643e-06, + "loss": 0.1706, + "step": 7492 + }, + { + "epoch": 0.5936224995048525, + "grad_norm": 1.2750902489373617, + "learning_rate": 7.480655632423586e-06, + "loss": 0.1172, + "step": 7493 + }, + { + "epoch": 0.5937017231134878, + "grad_norm": 1.9653500329408096, + "learning_rate": 7.478172452796231e-06, + "loss": 0.2398, + "step": 7494 + }, + { + "epoch": 0.5937809467221232, + "grad_norm": 1.3090970040179197, + "learning_rate": 7.475689439219085e-06, + "loss": 0.1638, + "step": 7495 + }, + { + "epoch": 0.5938601703307586, + "grad_norm": 1.4515458669484513, + "learning_rate": 7.473206591855646e-06, + "loss": 0.1983, + "step": 7496 + }, + { + "epoch": 0.593939393939394, + "grad_norm": 2.4360764396838754, + "learning_rate": 7.470723910869393e-06, + "loss": 0.2866, + "step": 7497 + }, + { + "epoch": 0.5940186175480293, + "grad_norm": 1.5758426551525762, + "learning_rate": 7.468241396423801e-06, + "loss": 0.1942, + "step": 7498 + }, + { + "epoch": 0.5940978411566646, + "grad_norm": 1.284295747047346, + "learning_rate": 7.465759048682333e-06, + "loss": 0.1275, + "step": 7499 + }, + { + "epoch": 0.5941770647653001, + "grad_norm": 1.6761975640343274, + "learning_rate": 7.463276867808435e-06, + "loss": 0.1733, + "step": 7500 + }, + { + "epoch": 0.5942562883739354, + "grad_norm": 1.574147240134103, + "learning_rate": 7.46079485396555e-06, + "loss": 0.2497, + "step": 7501 + }, + { + "epoch": 0.5943355119825708, + "grad_norm": 1.64665031059084, + "learning_rate": 7.458313007317106e-06, + "loss": 0.2369, + "step": 7502 + }, + { + "epoch": 0.5944147355912062, + "grad_norm": 1.4885554041713425, + "learning_rate": 7.45583132802652e-06, + "loss": 0.2112, + "step": 7503 + }, + { + "epoch": 0.5944939591998416, + "grad_norm": 1.56036903462225, + "learning_rate": 7.4533498162572004e-06, + "loss": 0.2529, + "step": 7504 + }, + { + "epoch": 0.5945731828084769, + "grad_norm": 1.4564983783149468, + "learning_rate": 7.450868472172541e-06, + "loss": 0.211, + "step": 7505 + }, + { + "epoch": 0.5946524064171123, + "grad_norm": 1.8085709021335465, + "learning_rate": 7.448387295935926e-06, + "loss": 0.2086, + "step": 7506 + }, + { + "epoch": 0.5947316300257477, + "grad_norm": 1.3844658607981795, + "learning_rate": 7.445906287710733e-06, + "loss": 0.2122, + "step": 7507 + }, + { + "epoch": 0.594810853634383, + "grad_norm": 1.466532617133725, + "learning_rate": 7.443425447660319e-06, + "loss": 0.1985, + "step": 7508 + }, + { + "epoch": 0.5948900772430185, + "grad_norm": 1.591733919342183, + "learning_rate": 7.4409447759480404e-06, + "loss": 0.2274, + "step": 7509 + }, + { + "epoch": 0.5949693008516538, + "grad_norm": 1.390284436839561, + "learning_rate": 7.438464272737232e-06, + "loss": 0.1894, + "step": 7510 + }, + { + "epoch": 0.5950485244602892, + "grad_norm": 1.3942454172751464, + "learning_rate": 7.435983938191227e-06, + "loss": 0.1829, + "step": 7511 + }, + { + "epoch": 0.5951277480689245, + "grad_norm": 1.716729276264812, + "learning_rate": 7.433503772473343e-06, + "loss": 0.2497, + "step": 7512 + }, + { + "epoch": 0.5952069716775599, + "grad_norm": 1.8072010336025572, + "learning_rate": 7.431023775746886e-06, + "loss": 0.2936, + "step": 7513 + }, + { + "epoch": 0.5952861952861953, + "grad_norm": 1.4508899835233136, + "learning_rate": 7.428543948175151e-06, + "loss": 0.1835, + "step": 7514 + }, + { + "epoch": 0.5953654188948306, + "grad_norm": 1.7513932844304312, + "learning_rate": 7.426064289921429e-06, + "loss": 0.1893, + "step": 7515 + }, + { + "epoch": 0.5954446425034661, + "grad_norm": 1.8882346323032806, + "learning_rate": 7.423584801148985e-06, + "loss": 0.2767, + "step": 7516 + }, + { + "epoch": 0.5955238661121014, + "grad_norm": 1.1630629996727673, + "learning_rate": 7.421105482021084e-06, + "loss": 0.1906, + "step": 7517 + }, + { + "epoch": 0.5956030897207368, + "grad_norm": 1.3363220896736516, + "learning_rate": 7.41862633270098e-06, + "loss": 0.1521, + "step": 7518 + }, + { + "epoch": 0.5956823133293722, + "grad_norm": 1.3489488656597928, + "learning_rate": 7.416147353351909e-06, + "loss": 0.2176, + "step": 7519 + }, + { + "epoch": 0.5957615369380075, + "grad_norm": 1.7409845482961002, + "learning_rate": 7.4136685441371025e-06, + "loss": 0.2367, + "step": 7520 + }, + { + "epoch": 0.5958407605466429, + "grad_norm": 1.5579907491199265, + "learning_rate": 7.41118990521978e-06, + "loss": 0.197, + "step": 7521 + }, + { + "epoch": 0.5959199841552782, + "grad_norm": 1.66325078638249, + "learning_rate": 7.408711436763143e-06, + "loss": 0.2323, + "step": 7522 + }, + { + "epoch": 0.5959992077639137, + "grad_norm": 1.514226489610716, + "learning_rate": 7.406233138930389e-06, + "loss": 0.224, + "step": 7523 + }, + { + "epoch": 0.596078431372549, + "grad_norm": 1.301425961876574, + "learning_rate": 7.4037550118847044e-06, + "loss": 0.1894, + "step": 7524 + }, + { + "epoch": 0.5961576549811844, + "grad_norm": 2.302043459119219, + "learning_rate": 7.401277055789259e-06, + "loss": 0.234, + "step": 7525 + }, + { + "epoch": 0.5962368785898198, + "grad_norm": 1.6691946135400209, + "learning_rate": 7.398799270807217e-06, + "loss": 0.2055, + "step": 7526 + }, + { + "epoch": 0.5963161021984551, + "grad_norm": 1.4852209304539714, + "learning_rate": 7.3963216571017235e-06, + "loss": 0.222, + "step": 7527 + }, + { + "epoch": 0.5963953258070905, + "grad_norm": 1.3847414162680347, + "learning_rate": 7.3938442148359215e-06, + "loss": 0.1793, + "step": 7528 + }, + { + "epoch": 0.5964745494157259, + "grad_norm": 1.8818345843921624, + "learning_rate": 7.391366944172941e-06, + "loss": 0.2033, + "step": 7529 + }, + { + "epoch": 0.5965537730243613, + "grad_norm": 1.949855559570462, + "learning_rate": 7.388889845275893e-06, + "loss": 0.2678, + "step": 7530 + }, + { + "epoch": 0.5966329966329966, + "grad_norm": 2.0744829222270305, + "learning_rate": 7.3864129183078835e-06, + "loss": 0.2881, + "step": 7531 + }, + { + "epoch": 0.596712220241632, + "grad_norm": 1.8056034247734498, + "learning_rate": 7.38393616343201e-06, + "loss": 0.1992, + "step": 7532 + }, + { + "epoch": 0.5967914438502674, + "grad_norm": 1.4540548937764923, + "learning_rate": 7.381459580811352e-06, + "loss": 0.1902, + "step": 7533 + }, + { + "epoch": 0.5968706674589027, + "grad_norm": 1.2133129477610578, + "learning_rate": 7.378983170608982e-06, + "loss": 0.166, + "step": 7534 + }, + { + "epoch": 0.5969498910675382, + "grad_norm": 1.4185713740831576, + "learning_rate": 7.376506932987956e-06, + "loss": 0.2411, + "step": 7535 + }, + { + "epoch": 0.5970291146761735, + "grad_norm": 1.2997916437712833, + "learning_rate": 7.374030868111326e-06, + "loss": 0.1401, + "step": 7536 + }, + { + "epoch": 0.5971083382848089, + "grad_norm": 1.717198331922666, + "learning_rate": 7.371554976142128e-06, + "loss": 0.184, + "step": 7537 + }, + { + "epoch": 0.5971875618934442, + "grad_norm": 1.533675257115316, + "learning_rate": 7.369079257243388e-06, + "loss": 0.1964, + "step": 7538 + }, + { + "epoch": 0.5972667855020796, + "grad_norm": 1.4133944231430584, + "learning_rate": 7.366603711578119e-06, + "loss": 0.138, + "step": 7539 + }, + { + "epoch": 0.597346009110715, + "grad_norm": 1.0047709943714092, + "learning_rate": 7.364128339309326e-06, + "loss": 0.1318, + "step": 7540 + }, + { + "epoch": 0.5974252327193503, + "grad_norm": 1.6690406345432887, + "learning_rate": 7.361653140599997e-06, + "loss": 0.227, + "step": 7541 + }, + { + "epoch": 0.5975044563279858, + "grad_norm": 1.473319163574499, + "learning_rate": 7.359178115613116e-06, + "loss": 0.1581, + "step": 7542 + }, + { + "epoch": 0.5975836799366211, + "grad_norm": 1.7992236191033433, + "learning_rate": 7.356703264511646e-06, + "loss": 0.259, + "step": 7543 + }, + { + "epoch": 0.5976629035452565, + "grad_norm": 1.5309951035045426, + "learning_rate": 7.354228587458549e-06, + "loss": 0.1908, + "step": 7544 + }, + { + "epoch": 0.5977421271538919, + "grad_norm": 1.5413873328517331, + "learning_rate": 7.351754084616771e-06, + "loss": 0.2023, + "step": 7545 + }, + { + "epoch": 0.5978213507625272, + "grad_norm": 1.8613854969727364, + "learning_rate": 7.349279756149241e-06, + "loss": 0.2618, + "step": 7546 + }, + { + "epoch": 0.5979005743711626, + "grad_norm": 1.3782039045483327, + "learning_rate": 7.346805602218885e-06, + "loss": 0.1429, + "step": 7547 + }, + { + "epoch": 0.597979797979798, + "grad_norm": 1.0950435839528003, + "learning_rate": 7.344331622988616e-06, + "loss": 0.1412, + "step": 7548 + }, + { + "epoch": 0.5980590215884334, + "grad_norm": 1.2331082884778737, + "learning_rate": 7.341857818621328e-06, + "loss": 0.1536, + "step": 7549 + }, + { + "epoch": 0.5981382451970687, + "grad_norm": 1.3944433181556044, + "learning_rate": 7.339384189279917e-06, + "loss": 0.133, + "step": 7550 + }, + { + "epoch": 0.5982174688057041, + "grad_norm": 1.4105343209411911, + "learning_rate": 7.33691073512725e-06, + "loss": 0.1746, + "step": 7551 + }, + { + "epoch": 0.5982966924143395, + "grad_norm": 1.4134313314750464, + "learning_rate": 7.3344374563262e-06, + "loss": 0.168, + "step": 7552 + }, + { + "epoch": 0.5983759160229748, + "grad_norm": 1.4990726175232938, + "learning_rate": 7.3319643530396175e-06, + "loss": 0.2168, + "step": 7553 + }, + { + "epoch": 0.5984551396316102, + "grad_norm": 1.4439393721882599, + "learning_rate": 7.329491425430344e-06, + "loss": 0.156, + "step": 7554 + }, + { + "epoch": 0.5985343632402456, + "grad_norm": 1.8575783690839462, + "learning_rate": 7.327018673661209e-06, + "loss": 0.2932, + "step": 7555 + }, + { + "epoch": 0.598613586848881, + "grad_norm": 1.4803929070883979, + "learning_rate": 7.324546097895036e-06, + "loss": 0.2327, + "step": 7556 + }, + { + "epoch": 0.5986928104575163, + "grad_norm": 1.2508065486283875, + "learning_rate": 7.3220736982946275e-06, + "loss": 0.1574, + "step": 7557 + }, + { + "epoch": 0.5987720340661518, + "grad_norm": 1.7400344217433672, + "learning_rate": 7.3196014750227815e-06, + "loss": 0.2446, + "step": 7558 + }, + { + "epoch": 0.5988512576747871, + "grad_norm": 1.099608502479037, + "learning_rate": 7.317129428242279e-06, + "loss": 0.1695, + "step": 7559 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 1.54710041193007, + "learning_rate": 7.3146575581158945e-06, + "loss": 0.1717, + "step": 7560 + }, + { + "epoch": 0.5990097048920578, + "grad_norm": 1.470844078119281, + "learning_rate": 7.312185864806391e-06, + "loss": 0.2169, + "step": 7561 + }, + { + "epoch": 0.5990889285006932, + "grad_norm": 1.8732210895999644, + "learning_rate": 7.309714348476513e-06, + "loss": 0.2031, + "step": 7562 + }, + { + "epoch": 0.5991681521093286, + "grad_norm": 1.4888413490990122, + "learning_rate": 7.307243009289005e-06, + "loss": 0.1519, + "step": 7563 + }, + { + "epoch": 0.5992473757179639, + "grad_norm": 1.575642740569434, + "learning_rate": 7.304771847406582e-06, + "loss": 0.2201, + "step": 7564 + }, + { + "epoch": 0.5993265993265994, + "grad_norm": 1.3800534248022087, + "learning_rate": 7.3023008629919665e-06, + "loss": 0.1727, + "step": 7565 + }, + { + "epoch": 0.5994058229352347, + "grad_norm": 1.306370755599113, + "learning_rate": 7.299830056207861e-06, + "loss": 0.1615, + "step": 7566 + }, + { + "epoch": 0.59948504654387, + "grad_norm": 1.3689165387000066, + "learning_rate": 7.29735942721695e-06, + "loss": 0.1735, + "step": 7567 + }, + { + "epoch": 0.5995642701525055, + "grad_norm": 1.810228618296614, + "learning_rate": 7.294888976181919e-06, + "loss": 0.299, + "step": 7568 + }, + { + "epoch": 0.5996434937611408, + "grad_norm": 1.7211881323383862, + "learning_rate": 7.2924187032654335e-06, + "loss": 0.222, + "step": 7569 + }, + { + "epoch": 0.5997227173697762, + "grad_norm": 1.228565037460542, + "learning_rate": 7.289948608630146e-06, + "loss": 0.1868, + "step": 7570 + }, + { + "epoch": 0.5998019409784116, + "grad_norm": 1.7974019125352265, + "learning_rate": 7.287478692438705e-06, + "loss": 0.2586, + "step": 7571 + }, + { + "epoch": 0.599881164587047, + "grad_norm": 1.6644761638656067, + "learning_rate": 7.285008954853739e-06, + "loss": 0.2356, + "step": 7572 + }, + { + "epoch": 0.5999603881956823, + "grad_norm": 1.871366287911412, + "learning_rate": 7.282539396037868e-06, + "loss": 0.2008, + "step": 7573 + }, + { + "epoch": 0.6000396118043176, + "grad_norm": 1.4257322776430963, + "learning_rate": 7.280070016153706e-06, + "loss": 0.1895, + "step": 7574 + }, + { + "epoch": 0.6001188354129531, + "grad_norm": 1.3397558590154806, + "learning_rate": 7.277600815363842e-06, + "loss": 0.1387, + "step": 7575 + }, + { + "epoch": 0.6001980590215884, + "grad_norm": 1.6561727555965249, + "learning_rate": 7.275131793830865e-06, + "loss": 0.1888, + "step": 7576 + }, + { + "epoch": 0.6002772826302238, + "grad_norm": 1.679878499688621, + "learning_rate": 7.272662951717352e-06, + "loss": 0.24, + "step": 7577 + }, + { + "epoch": 0.6003565062388592, + "grad_norm": 1.3243416613694476, + "learning_rate": 7.270194289185858e-06, + "loss": 0.1599, + "step": 7578 + }, + { + "epoch": 0.6004357298474946, + "grad_norm": 1.7022961454073975, + "learning_rate": 7.267725806398936e-06, + "loss": 0.2084, + "step": 7579 + }, + { + "epoch": 0.6005149534561299, + "grad_norm": 1.7721215784350584, + "learning_rate": 7.265257503519122e-06, + "loss": 0.1778, + "step": 7580 + }, + { + "epoch": 0.6005941770647653, + "grad_norm": 1.5639756605187733, + "learning_rate": 7.262789380708942e-06, + "loss": 0.2644, + "step": 7581 + }, + { + "epoch": 0.6006734006734007, + "grad_norm": 1.3104953891714552, + "learning_rate": 7.260321438130913e-06, + "loss": 0.2004, + "step": 7582 + }, + { + "epoch": 0.600752624282036, + "grad_norm": 1.3226126988818692, + "learning_rate": 7.257853675947533e-06, + "loss": 0.1721, + "step": 7583 + }, + { + "epoch": 0.6008318478906715, + "grad_norm": 1.7804861638423175, + "learning_rate": 7.255386094321293e-06, + "loss": 0.2284, + "step": 7584 + }, + { + "epoch": 0.6009110714993068, + "grad_norm": 1.2520062156070564, + "learning_rate": 7.2529186934146756e-06, + "loss": 0.1329, + "step": 7585 + }, + { + "epoch": 0.6009902951079422, + "grad_norm": 1.6255730569270301, + "learning_rate": 7.250451473390141e-06, + "loss": 0.2111, + "step": 7586 + }, + { + "epoch": 0.6010695187165775, + "grad_norm": 1.28845236684381, + "learning_rate": 7.24798443441015e-06, + "loss": 0.1875, + "step": 7587 + }, + { + "epoch": 0.6011487423252129, + "grad_norm": 1.7687523318478398, + "learning_rate": 7.24551757663714e-06, + "loss": 0.175, + "step": 7588 + }, + { + "epoch": 0.6012279659338483, + "grad_norm": 1.3541737313406095, + "learning_rate": 7.2430509002335434e-06, + "loss": 0.2053, + "step": 7589 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 1.5810467398208936, + "learning_rate": 7.240584405361781e-06, + "loss": 0.2237, + "step": 7590 + }, + { + "epoch": 0.6013864131511191, + "grad_norm": 1.477947454842436, + "learning_rate": 7.238118092184256e-06, + "loss": 0.1613, + "step": 7591 + }, + { + "epoch": 0.6014656367597544, + "grad_norm": 1.7007612933797356, + "learning_rate": 7.2356519608633665e-06, + "loss": 0.2579, + "step": 7592 + }, + { + "epoch": 0.6015448603683898, + "grad_norm": 1.702217802654443, + "learning_rate": 7.233186011561498e-06, + "loss": 0.3759, + "step": 7593 + }, + { + "epoch": 0.6016240839770252, + "grad_norm": 1.5962176798448573, + "learning_rate": 7.230720244441016e-06, + "loss": 0.1757, + "step": 7594 + }, + { + "epoch": 0.6017033075856605, + "grad_norm": 1.6546982883316201, + "learning_rate": 7.228254659664278e-06, + "loss": 0.2649, + "step": 7595 + }, + { + "epoch": 0.6017825311942959, + "grad_norm": 1.3941126631045617, + "learning_rate": 7.225789257393636e-06, + "loss": 0.1673, + "step": 7596 + }, + { + "epoch": 0.6018617548029312, + "grad_norm": 1.6387317230827676, + "learning_rate": 7.223324037791421e-06, + "loss": 0.1928, + "step": 7597 + }, + { + "epoch": 0.6019409784115667, + "grad_norm": 1.9360923284859015, + "learning_rate": 7.220859001019957e-06, + "loss": 0.2389, + "step": 7598 + }, + { + "epoch": 0.602020202020202, + "grad_norm": 1.6318450898092798, + "learning_rate": 7.218394147241559e-06, + "loss": 0.2211, + "step": 7599 + }, + { + "epoch": 0.6020994256288374, + "grad_norm": 1.7303257492357589, + "learning_rate": 7.2159294766185174e-06, + "loss": 0.1931, + "step": 7600 + }, + { + "epoch": 0.6021786492374728, + "grad_norm": 1.2552983658595818, + "learning_rate": 7.213464989313126e-06, + "loss": 0.2034, + "step": 7601 + }, + { + "epoch": 0.6022578728461081, + "grad_norm": 1.4279397659427802, + "learning_rate": 7.211000685487658e-06, + "loss": 0.1985, + "step": 7602 + }, + { + "epoch": 0.6023370964547435, + "grad_norm": 1.6865625890477975, + "learning_rate": 7.208536565304374e-06, + "loss": 0.2601, + "step": 7603 + }, + { + "epoch": 0.6024163200633789, + "grad_norm": 1.4249315049439206, + "learning_rate": 7.206072628925526e-06, + "loss": 0.1831, + "step": 7604 + }, + { + "epoch": 0.6024955436720143, + "grad_norm": 1.6655400910839726, + "learning_rate": 7.203608876513351e-06, + "loss": 0.1877, + "step": 7605 + }, + { + "epoch": 0.6025747672806496, + "grad_norm": 1.6819901841355187, + "learning_rate": 7.201145308230075e-06, + "loss": 0.1418, + "step": 7606 + }, + { + "epoch": 0.602653990889285, + "grad_norm": 1.3693835928356586, + "learning_rate": 7.198681924237918e-06, + "loss": 0.1452, + "step": 7607 + }, + { + "epoch": 0.6027332144979204, + "grad_norm": 1.6954150141154036, + "learning_rate": 7.196218724699072e-06, + "loss": 0.2241, + "step": 7608 + }, + { + "epoch": 0.6028124381065557, + "grad_norm": 1.4867464992020691, + "learning_rate": 7.193755709775734e-06, + "loss": 0.231, + "step": 7609 + }, + { + "epoch": 0.6028916617151912, + "grad_norm": 1.6601739580110069, + "learning_rate": 7.191292879630081e-06, + "loss": 0.2537, + "step": 7610 + }, + { + "epoch": 0.6029708853238265, + "grad_norm": 1.1866121143397077, + "learning_rate": 7.188830234424275e-06, + "loss": 0.1146, + "step": 7611 + }, + { + "epoch": 0.6030501089324619, + "grad_norm": 1.743733347516833, + "learning_rate": 7.186367774320474e-06, + "loss": 0.2669, + "step": 7612 + }, + { + "epoch": 0.6031293325410972, + "grad_norm": 1.8904863862344754, + "learning_rate": 7.1839054994808145e-06, + "loss": 0.3211, + "step": 7613 + }, + { + "epoch": 0.6032085561497326, + "grad_norm": 1.4128432653830687, + "learning_rate": 7.181443410067428e-06, + "loss": 0.1389, + "step": 7614 + }, + { + "epoch": 0.603287779758368, + "grad_norm": 1.3912848230152774, + "learning_rate": 7.1789815062424325e-06, + "loss": 0.1715, + "step": 7615 + }, + { + "epoch": 0.6033670033670033, + "grad_norm": 1.6256105545046848, + "learning_rate": 7.176519788167929e-06, + "loss": 0.2261, + "step": 7616 + }, + { + "epoch": 0.6034462269756388, + "grad_norm": 1.88972356241294, + "learning_rate": 7.174058256006012e-06, + "loss": 0.2462, + "step": 7617 + }, + { + "epoch": 0.6035254505842741, + "grad_norm": 1.8384833083689796, + "learning_rate": 7.171596909918763e-06, + "loss": 0.2116, + "step": 7618 + }, + { + "epoch": 0.6036046741929095, + "grad_norm": 1.707874816952542, + "learning_rate": 7.169135750068247e-06, + "loss": 0.2439, + "step": 7619 + }, + { + "epoch": 0.6036838978015449, + "grad_norm": 1.5957493492893893, + "learning_rate": 7.1666747766165226e-06, + "loss": 0.2514, + "step": 7620 + }, + { + "epoch": 0.6037631214101802, + "grad_norm": 1.4690219957079473, + "learning_rate": 7.164213989725628e-06, + "loss": 0.1721, + "step": 7621 + }, + { + "epoch": 0.6038423450188156, + "grad_norm": 1.7962754932292309, + "learning_rate": 7.1617533895575975e-06, + "loss": 0.2104, + "step": 7622 + }, + { + "epoch": 0.6039215686274509, + "grad_norm": 1.5232146411566614, + "learning_rate": 7.1592929762744515e-06, + "loss": 0.2063, + "step": 7623 + }, + { + "epoch": 0.6040007922360864, + "grad_norm": 1.7222871363603482, + "learning_rate": 7.156832750038192e-06, + "loss": 0.2553, + "step": 7624 + }, + { + "epoch": 0.6040800158447217, + "grad_norm": 1.525523311669195, + "learning_rate": 7.154372711010815e-06, + "loss": 0.234, + "step": 7625 + }, + { + "epoch": 0.6041592394533571, + "grad_norm": 1.265709238587938, + "learning_rate": 7.1519128593543065e-06, + "loss": 0.1742, + "step": 7626 + }, + { + "epoch": 0.6042384630619925, + "grad_norm": 1.3485829711943362, + "learning_rate": 7.149453195230629e-06, + "loss": 0.1773, + "step": 7627 + }, + { + "epoch": 0.6043176866706278, + "grad_norm": 1.7329119017807024, + "learning_rate": 7.1469937188017444e-06, + "loss": 0.2286, + "step": 7628 + }, + { + "epoch": 0.6043969102792632, + "grad_norm": 1.6193505478847148, + "learning_rate": 7.144534430229595e-06, + "loss": 0.2037, + "step": 7629 + }, + { + "epoch": 0.6044761338878986, + "grad_norm": 1.5670374638429332, + "learning_rate": 7.142075329676112e-06, + "loss": 0.2326, + "step": 7630 + }, + { + "epoch": 0.604555357496534, + "grad_norm": 1.564422532751707, + "learning_rate": 7.139616417303221e-06, + "loss": 0.2628, + "step": 7631 + }, + { + "epoch": 0.6046345811051693, + "grad_norm": 1.8680598130350998, + "learning_rate": 7.137157693272822e-06, + "loss": 0.2541, + "step": 7632 + }, + { + "epoch": 0.6047138047138048, + "grad_norm": 1.3916907195722301, + "learning_rate": 7.1346991577468136e-06, + "loss": 0.1987, + "step": 7633 + }, + { + "epoch": 0.6047930283224401, + "grad_norm": 1.7075846033850193, + "learning_rate": 7.132240810887083e-06, + "loss": 0.2446, + "step": 7634 + }, + { + "epoch": 0.6048722519310754, + "grad_norm": 1.6268395678137915, + "learning_rate": 7.129782652855492e-06, + "loss": 0.1631, + "step": 7635 + }, + { + "epoch": 0.6049514755397108, + "grad_norm": 1.535001715235308, + "learning_rate": 7.127324683813906e-06, + "loss": 0.1956, + "step": 7636 + }, + { + "epoch": 0.6050306991483462, + "grad_norm": 1.6055154731305206, + "learning_rate": 7.124866903924164e-06, + "loss": 0.2002, + "step": 7637 + }, + { + "epoch": 0.6051099227569816, + "grad_norm": 1.5428118648356541, + "learning_rate": 7.122409313348102e-06, + "loss": 0.1383, + "step": 7638 + }, + { + "epoch": 0.6051891463656169, + "grad_norm": 1.9405258789655566, + "learning_rate": 7.119951912247545e-06, + "loss": 0.2723, + "step": 7639 + }, + { + "epoch": 0.6052683699742524, + "grad_norm": 1.5421867881043776, + "learning_rate": 7.117494700784292e-06, + "loss": 0.2086, + "step": 7640 + }, + { + "epoch": 0.6053475935828877, + "grad_norm": 1.2954947964818078, + "learning_rate": 7.115037679120147e-06, + "loss": 0.1651, + "step": 7641 + }, + { + "epoch": 0.605426817191523, + "grad_norm": 1.6585366235179484, + "learning_rate": 7.112580847416886e-06, + "loss": 0.2105, + "step": 7642 + }, + { + "epoch": 0.6055060408001585, + "grad_norm": 1.652038128325132, + "learning_rate": 7.110124205836283e-06, + "loss": 0.2311, + "step": 7643 + }, + { + "epoch": 0.6055852644087938, + "grad_norm": 1.2432018411473804, + "learning_rate": 7.107667754540097e-06, + "loss": 0.146, + "step": 7644 + }, + { + "epoch": 0.6056644880174292, + "grad_norm": 1.4015713876787912, + "learning_rate": 7.105211493690073e-06, + "loss": 0.1666, + "step": 7645 + }, + { + "epoch": 0.6057437116260646, + "grad_norm": 1.4908357237233425, + "learning_rate": 7.102755423447941e-06, + "loss": 0.2707, + "step": 7646 + }, + { + "epoch": 0.6058229352347, + "grad_norm": 1.7435092907695804, + "learning_rate": 7.100299543975426e-06, + "loss": 0.234, + "step": 7647 + }, + { + "epoch": 0.6059021588433353, + "grad_norm": 1.6902016222644076, + "learning_rate": 7.097843855434232e-06, + "loss": 0.1954, + "step": 7648 + }, + { + "epoch": 0.6059813824519706, + "grad_norm": 1.2524984582562644, + "learning_rate": 7.09538835798606e-06, + "loss": 0.1754, + "step": 7649 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.829532888756569, + "learning_rate": 7.092933051792583e-06, + "loss": 0.2642, + "step": 7650 + }, + { + "epoch": 0.6061398296692414, + "grad_norm": 1.8607923212682247, + "learning_rate": 7.090477937015479e-06, + "loss": 0.2222, + "step": 7651 + }, + { + "epoch": 0.6062190532778768, + "grad_norm": 1.6300736121866581, + "learning_rate": 7.088023013816403e-06, + "loss": 0.1911, + "step": 7652 + }, + { + "epoch": 0.6062982768865122, + "grad_norm": 1.5452479706372202, + "learning_rate": 7.085568282357e-06, + "loss": 0.1262, + "step": 7653 + }, + { + "epoch": 0.6063775004951476, + "grad_norm": 1.2677158022075257, + "learning_rate": 7.083113742798901e-06, + "loss": 0.1722, + "step": 7654 + }, + { + "epoch": 0.6064567241037829, + "grad_norm": 1.5793223069701792, + "learning_rate": 7.080659395303729e-06, + "loss": 0.1809, + "step": 7655 + }, + { + "epoch": 0.6065359477124183, + "grad_norm": 1.7591775682165742, + "learning_rate": 7.078205240033087e-06, + "loss": 0.289, + "step": 7656 + }, + { + "epoch": 0.6066151713210537, + "grad_norm": 1.693679217202281, + "learning_rate": 7.075751277148574e-06, + "loss": 0.2241, + "step": 7657 + }, + { + "epoch": 0.606694394929689, + "grad_norm": 1.5230415465470217, + "learning_rate": 7.073297506811766e-06, + "loss": 0.1873, + "step": 7658 + }, + { + "epoch": 0.6067736185383245, + "grad_norm": 1.8421205282833288, + "learning_rate": 7.0708439291842345e-06, + "loss": 0.2423, + "step": 7659 + }, + { + "epoch": 0.6068528421469598, + "grad_norm": 1.9853180657405853, + "learning_rate": 7.068390544427539e-06, + "loss": 0.2465, + "step": 7660 + }, + { + "epoch": 0.6069320657555952, + "grad_norm": 1.2262155409784774, + "learning_rate": 7.065937352703218e-06, + "loss": 0.1266, + "step": 7661 + }, + { + "epoch": 0.6070112893642305, + "grad_norm": 1.7988475703684603, + "learning_rate": 7.063484354172804e-06, + "loss": 0.2253, + "step": 7662 + }, + { + "epoch": 0.6070905129728659, + "grad_norm": 1.5244720804925753, + "learning_rate": 7.061031548997818e-06, + "loss": 0.1989, + "step": 7663 + }, + { + "epoch": 0.6071697365815013, + "grad_norm": 1.859338204933103, + "learning_rate": 7.058578937339759e-06, + "loss": 0.2297, + "step": 7664 + }, + { + "epoch": 0.6072489601901366, + "grad_norm": 1.1953807654098614, + "learning_rate": 7.056126519360129e-06, + "loss": 0.1569, + "step": 7665 + }, + { + "epoch": 0.6073281837987721, + "grad_norm": 1.4569953666997117, + "learning_rate": 7.053674295220399e-06, + "loss": 0.2211, + "step": 7666 + }, + { + "epoch": 0.6074074074074074, + "grad_norm": 1.2131943333860487, + "learning_rate": 7.05122226508204e-06, + "loss": 0.1547, + "step": 7667 + }, + { + "epoch": 0.6074866310160428, + "grad_norm": 1.8784771341243491, + "learning_rate": 7.048770429106509e-06, + "loss": 0.1855, + "step": 7668 + }, + { + "epoch": 0.6075658546246782, + "grad_norm": 1.5410335799529344, + "learning_rate": 7.0463187874552415e-06, + "loss": 0.1515, + "step": 7669 + }, + { + "epoch": 0.6076450782333135, + "grad_norm": 2.0534669847913, + "learning_rate": 7.043867340289672e-06, + "loss": 0.2554, + "step": 7670 + }, + { + "epoch": 0.6077243018419489, + "grad_norm": 1.6593503718491958, + "learning_rate": 7.0414160877712155e-06, + "loss": 0.2313, + "step": 7671 + }, + { + "epoch": 0.6078035254505842, + "grad_norm": 1.648280044604803, + "learning_rate": 7.038965030061273e-06, + "loss": 0.2631, + "step": 7672 + }, + { + "epoch": 0.6078827490592197, + "grad_norm": 1.5558673075659173, + "learning_rate": 7.0365141673212336e-06, + "loss": 0.2418, + "step": 7673 + }, + { + "epoch": 0.607961972667855, + "grad_norm": 1.4441030087286562, + "learning_rate": 7.034063499712479e-06, + "loss": 0.1983, + "step": 7674 + }, + { + "epoch": 0.6080411962764904, + "grad_norm": 1.4971161597896698, + "learning_rate": 7.031613027396369e-06, + "loss": 0.219, + "step": 7675 + }, + { + "epoch": 0.6081204198851258, + "grad_norm": 1.375924989171503, + "learning_rate": 7.029162750534259e-06, + "loss": 0.1398, + "step": 7676 + }, + { + "epoch": 0.6081996434937611, + "grad_norm": 1.5114631984579001, + "learning_rate": 7.02671266928749e-06, + "loss": 0.2326, + "step": 7677 + }, + { + "epoch": 0.6082788671023965, + "grad_norm": 1.663758060656713, + "learning_rate": 7.024262783817382e-06, + "loss": 0.2109, + "step": 7678 + }, + { + "epoch": 0.6083580907110319, + "grad_norm": 1.1813630021656605, + "learning_rate": 7.02181309428525e-06, + "loss": 0.1507, + "step": 7679 + }, + { + "epoch": 0.6084373143196673, + "grad_norm": 1.7339369630515733, + "learning_rate": 7.0193636008524e-06, + "loss": 0.1457, + "step": 7680 + }, + { + "epoch": 0.6085165379283026, + "grad_norm": 1.6476003094931166, + "learning_rate": 7.016914303680111e-06, + "loss": 0.2583, + "step": 7681 + }, + { + "epoch": 0.6085957615369381, + "grad_norm": 1.5549549286706958, + "learning_rate": 7.014465202929665e-06, + "loss": 0.2171, + "step": 7682 + }, + { + "epoch": 0.6086749851455734, + "grad_norm": 1.762090939941784, + "learning_rate": 7.012016298762317e-06, + "loss": 0.1781, + "step": 7683 + }, + { + "epoch": 0.6087542087542087, + "grad_norm": 2.113377464183182, + "learning_rate": 7.009567591339319e-06, + "loss": 0.1996, + "step": 7684 + }, + { + "epoch": 0.6088334323628442, + "grad_norm": 1.4403072082298425, + "learning_rate": 7.007119080821908e-06, + "loss": 0.1582, + "step": 7685 + }, + { + "epoch": 0.6089126559714795, + "grad_norm": 1.3993503290713227, + "learning_rate": 7.004670767371302e-06, + "loss": 0.1964, + "step": 7686 + }, + { + "epoch": 0.6089918795801149, + "grad_norm": 1.794888262809066, + "learning_rate": 7.002222651148714e-06, + "loss": 0.2128, + "step": 7687 + }, + { + "epoch": 0.6090711031887502, + "grad_norm": 1.961321137280887, + "learning_rate": 6.999774732315343e-06, + "loss": 0.1983, + "step": 7688 + }, + { + "epoch": 0.6091503267973856, + "grad_norm": 1.522627209550531, + "learning_rate": 6.9973270110323666e-06, + "loss": 0.2549, + "step": 7689 + }, + { + "epoch": 0.609229550406021, + "grad_norm": 1.7873904756391688, + "learning_rate": 6.994879487460961e-06, + "loss": 0.1871, + "step": 7690 + }, + { + "epoch": 0.6093087740146563, + "grad_norm": 1.4441905795424799, + "learning_rate": 6.992432161762278e-06, + "loss": 0.1475, + "step": 7691 + }, + { + "epoch": 0.6093879976232918, + "grad_norm": 1.748925683680036, + "learning_rate": 6.989985034097466e-06, + "loss": 0.1901, + "step": 7692 + }, + { + "epoch": 0.6094672212319271, + "grad_norm": 1.34826325200304, + "learning_rate": 6.9875381046276605e-06, + "loss": 0.1818, + "step": 7693 + }, + { + "epoch": 0.6095464448405625, + "grad_norm": 1.5428512100267902, + "learning_rate": 6.985091373513972e-06, + "loss": 0.2029, + "step": 7694 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 1.8332183640525483, + "learning_rate": 6.982644840917509e-06, + "loss": 0.2164, + "step": 7695 + }, + { + "epoch": 0.6097048920578332, + "grad_norm": 2.0335466422565385, + "learning_rate": 6.980198506999368e-06, + "loss": 0.222, + "step": 7696 + }, + { + "epoch": 0.6097841156664686, + "grad_norm": 1.5290000515964255, + "learning_rate": 6.977752371920623e-06, + "loss": 0.218, + "step": 7697 + }, + { + "epoch": 0.6098633392751039, + "grad_norm": 1.4460977793360785, + "learning_rate": 6.975306435842344e-06, + "loss": 0.1989, + "step": 7698 + }, + { + "epoch": 0.6099425628837394, + "grad_norm": 1.6477341896030866, + "learning_rate": 6.97286069892558e-06, + "loss": 0.265, + "step": 7699 + }, + { + "epoch": 0.6100217864923747, + "grad_norm": 1.8500834955056498, + "learning_rate": 6.970415161331373e-06, + "loss": 0.2353, + "step": 7700 + }, + { + "epoch": 0.6101010101010101, + "grad_norm": 1.7667719491407283, + "learning_rate": 6.967969823220752e-06, + "loss": 0.2656, + "step": 7701 + }, + { + "epoch": 0.6101802337096455, + "grad_norm": 1.4895866644490012, + "learning_rate": 6.965524684754729e-06, + "loss": 0.1598, + "step": 7702 + }, + { + "epoch": 0.6102594573182808, + "grad_norm": 1.544962785482416, + "learning_rate": 6.963079746094302e-06, + "loss": 0.2379, + "step": 7703 + }, + { + "epoch": 0.6103386809269162, + "grad_norm": 1.2851613208771364, + "learning_rate": 6.960635007400465e-06, + "loss": 0.131, + "step": 7704 + }, + { + "epoch": 0.6104179045355516, + "grad_norm": 1.108919454077273, + "learning_rate": 6.9581904688341854e-06, + "loss": 0.1359, + "step": 7705 + }, + { + "epoch": 0.610497128144187, + "grad_norm": 1.5599839118979857, + "learning_rate": 6.955746130556429e-06, + "loss": 0.239, + "step": 7706 + }, + { + "epoch": 0.6105763517528223, + "grad_norm": 1.579605500268842, + "learning_rate": 6.95330199272814e-06, + "loss": 0.2092, + "step": 7707 + }, + { + "epoch": 0.6106555753614578, + "grad_norm": 1.6134924206653056, + "learning_rate": 6.950858055510254e-06, + "loss": 0.2149, + "step": 7708 + }, + { + "epoch": 0.6107347989700931, + "grad_norm": 1.688202024288464, + "learning_rate": 6.948414319063696e-06, + "loss": 0.1677, + "step": 7709 + }, + { + "epoch": 0.6108140225787284, + "grad_norm": 1.1859036459295644, + "learning_rate": 6.945970783549372e-06, + "loss": 0.1573, + "step": 7710 + }, + { + "epoch": 0.6108932461873638, + "grad_norm": 1.6004186265449694, + "learning_rate": 6.943527449128174e-06, + "loss": 0.1525, + "step": 7711 + }, + { + "epoch": 0.6109724697959992, + "grad_norm": 1.5961064999682426, + "learning_rate": 6.9410843159609905e-06, + "loss": 0.2269, + "step": 7712 + }, + { + "epoch": 0.6110516934046346, + "grad_norm": 1.7675686310963605, + "learning_rate": 6.9386413842086845e-06, + "loss": 0.2737, + "step": 7713 + }, + { + "epoch": 0.6111309170132699, + "grad_norm": 1.4223694711734998, + "learning_rate": 6.936198654032114e-06, + "loss": 0.2081, + "step": 7714 + }, + { + "epoch": 0.6112101406219054, + "grad_norm": 1.2302534938709742, + "learning_rate": 6.933756125592117e-06, + "loss": 0.1603, + "step": 7715 + }, + { + "epoch": 0.6112893642305407, + "grad_norm": 1.401804184482667, + "learning_rate": 6.931313799049526e-06, + "loss": 0.1821, + "step": 7716 + }, + { + "epoch": 0.611368587839176, + "grad_norm": 1.436751158303, + "learning_rate": 6.928871674565158e-06, + "loss": 0.167, + "step": 7717 + }, + { + "epoch": 0.6114478114478115, + "grad_norm": 1.5394257648501508, + "learning_rate": 6.926429752299812e-06, + "loss": 0.2342, + "step": 7718 + }, + { + "epoch": 0.6115270350564468, + "grad_norm": 1.845597553088207, + "learning_rate": 6.923988032414277e-06, + "loss": 0.2925, + "step": 7719 + }, + { + "epoch": 0.6116062586650822, + "grad_norm": 1.334730553246682, + "learning_rate": 6.9215465150693305e-06, + "loss": 0.1611, + "step": 7720 + }, + { + "epoch": 0.6116854822737176, + "grad_norm": 1.7648865297550416, + "learning_rate": 6.919105200425733e-06, + "loss": 0.2268, + "step": 7721 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 1.7981697561396313, + "learning_rate": 6.916664088644234e-06, + "loss": 0.2298, + "step": 7722 + }, + { + "epoch": 0.6118439294909883, + "grad_norm": 1.6566984573709413, + "learning_rate": 6.914223179885567e-06, + "loss": 0.1867, + "step": 7723 + }, + { + "epoch": 0.6119231530996236, + "grad_norm": 1.5154254423614493, + "learning_rate": 6.911782474310456e-06, + "loss": 0.1598, + "step": 7724 + }, + { + "epoch": 0.6120023767082591, + "grad_norm": 1.708572628079177, + "learning_rate": 6.909341972079613e-06, + "loss": 0.259, + "step": 7725 + }, + { + "epoch": 0.6120816003168944, + "grad_norm": 1.3275729146856206, + "learning_rate": 6.9069016733537255e-06, + "loss": 0.1838, + "step": 7726 + }, + { + "epoch": 0.6121608239255298, + "grad_norm": 1.6814734664527193, + "learning_rate": 6.904461578293483e-06, + "loss": 0.1953, + "step": 7727 + }, + { + "epoch": 0.6122400475341652, + "grad_norm": 1.7965669989457957, + "learning_rate": 6.902021687059549e-06, + "loss": 0.2683, + "step": 7728 + }, + { + "epoch": 0.6123192711428006, + "grad_norm": 1.8788348195847642, + "learning_rate": 6.89958199981258e-06, + "loss": 0.287, + "step": 7729 + }, + { + "epoch": 0.6123984947514359, + "grad_norm": 2.0586221880761424, + "learning_rate": 6.89714251671322e-06, + "loss": 0.2441, + "step": 7730 + }, + { + "epoch": 0.6124777183600713, + "grad_norm": 1.339751772977914, + "learning_rate": 6.894703237922094e-06, + "loss": 0.1822, + "step": 7731 + }, + { + "epoch": 0.6125569419687067, + "grad_norm": 2.0740174138075433, + "learning_rate": 6.892264163599817e-06, + "loss": 0.2834, + "step": 7732 + }, + { + "epoch": 0.612636165577342, + "grad_norm": 1.5504725615822852, + "learning_rate": 6.889825293906993e-06, + "loss": 0.1994, + "step": 7733 + }, + { + "epoch": 0.6127153891859775, + "grad_norm": 1.5590286484174887, + "learning_rate": 6.887386629004207e-06, + "loss": 0.2114, + "step": 7734 + }, + { + "epoch": 0.6127946127946128, + "grad_norm": 1.5428772940580473, + "learning_rate": 6.884948169052037e-06, + "loss": 0.1975, + "step": 7735 + }, + { + "epoch": 0.6128738364032482, + "grad_norm": 2.4669080910655863, + "learning_rate": 6.88250991421104e-06, + "loss": 0.2354, + "step": 7736 + }, + { + "epoch": 0.6129530600118835, + "grad_norm": 1.545272880772027, + "learning_rate": 6.880071864641762e-06, + "loss": 0.1999, + "step": 7737 + }, + { + "epoch": 0.6130322836205189, + "grad_norm": 1.298425861912459, + "learning_rate": 6.8776340205047446e-06, + "loss": 0.2083, + "step": 7738 + }, + { + "epoch": 0.6131115072291543, + "grad_norm": 1.4435926098110319, + "learning_rate": 6.875196381960498e-06, + "loss": 0.1994, + "step": 7739 + }, + { + "epoch": 0.6131907308377896, + "grad_norm": 1.1964840107975034, + "learning_rate": 6.872758949169536e-06, + "loss": 0.1677, + "step": 7740 + }, + { + "epoch": 0.6132699544464251, + "grad_norm": 1.366476003600934, + "learning_rate": 6.8703217222923525e-06, + "loss": 0.2017, + "step": 7741 + }, + { + "epoch": 0.6133491780550604, + "grad_norm": 1.1843385763406453, + "learning_rate": 6.867884701489421e-06, + "loss": 0.1219, + "step": 7742 + }, + { + "epoch": 0.6134284016636958, + "grad_norm": 1.2974364316455382, + "learning_rate": 6.865447886921215e-06, + "loss": 0.0953, + "step": 7743 + }, + { + "epoch": 0.6135076252723312, + "grad_norm": 1.7274479229808215, + "learning_rate": 6.86301127874818e-06, + "loss": 0.2562, + "step": 7744 + }, + { + "epoch": 0.6135868488809665, + "grad_norm": 1.5496842102806074, + "learning_rate": 6.860574877130757e-06, + "loss": 0.2233, + "step": 7745 + }, + { + "epoch": 0.6136660724896019, + "grad_norm": 1.467108760413429, + "learning_rate": 6.8581386822293765e-06, + "loss": 0.1905, + "step": 7746 + }, + { + "epoch": 0.6137452960982372, + "grad_norm": 2.0026071055842762, + "learning_rate": 6.8557026942044425e-06, + "loss": 0.3157, + "step": 7747 + }, + { + "epoch": 0.6138245197068727, + "grad_norm": 1.1907568652405425, + "learning_rate": 6.853266913216357e-06, + "loss": 0.1012, + "step": 7748 + }, + { + "epoch": 0.613903743315508, + "grad_norm": 1.4096046627866865, + "learning_rate": 6.850831339425508e-06, + "loss": 0.1969, + "step": 7749 + }, + { + "epoch": 0.6139829669241434, + "grad_norm": 1.4932945929157266, + "learning_rate": 6.848395972992261e-06, + "loss": 0.2058, + "step": 7750 + }, + { + "epoch": 0.6140621905327788, + "grad_norm": 1.5450903853733862, + "learning_rate": 6.845960814076973e-06, + "loss": 0.2726, + "step": 7751 + }, + { + "epoch": 0.6141414141414141, + "grad_norm": 1.582389236581247, + "learning_rate": 6.8435258628399905e-06, + "loss": 0.1955, + "step": 7752 + }, + { + "epoch": 0.6142206377500495, + "grad_norm": 1.4360353473784515, + "learning_rate": 6.841091119441639e-06, + "loss": 0.2029, + "step": 7753 + }, + { + "epoch": 0.6142998613586849, + "grad_norm": 1.4553466143703913, + "learning_rate": 6.8386565840422385e-06, + "loss": 0.2256, + "step": 7754 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 1.4443511229085004, + "learning_rate": 6.836222256802093e-06, + "loss": 0.174, + "step": 7755 + }, + { + "epoch": 0.6144583085759556, + "grad_norm": 1.5692939212542871, + "learning_rate": 6.833788137881486e-06, + "loss": 0.2543, + "step": 7756 + }, + { + "epoch": 0.6145375321845911, + "grad_norm": 1.470408084499561, + "learning_rate": 6.8313542274406964e-06, + "loss": 0.1969, + "step": 7757 + }, + { + "epoch": 0.6146167557932264, + "grad_norm": 1.049736007709788, + "learning_rate": 6.828920525639985e-06, + "loss": 0.1592, + "step": 7758 + }, + { + "epoch": 0.6146959794018617, + "grad_norm": 1.2899274194319184, + "learning_rate": 6.826487032639597e-06, + "loss": 0.1583, + "step": 7759 + }, + { + "epoch": 0.6147752030104972, + "grad_norm": 1.6403175528594522, + "learning_rate": 6.8240537485997704e-06, + "loss": 0.2151, + "step": 7760 + }, + { + "epoch": 0.6148544266191325, + "grad_norm": 1.712396712258188, + "learning_rate": 6.821620673680721e-06, + "loss": 0.2516, + "step": 7761 + }, + { + "epoch": 0.6149336502277679, + "grad_norm": 1.6236426175346228, + "learning_rate": 6.819187808042656e-06, + "loss": 0.2148, + "step": 7762 + }, + { + "epoch": 0.6150128738364032, + "grad_norm": 1.6843082427058818, + "learning_rate": 6.816755151845771e-06, + "loss": 0.2313, + "step": 7763 + }, + { + "epoch": 0.6150920974450387, + "grad_norm": 1.7917335483741674, + "learning_rate": 6.814322705250241e-06, + "loss": 0.2481, + "step": 7764 + }, + { + "epoch": 0.615171321053674, + "grad_norm": 1.947116269951721, + "learning_rate": 6.8118904684162325e-06, + "loss": 0.3276, + "step": 7765 + }, + { + "epoch": 0.6152505446623093, + "grad_norm": 1.695559285051137, + "learning_rate": 6.8094584415038975e-06, + "loss": 0.2957, + "step": 7766 + }, + { + "epoch": 0.6153297682709448, + "grad_norm": 1.525461372475352, + "learning_rate": 6.807026624673372e-06, + "loss": 0.1682, + "step": 7767 + }, + { + "epoch": 0.6154089918795801, + "grad_norm": 1.2115235720647932, + "learning_rate": 6.80459501808478e-06, + "loss": 0.1593, + "step": 7768 + }, + { + "epoch": 0.6154882154882155, + "grad_norm": 1.4850956226928616, + "learning_rate": 6.8021636218982275e-06, + "loss": 0.1378, + "step": 7769 + }, + { + "epoch": 0.6155674390968509, + "grad_norm": 1.7707901458498578, + "learning_rate": 6.799732436273816e-06, + "loss": 0.1713, + "step": 7770 + }, + { + "epoch": 0.6156466627054862, + "grad_norm": 1.3081373022727951, + "learning_rate": 6.797301461371626e-06, + "loss": 0.1538, + "step": 7771 + }, + { + "epoch": 0.6157258863141216, + "grad_norm": 1.519540473896888, + "learning_rate": 6.7948706973517235e-06, + "loss": 0.1993, + "step": 7772 + }, + { + "epoch": 0.6158051099227569, + "grad_norm": 1.5413612759578315, + "learning_rate": 6.792440144374162e-06, + "loss": 0.2383, + "step": 7773 + }, + { + "epoch": 0.6158843335313924, + "grad_norm": 1.2274614108806103, + "learning_rate": 6.790009802598984e-06, + "loss": 0.1618, + "step": 7774 + }, + { + "epoch": 0.6159635571400277, + "grad_norm": 1.7465765715264476, + "learning_rate": 6.787579672186215e-06, + "loss": 0.244, + "step": 7775 + }, + { + "epoch": 0.6160427807486631, + "grad_norm": 1.1561774305577743, + "learning_rate": 6.78514975329587e-06, + "loss": 0.1273, + "step": 7776 + }, + { + "epoch": 0.6161220043572985, + "grad_norm": 1.4510180561565333, + "learning_rate": 6.78272004608794e-06, + "loss": 0.1678, + "step": 7777 + }, + { + "epoch": 0.6162012279659338, + "grad_norm": 1.77449949108753, + "learning_rate": 6.780290550722417e-06, + "loss": 0.3752, + "step": 7778 + }, + { + "epoch": 0.6162804515745692, + "grad_norm": 1.2822063468207703, + "learning_rate": 6.777861267359272e-06, + "loss": 0.1562, + "step": 7779 + }, + { + "epoch": 0.6163596751832046, + "grad_norm": 2.024009239336556, + "learning_rate": 6.7754321961584535e-06, + "loss": 0.1841, + "step": 7780 + }, + { + "epoch": 0.61643889879184, + "grad_norm": 1.3148775566139552, + "learning_rate": 6.773003337279911e-06, + "loss": 0.1429, + "step": 7781 + }, + { + "epoch": 0.6165181224004753, + "grad_norm": 1.5292328856004207, + "learning_rate": 6.7705746908835734e-06, + "loss": 0.2279, + "step": 7782 + }, + { + "epoch": 0.6165973460091108, + "grad_norm": 1.3347754511220324, + "learning_rate": 6.768146257129351e-06, + "loss": 0.1628, + "step": 7783 + }, + { + "epoch": 0.6166765696177461, + "grad_norm": 1.8389521658442611, + "learning_rate": 6.765718036177148e-06, + "loss": 0.2983, + "step": 7784 + }, + { + "epoch": 0.6167557932263814, + "grad_norm": 1.5770412851955178, + "learning_rate": 6.763290028186849e-06, + "loss": 0.2232, + "step": 7785 + }, + { + "epoch": 0.6168350168350168, + "grad_norm": 1.7306297190951418, + "learning_rate": 6.760862233318327e-06, + "loss": 0.2333, + "step": 7786 + }, + { + "epoch": 0.6169142404436522, + "grad_norm": 1.9176375084991417, + "learning_rate": 6.758434651731445e-06, + "loss": 0.2297, + "step": 7787 + }, + { + "epoch": 0.6169934640522876, + "grad_norm": 1.815241484907113, + "learning_rate": 6.756007283586039e-06, + "loss": 0.1927, + "step": 7788 + }, + { + "epoch": 0.6170726876609229, + "grad_norm": 1.4777259085232395, + "learning_rate": 6.753580129041945e-06, + "loss": 0.2125, + "step": 7789 + }, + { + "epoch": 0.6171519112695584, + "grad_norm": 1.492731248945997, + "learning_rate": 6.751153188258983e-06, + "loss": 0.2192, + "step": 7790 + }, + { + "epoch": 0.6172311348781937, + "grad_norm": 2.0077460775507228, + "learning_rate": 6.748726461396946e-06, + "loss": 0.1549, + "step": 7791 + }, + { + "epoch": 0.617310358486829, + "grad_norm": 1.261488324419248, + "learning_rate": 6.7462999486156315e-06, + "loss": 0.1587, + "step": 7792 + }, + { + "epoch": 0.6173895820954645, + "grad_norm": 1.2649155249767312, + "learning_rate": 6.743873650074807e-06, + "loss": 0.1577, + "step": 7793 + }, + { + "epoch": 0.6174688057040998, + "grad_norm": 1.5651750934289832, + "learning_rate": 6.741447565934236e-06, + "loss": 0.208, + "step": 7794 + }, + { + "epoch": 0.6175480293127352, + "grad_norm": 1.2790467364672031, + "learning_rate": 6.739021696353665e-06, + "loss": 0.1304, + "step": 7795 + }, + { + "epoch": 0.6176272529213706, + "grad_norm": 1.9962154254589741, + "learning_rate": 6.736596041492821e-06, + "loss": 0.2469, + "step": 7796 + }, + { + "epoch": 0.617706476530006, + "grad_norm": 1.8078689585872731, + "learning_rate": 6.734170601511427e-06, + "loss": 0.2865, + "step": 7797 + }, + { + "epoch": 0.6177857001386413, + "grad_norm": 1.4626140932035738, + "learning_rate": 6.7317453765691855e-06, + "loss": 0.1615, + "step": 7798 + }, + { + "epoch": 0.6178649237472766, + "grad_norm": 1.160603390317872, + "learning_rate": 6.729320366825785e-06, + "loss": 0.1276, + "step": 7799 + }, + { + "epoch": 0.6179441473559121, + "grad_norm": 1.4890843096656736, + "learning_rate": 6.726895572440901e-06, + "loss": 0.2062, + "step": 7800 + }, + { + "epoch": 0.6180233709645474, + "grad_norm": 2.058948624716214, + "learning_rate": 6.7244709935741925e-06, + "loss": 0.243, + "step": 7801 + }, + { + "epoch": 0.6181025945731828, + "grad_norm": 1.8117926833993583, + "learning_rate": 6.722046630385309e-06, + "loss": 0.2471, + "step": 7802 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 1.7202056452665793, + "learning_rate": 6.719622483033883e-06, + "loss": 0.2233, + "step": 7803 + }, + { + "epoch": 0.6182610417904536, + "grad_norm": 1.894869509695651, + "learning_rate": 6.7171985516795315e-06, + "loss": 0.1641, + "step": 7804 + }, + { + "epoch": 0.6183402653990889, + "grad_norm": 1.6469666587090124, + "learning_rate": 6.714774836481862e-06, + "loss": 0.2267, + "step": 7805 + }, + { + "epoch": 0.6184194890077243, + "grad_norm": 1.554052023337749, + "learning_rate": 6.71235133760046e-06, + "loss": 0.1635, + "step": 7806 + }, + { + "epoch": 0.6184987126163597, + "grad_norm": 1.4420637540528445, + "learning_rate": 6.709928055194902e-06, + "loss": 0.1475, + "step": 7807 + }, + { + "epoch": 0.618577936224995, + "grad_norm": 1.467254789872434, + "learning_rate": 6.707504989424753e-06, + "loss": 0.2286, + "step": 7808 + }, + { + "epoch": 0.6186571598336305, + "grad_norm": 1.393884241931426, + "learning_rate": 6.705082140449557e-06, + "loss": 0.1594, + "step": 7809 + }, + { + "epoch": 0.6187363834422658, + "grad_norm": 1.7762687766445593, + "learning_rate": 6.702659508428847e-06, + "loss": 0.214, + "step": 7810 + }, + { + "epoch": 0.6188156070509012, + "grad_norm": 1.552419636603836, + "learning_rate": 6.7002370935221454e-06, + "loss": 0.2276, + "step": 7811 + }, + { + "epoch": 0.6188948306595365, + "grad_norm": 1.377759892930884, + "learning_rate": 6.697814895888951e-06, + "loss": 0.1766, + "step": 7812 + }, + { + "epoch": 0.6189740542681719, + "grad_norm": 1.6026921271772006, + "learning_rate": 6.695392915688759e-06, + "loss": 0.1879, + "step": 7813 + }, + { + "epoch": 0.6190532778768073, + "grad_norm": 1.7550905475913694, + "learning_rate": 6.692971153081041e-06, + "loss": 0.2527, + "step": 7814 + }, + { + "epoch": 0.6191325014854426, + "grad_norm": 1.436363118299248, + "learning_rate": 6.690549608225258e-06, + "loss": 0.166, + "step": 7815 + }, + { + "epoch": 0.6192117250940781, + "grad_norm": 1.4071843002807982, + "learning_rate": 6.688128281280863e-06, + "loss": 0.1752, + "step": 7816 + }, + { + "epoch": 0.6192909487027134, + "grad_norm": 1.3391895742895448, + "learning_rate": 6.685707172407284e-06, + "loss": 0.1536, + "step": 7817 + }, + { + "epoch": 0.6193701723113488, + "grad_norm": 2.1293464971881866, + "learning_rate": 6.683286281763939e-06, + "loss": 0.1827, + "step": 7818 + }, + { + "epoch": 0.6194493959199842, + "grad_norm": 1.396718845030991, + "learning_rate": 6.6808656095102365e-06, + "loss": 0.1559, + "step": 7819 + }, + { + "epoch": 0.6195286195286195, + "grad_norm": 1.2961154980735496, + "learning_rate": 6.6784451558055596e-06, + "loss": 0.1639, + "step": 7820 + }, + { + "epoch": 0.6196078431372549, + "grad_norm": 1.7605337476373997, + "learning_rate": 6.67602492080929e-06, + "loss": 0.1778, + "step": 7821 + }, + { + "epoch": 0.6196870667458902, + "grad_norm": 1.7126802352449526, + "learning_rate": 6.6736049046807815e-06, + "loss": 0.1984, + "step": 7822 + }, + { + "epoch": 0.6197662903545257, + "grad_norm": 1.3718199537462228, + "learning_rate": 6.671185107579387e-06, + "loss": 0.1321, + "step": 7823 + }, + { + "epoch": 0.619845513963161, + "grad_norm": 1.3652329684378686, + "learning_rate": 6.668765529664436e-06, + "loss": 0.0989, + "step": 7824 + }, + { + "epoch": 0.6199247375717964, + "grad_norm": 1.4527634699668837, + "learning_rate": 6.6663461710952445e-06, + "loss": 0.1805, + "step": 7825 + }, + { + "epoch": 0.6200039611804318, + "grad_norm": 1.573924393543757, + "learning_rate": 6.663927032031118e-06, + "loss": 0.2202, + "step": 7826 + }, + { + "epoch": 0.6200831847890671, + "grad_norm": 1.9794009597699758, + "learning_rate": 6.661508112631347e-06, + "loss": 0.191, + "step": 7827 + }, + { + "epoch": 0.6201624083977025, + "grad_norm": 2.02077421575022, + "learning_rate": 6.659089413055202e-06, + "loss": 0.2438, + "step": 7828 + }, + { + "epoch": 0.6202416320063379, + "grad_norm": 1.6340115751289448, + "learning_rate": 6.656670933461942e-06, + "loss": 0.2239, + "step": 7829 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 1.303156684618766, + "learning_rate": 6.654252674010815e-06, + "loss": 0.1333, + "step": 7830 + }, + { + "epoch": 0.6204000792236086, + "grad_norm": 1.395190634999042, + "learning_rate": 6.6518346348610484e-06, + "loss": 0.1337, + "step": 7831 + }, + { + "epoch": 0.6204793028322441, + "grad_norm": 1.3471323328538636, + "learning_rate": 6.649416816171861e-06, + "loss": 0.193, + "step": 7832 + }, + { + "epoch": 0.6205585264408794, + "grad_norm": 1.7693470753972365, + "learning_rate": 6.646999218102457e-06, + "loss": 0.1921, + "step": 7833 + }, + { + "epoch": 0.6206377500495147, + "grad_norm": 2.006796199403565, + "learning_rate": 6.644581840812019e-06, + "loss": 0.2262, + "step": 7834 + }, + { + "epoch": 0.6207169736581502, + "grad_norm": 1.753958188899018, + "learning_rate": 6.64216468445972e-06, + "loss": 0.2657, + "step": 7835 + }, + { + "epoch": 0.6207961972667855, + "grad_norm": 1.633385184679226, + "learning_rate": 6.639747749204723e-06, + "loss": 0.2329, + "step": 7836 + }, + { + "epoch": 0.6208754208754209, + "grad_norm": 1.4922652045363833, + "learning_rate": 6.637331035206166e-06, + "loss": 0.1316, + "step": 7837 + }, + { + "epoch": 0.6209546444840562, + "grad_norm": 1.783399984258043, + "learning_rate": 6.634914542623182e-06, + "loss": 0.2565, + "step": 7838 + }, + { + "epoch": 0.6210338680926917, + "grad_norm": 1.484108623254551, + "learning_rate": 6.632498271614882e-06, + "loss": 0.2606, + "step": 7839 + }, + { + "epoch": 0.621113091701327, + "grad_norm": 1.5065649642025296, + "learning_rate": 6.630082222340366e-06, + "loss": 0.2385, + "step": 7840 + }, + { + "epoch": 0.6211923153099623, + "grad_norm": 1.7025695628454245, + "learning_rate": 6.627666394958725e-06, + "loss": 0.2565, + "step": 7841 + }, + { + "epoch": 0.6212715389185978, + "grad_norm": 2.40305701513319, + "learning_rate": 6.625250789629021e-06, + "loss": 0.2346, + "step": 7842 + }, + { + "epoch": 0.6213507625272331, + "grad_norm": 1.3311065730431535, + "learning_rate": 6.622835406510315e-06, + "loss": 0.1648, + "step": 7843 + }, + { + "epoch": 0.6214299861358685, + "grad_norm": 1.5544699034926304, + "learning_rate": 6.620420245761651e-06, + "loss": 0.1878, + "step": 7844 + }, + { + "epoch": 0.6215092097445039, + "grad_norm": 1.4568278149859846, + "learning_rate": 6.6180053075420484e-06, + "loss": 0.1697, + "step": 7845 + }, + { + "epoch": 0.6215884333531392, + "grad_norm": 1.7742298442586866, + "learning_rate": 6.615590592010526e-06, + "loss": 0.2611, + "step": 7846 + }, + { + "epoch": 0.6216676569617746, + "grad_norm": 2.6871618122073873, + "learning_rate": 6.613176099326077e-06, + "loss": 0.2244, + "step": 7847 + }, + { + "epoch": 0.6217468805704099, + "grad_norm": 1.2969903310902295, + "learning_rate": 6.610761829647685e-06, + "loss": 0.1597, + "step": 7848 + }, + { + "epoch": 0.6218261041790454, + "grad_norm": 1.4145144500394258, + "learning_rate": 6.608347783134319e-06, + "loss": 0.2058, + "step": 7849 + }, + { + "epoch": 0.6219053277876807, + "grad_norm": 1.4969661273895694, + "learning_rate": 6.605933959944933e-06, + "loss": 0.2164, + "step": 7850 + }, + { + "epoch": 0.6219845513963161, + "grad_norm": 1.3614776938651494, + "learning_rate": 6.603520360238462e-06, + "loss": 0.1955, + "step": 7851 + }, + { + "epoch": 0.6220637750049515, + "grad_norm": 1.8176205987293823, + "learning_rate": 6.601106984173835e-06, + "loss": 0.2122, + "step": 7852 + }, + { + "epoch": 0.6221429986135868, + "grad_norm": 1.8053032324702936, + "learning_rate": 6.598693831909957e-06, + "loss": 0.2071, + "step": 7853 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 1.6599932776281034, + "learning_rate": 6.596280903605725e-06, + "loss": 0.2571, + "step": 7854 + }, + { + "epoch": 0.6223014458308576, + "grad_norm": 1.4472467412106185, + "learning_rate": 6.593868199420017e-06, + "loss": 0.212, + "step": 7855 + }, + { + "epoch": 0.622380669439493, + "grad_norm": 1.5127955763863512, + "learning_rate": 6.591455719511699e-06, + "loss": 0.2028, + "step": 7856 + }, + { + "epoch": 0.6224598930481283, + "grad_norm": 1.6375932304639091, + "learning_rate": 6.589043464039624e-06, + "loss": 0.3083, + "step": 7857 + }, + { + "epoch": 0.6225391166567638, + "grad_norm": 1.9057316741438, + "learning_rate": 6.58663143316262e-06, + "loss": 0.285, + "step": 7858 + }, + { + "epoch": 0.6226183402653991, + "grad_norm": 1.6177226168771455, + "learning_rate": 6.584219627039513e-06, + "loss": 0.2264, + "step": 7859 + }, + { + "epoch": 0.6226975638740344, + "grad_norm": 1.4088879237062937, + "learning_rate": 6.58180804582911e-06, + "loss": 0.2026, + "step": 7860 + }, + { + "epoch": 0.6227767874826698, + "grad_norm": 1.34170129169816, + "learning_rate": 6.579396689690198e-06, + "loss": 0.1917, + "step": 7861 + }, + { + "epoch": 0.6228560110913052, + "grad_norm": 1.4353674413909872, + "learning_rate": 6.576985558781557e-06, + "loss": 0.1923, + "step": 7862 + }, + { + "epoch": 0.6229352346999406, + "grad_norm": 1.6550588694569888, + "learning_rate": 6.574574653261945e-06, + "loss": 0.2382, + "step": 7863 + }, + { + "epoch": 0.6230144583085759, + "grad_norm": 1.520229713270655, + "learning_rate": 6.572163973290109e-06, + "loss": 0.2127, + "step": 7864 + }, + { + "epoch": 0.6230936819172114, + "grad_norm": 1.8793905473605552, + "learning_rate": 6.569753519024784e-06, + "loss": 0.2602, + "step": 7865 + }, + { + "epoch": 0.6231729055258467, + "grad_norm": 1.7599093716223355, + "learning_rate": 6.567343290624683e-06, + "loss": 0.2268, + "step": 7866 + }, + { + "epoch": 0.623252129134482, + "grad_norm": 1.1536658109816318, + "learning_rate": 6.564933288248509e-06, + "loss": 0.1206, + "step": 7867 + }, + { + "epoch": 0.6233313527431175, + "grad_norm": 1.698620187974912, + "learning_rate": 6.562523512054951e-06, + "loss": 0.1711, + "step": 7868 + }, + { + "epoch": 0.6234105763517528, + "grad_norm": 1.2768363169274073, + "learning_rate": 6.560113962202679e-06, + "loss": 0.1192, + "step": 7869 + }, + { + "epoch": 0.6234897999603882, + "grad_norm": 1.6253243281763008, + "learning_rate": 6.557704638850352e-06, + "loss": 0.1982, + "step": 7870 + }, + { + "epoch": 0.6235690235690236, + "grad_norm": 1.5837026098292162, + "learning_rate": 6.555295542156609e-06, + "loss": 0.2504, + "step": 7871 + }, + { + "epoch": 0.623648247177659, + "grad_norm": 1.6127401725297927, + "learning_rate": 6.55288667228008e-06, + "loss": 0.1785, + "step": 7872 + }, + { + "epoch": 0.6237274707862943, + "grad_norm": 2.2175356392876604, + "learning_rate": 6.550478029379379e-06, + "loss": 0.2498, + "step": 7873 + }, + { + "epoch": 0.6238066943949296, + "grad_norm": 1.7599761311327187, + "learning_rate": 6.548069613613099e-06, + "loss": 0.2263, + "step": 7874 + }, + { + "epoch": 0.6238859180035651, + "grad_norm": 1.9372065568810095, + "learning_rate": 6.545661425139827e-06, + "loss": 0.2443, + "step": 7875 + }, + { + "epoch": 0.6239651416122004, + "grad_norm": 1.4477781353879597, + "learning_rate": 6.543253464118131e-06, + "loss": 0.1502, + "step": 7876 + }, + { + "epoch": 0.6240443652208358, + "grad_norm": 1.6544573759020447, + "learning_rate": 6.540845730706557e-06, + "loss": 0.1924, + "step": 7877 + }, + { + "epoch": 0.6241235888294712, + "grad_norm": 1.5159022206176498, + "learning_rate": 6.538438225063653e-06, + "loss": 0.1717, + "step": 7878 + }, + { + "epoch": 0.6242028124381066, + "grad_norm": 1.4576665570617475, + "learning_rate": 6.536030947347931e-06, + "loss": 0.1343, + "step": 7879 + }, + { + "epoch": 0.6242820360467419, + "grad_norm": 2.0244353800357553, + "learning_rate": 6.533623897717905e-06, + "loss": 0.2323, + "step": 7880 + }, + { + "epoch": 0.6243612596553773, + "grad_norm": 1.9392669886849387, + "learning_rate": 6.531217076332068e-06, + "loss": 0.2666, + "step": 7881 + }, + { + "epoch": 0.6244404832640127, + "grad_norm": 1.281287698891154, + "learning_rate": 6.528810483348893e-06, + "loss": 0.1574, + "step": 7882 + }, + { + "epoch": 0.624519706872648, + "grad_norm": 1.6101515704384937, + "learning_rate": 6.526404118926848e-06, + "loss": 0.1787, + "step": 7883 + }, + { + "epoch": 0.6245989304812835, + "grad_norm": 1.25469824957441, + "learning_rate": 6.523997983224375e-06, + "loss": 0.1561, + "step": 7884 + }, + { + "epoch": 0.6246781540899188, + "grad_norm": 1.7500032693895498, + "learning_rate": 6.52159207639991e-06, + "loss": 0.2571, + "step": 7885 + }, + { + "epoch": 0.6247573776985542, + "grad_norm": 1.6246852739696336, + "learning_rate": 6.519186398611872e-06, + "loss": 0.17, + "step": 7886 + }, + { + "epoch": 0.6248366013071895, + "grad_norm": 1.7273253689408932, + "learning_rate": 6.51678095001866e-06, + "loss": 0.1682, + "step": 7887 + }, + { + "epoch": 0.6249158249158249, + "grad_norm": 1.317922407366493, + "learning_rate": 6.51437573077866e-06, + "loss": 0.1542, + "step": 7888 + }, + { + "epoch": 0.6249950485244603, + "grad_norm": 1.2180161169104873, + "learning_rate": 6.5119707410502495e-06, + "loss": 0.1292, + "step": 7889 + }, + { + "epoch": 0.6250742721330956, + "grad_norm": 1.3836435381393863, + "learning_rate": 6.509565980991781e-06, + "loss": 0.1934, + "step": 7890 + }, + { + "epoch": 0.6251534957417311, + "grad_norm": 1.3183575081037062, + "learning_rate": 6.5071614507615985e-06, + "loss": 0.1424, + "step": 7891 + }, + { + "epoch": 0.6252327193503664, + "grad_norm": 1.3125494246746818, + "learning_rate": 6.5047571505180265e-06, + "loss": 0.1587, + "step": 7892 + }, + { + "epoch": 0.6253119429590018, + "grad_norm": 2.050193006069645, + "learning_rate": 6.502353080419379e-06, + "loss": 0.3957, + "step": 7893 + }, + { + "epoch": 0.6253911665676372, + "grad_norm": 1.7876277907671507, + "learning_rate": 6.4999492406239525e-06, + "loss": 0.1996, + "step": 7894 + }, + { + "epoch": 0.6254703901762725, + "grad_norm": 1.490852829465011, + "learning_rate": 6.497545631290025e-06, + "loss": 0.1917, + "step": 7895 + }, + { + "epoch": 0.6255496137849079, + "grad_norm": 1.3170881655199036, + "learning_rate": 6.495142252575866e-06, + "loss": 0.1758, + "step": 7896 + }, + { + "epoch": 0.6256288373935432, + "grad_norm": 1.5944141139829973, + "learning_rate": 6.492739104639727e-06, + "loss": 0.1948, + "step": 7897 + }, + { + "epoch": 0.6257080610021787, + "grad_norm": 1.5936983327495715, + "learning_rate": 6.490336187639841e-06, + "loss": 0.1439, + "step": 7898 + }, + { + "epoch": 0.625787284610814, + "grad_norm": 1.5801527451779789, + "learning_rate": 6.487933501734429e-06, + "loss": 0.2152, + "step": 7899 + }, + { + "epoch": 0.6258665082194494, + "grad_norm": 1.6507551259810622, + "learning_rate": 6.485531047081697e-06, + "loss": 0.1595, + "step": 7900 + }, + { + "epoch": 0.6259457318280848, + "grad_norm": 2.27734118783868, + "learning_rate": 6.483128823839835e-06, + "loss": 0.276, + "step": 7901 + }, + { + "epoch": 0.6260249554367201, + "grad_norm": 1.565796823546716, + "learning_rate": 6.480726832167019e-06, + "loss": 0.1672, + "step": 7902 + }, + { + "epoch": 0.6261041790453555, + "grad_norm": 1.5137469420682084, + "learning_rate": 6.4783250722214066e-06, + "loss": 0.1605, + "step": 7903 + }, + { + "epoch": 0.6261834026539909, + "grad_norm": 1.1787520873598003, + "learning_rate": 6.475923544161142e-06, + "loss": 0.1633, + "step": 7904 + }, + { + "epoch": 0.6262626262626263, + "grad_norm": 1.159742081282386, + "learning_rate": 6.473522248144359e-06, + "loss": 0.1469, + "step": 7905 + }, + { + "epoch": 0.6263418498712616, + "grad_norm": 1.2783348596837976, + "learning_rate": 6.471121184329167e-06, + "loss": 0.1124, + "step": 7906 + }, + { + "epoch": 0.6264210734798971, + "grad_norm": 1.7741075046613175, + "learning_rate": 6.468720352873662e-06, + "loss": 0.2386, + "step": 7907 + }, + { + "epoch": 0.6265002970885324, + "grad_norm": 1.4752064146101964, + "learning_rate": 6.466319753935933e-06, + "loss": 0.1936, + "step": 7908 + }, + { + "epoch": 0.6265795206971677, + "grad_norm": 1.284769464375227, + "learning_rate": 6.463919387674043e-06, + "loss": 0.1575, + "step": 7909 + }, + { + "epoch": 0.6266587443058032, + "grad_norm": 2.099376180836153, + "learning_rate": 6.461519254246046e-06, + "loss": 0.3644, + "step": 7910 + }, + { + "epoch": 0.6267379679144385, + "grad_norm": 1.491872246312762, + "learning_rate": 6.459119353809982e-06, + "loss": 0.1701, + "step": 7911 + }, + { + "epoch": 0.6268171915230739, + "grad_norm": 1.8710698682188274, + "learning_rate": 6.45671968652387e-06, + "loss": 0.2716, + "step": 7912 + }, + { + "epoch": 0.6268964151317092, + "grad_norm": 1.7134978834300063, + "learning_rate": 6.4543202525457175e-06, + "loss": 0.2674, + "step": 7913 + }, + { + "epoch": 0.6269756387403447, + "grad_norm": 1.4898753288129385, + "learning_rate": 6.451921052033516e-06, + "loss": 0.2058, + "step": 7914 + }, + { + "epoch": 0.62705486234898, + "grad_norm": 1.615596523491488, + "learning_rate": 6.449522085145241e-06, + "loss": 0.2131, + "step": 7915 + }, + { + "epoch": 0.6271340859576153, + "grad_norm": 1.3833741430795214, + "learning_rate": 6.447123352038853e-06, + "loss": 0.1284, + "step": 7916 + }, + { + "epoch": 0.6272133095662508, + "grad_norm": 1.4985700035634617, + "learning_rate": 6.444724852872297e-06, + "loss": 0.1886, + "step": 7917 + }, + { + "epoch": 0.6272925331748861, + "grad_norm": 1.4725338963061434, + "learning_rate": 6.4423265878035015e-06, + "loss": 0.1576, + "step": 7918 + }, + { + "epoch": 0.6273717567835215, + "grad_norm": 1.4798489321096, + "learning_rate": 6.439928556990382e-06, + "loss": 0.1729, + "step": 7919 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 1.5319312001039236, + "learning_rate": 6.437530760590838e-06, + "loss": 0.2167, + "step": 7920 + }, + { + "epoch": 0.6275302040007923, + "grad_norm": 2.5943755672081648, + "learning_rate": 6.435133198762751e-06, + "loss": 0.2177, + "step": 7921 + }, + { + "epoch": 0.6276094276094276, + "grad_norm": 1.5021069287035707, + "learning_rate": 6.432735871663991e-06, + "loss": 0.1498, + "step": 7922 + }, + { + "epoch": 0.6276886512180629, + "grad_norm": 1.3621013039014824, + "learning_rate": 6.430338779452407e-06, + "loss": 0.158, + "step": 7923 + }, + { + "epoch": 0.6277678748266984, + "grad_norm": 1.4806787941253534, + "learning_rate": 6.4279419222858416e-06, + "loss": 0.2161, + "step": 7924 + }, + { + "epoch": 0.6278470984353337, + "grad_norm": 1.419841761954037, + "learning_rate": 6.4255453003221115e-06, + "loss": 0.2166, + "step": 7925 + }, + { + "epoch": 0.6279263220439691, + "grad_norm": 1.3091978430645574, + "learning_rate": 6.423148913719022e-06, + "loss": 0.1468, + "step": 7926 + }, + { + "epoch": 0.6280055456526045, + "grad_norm": 1.673773160867503, + "learning_rate": 6.420752762634369e-06, + "loss": 0.2671, + "step": 7927 + }, + { + "epoch": 0.6280847692612398, + "grad_norm": 2.004494127782503, + "learning_rate": 6.4183568472259216e-06, + "loss": 0.2465, + "step": 7928 + }, + { + "epoch": 0.6281639928698752, + "grad_norm": 1.5316460797627829, + "learning_rate": 6.415961167651443e-06, + "loss": 0.1813, + "step": 7929 + }, + { + "epoch": 0.6282432164785106, + "grad_norm": 1.4609092935370414, + "learning_rate": 6.413565724068678e-06, + "loss": 0.1987, + "step": 7930 + }, + { + "epoch": 0.628322440087146, + "grad_norm": 1.6128836549760186, + "learning_rate": 6.4111705166353525e-06, + "loss": 0.2445, + "step": 7931 + }, + { + "epoch": 0.6284016636957813, + "grad_norm": 1.5546820546173823, + "learning_rate": 6.40877554550918e-06, + "loss": 0.232, + "step": 7932 + }, + { + "epoch": 0.6284808873044168, + "grad_norm": 1.4104342061217787, + "learning_rate": 6.406380810847856e-06, + "loss": 0.1533, + "step": 7933 + }, + { + "epoch": 0.6285601109130521, + "grad_norm": 1.7638370266461518, + "learning_rate": 6.403986312809065e-06, + "loss": 0.2476, + "step": 7934 + }, + { + "epoch": 0.6286393345216874, + "grad_norm": 1.3556109123773417, + "learning_rate": 6.401592051550475e-06, + "loss": 0.1523, + "step": 7935 + }, + { + "epoch": 0.6287185581303228, + "grad_norm": 1.659517186936443, + "learning_rate": 6.399198027229732e-06, + "loss": 0.2067, + "step": 7936 + }, + { + "epoch": 0.6287977817389582, + "grad_norm": 1.5913665494212388, + "learning_rate": 6.39680424000447e-06, + "loss": 0.2012, + "step": 7937 + }, + { + "epoch": 0.6288770053475936, + "grad_norm": 1.3731528814564413, + "learning_rate": 6.3944106900323174e-06, + "loss": 0.2018, + "step": 7938 + }, + { + "epoch": 0.6289562289562289, + "grad_norm": 1.6091954723861581, + "learning_rate": 6.392017377470867e-06, + "loss": 0.2359, + "step": 7939 + }, + { + "epoch": 0.6290354525648644, + "grad_norm": 1.4311807733127115, + "learning_rate": 6.389624302477715e-06, + "loss": 0.2047, + "step": 7940 + }, + { + "epoch": 0.6291146761734997, + "grad_norm": 1.4836520355515233, + "learning_rate": 6.387231465210428e-06, + "loss": 0.1784, + "step": 7941 + }, + { + "epoch": 0.629193899782135, + "grad_norm": 1.9051714198751553, + "learning_rate": 6.384838865826567e-06, + "loss": 0.2349, + "step": 7942 + }, + { + "epoch": 0.6292731233907705, + "grad_norm": 1.453331753354582, + "learning_rate": 6.382446504483672e-06, + "loss": 0.18, + "step": 7943 + }, + { + "epoch": 0.6293523469994058, + "grad_norm": 1.6644564917854334, + "learning_rate": 6.380054381339267e-06, + "loss": 0.2612, + "step": 7944 + }, + { + "epoch": 0.6294315706080412, + "grad_norm": 1.4451494250246275, + "learning_rate": 6.377662496550863e-06, + "loss": 0.1703, + "step": 7945 + }, + { + "epoch": 0.6295107942166766, + "grad_norm": 1.8435397218283522, + "learning_rate": 6.375270850275956e-06, + "loss": 0.2207, + "step": 7946 + }, + { + "epoch": 0.629590017825312, + "grad_norm": 1.400221202007302, + "learning_rate": 6.37287944267202e-06, + "loss": 0.1446, + "step": 7947 + }, + { + "epoch": 0.6296692414339473, + "grad_norm": 1.723450942102276, + "learning_rate": 6.370488273896522e-06, + "loss": 0.1875, + "step": 7948 + }, + { + "epoch": 0.6297484650425826, + "grad_norm": 1.7713840152730844, + "learning_rate": 6.368097344106905e-06, + "loss": 0.2051, + "step": 7949 + }, + { + "epoch": 0.6298276886512181, + "grad_norm": 1.3937403365606027, + "learning_rate": 6.365706653460602e-06, + "loss": 0.1856, + "step": 7950 + }, + { + "epoch": 0.6299069122598534, + "grad_norm": 1.5131896324848333, + "learning_rate": 6.363316202115033e-06, + "loss": 0.1781, + "step": 7951 + }, + { + "epoch": 0.6299861358684888, + "grad_norm": 1.3320947120240656, + "learning_rate": 6.3609259902275884e-06, + "loss": 0.1332, + "step": 7952 + }, + { + "epoch": 0.6300653594771242, + "grad_norm": 1.495553962659539, + "learning_rate": 6.358536017955659e-06, + "loss": 0.2295, + "step": 7953 + }, + { + "epoch": 0.6301445830857596, + "grad_norm": 1.7521066539337526, + "learning_rate": 6.3561462854566135e-06, + "loss": 0.1702, + "step": 7954 + }, + { + "epoch": 0.6302238066943949, + "grad_norm": 1.749518352108375, + "learning_rate": 6.3537567928878e-06, + "loss": 0.2705, + "step": 7955 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 1.4152723006226362, + "learning_rate": 6.3513675404065575e-06, + "loss": 0.118, + "step": 7956 + }, + { + "epoch": 0.6303822539116657, + "grad_norm": 1.543881319263055, + "learning_rate": 6.348978528170205e-06, + "loss": 0.2047, + "step": 7957 + }, + { + "epoch": 0.630461477520301, + "grad_norm": 1.2754375667232551, + "learning_rate": 6.34658975633605e-06, + "loss": 0.1566, + "step": 7958 + }, + { + "epoch": 0.6305407011289365, + "grad_norm": 1.278788340479489, + "learning_rate": 6.344201225061382e-06, + "loss": 0.1586, + "step": 7959 + }, + { + "epoch": 0.6306199247375718, + "grad_norm": 1.7023676523676752, + "learning_rate": 6.341812934503469e-06, + "loss": 0.1867, + "step": 7960 + }, + { + "epoch": 0.6306991483462072, + "grad_norm": 1.261038584144999, + "learning_rate": 6.339424884819574e-06, + "loss": 0.1401, + "step": 7961 + }, + { + "epoch": 0.6307783719548425, + "grad_norm": 1.5919417404903258, + "learning_rate": 6.337037076166939e-06, + "loss": 0.1946, + "step": 7962 + }, + { + "epoch": 0.6308575955634779, + "grad_norm": 1.529102757026119, + "learning_rate": 6.334649508702784e-06, + "loss": 0.1759, + "step": 7963 + }, + { + "epoch": 0.6309368191721133, + "grad_norm": 1.2855492439484082, + "learning_rate": 6.332262182584325e-06, + "loss": 0.1555, + "step": 7964 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 1.7316650697955416, + "learning_rate": 6.3298750979687515e-06, + "loss": 0.1791, + "step": 7965 + }, + { + "epoch": 0.6310952663893841, + "grad_norm": 1.3883250428557548, + "learning_rate": 6.327488255013244e-06, + "loss": 0.1399, + "step": 7966 + }, + { + "epoch": 0.6311744899980194, + "grad_norm": 1.8687207046112349, + "learning_rate": 6.325101653874965e-06, + "loss": 0.2542, + "step": 7967 + }, + { + "epoch": 0.6312537136066548, + "grad_norm": 1.6830502920266992, + "learning_rate": 6.322715294711057e-06, + "loss": 0.2258, + "step": 7968 + }, + { + "epoch": 0.6313329372152902, + "grad_norm": 1.3694793381171697, + "learning_rate": 6.320329177678656e-06, + "loss": 0.1634, + "step": 7969 + }, + { + "epoch": 0.6314121608239255, + "grad_norm": 1.7936779428889684, + "learning_rate": 6.31794330293487e-06, + "loss": 0.2409, + "step": 7970 + }, + { + "epoch": 0.6314913844325609, + "grad_norm": 1.466690001573824, + "learning_rate": 6.315557670636803e-06, + "loss": 0.2139, + "step": 7971 + }, + { + "epoch": 0.6315706080411962, + "grad_norm": 1.960243302503434, + "learning_rate": 6.313172280941534e-06, + "loss": 0.2877, + "step": 7972 + }, + { + "epoch": 0.6316498316498317, + "grad_norm": 1.3559473092017202, + "learning_rate": 6.31078713400613e-06, + "loss": 0.18, + "step": 7973 + }, + { + "epoch": 0.631729055258467, + "grad_norm": 1.8740920555756857, + "learning_rate": 6.308402229987641e-06, + "loss": 0.1647, + "step": 7974 + }, + { + "epoch": 0.6318082788671024, + "grad_norm": 1.9188085070924887, + "learning_rate": 6.3060175690431055e-06, + "loss": 0.2129, + "step": 7975 + }, + { + "epoch": 0.6318875024757378, + "grad_norm": 1.206371226723691, + "learning_rate": 6.303633151329535e-06, + "loss": 0.1141, + "step": 7976 + }, + { + "epoch": 0.6319667260843731, + "grad_norm": 1.358942659246891, + "learning_rate": 6.3012489770039396e-06, + "loss": 0.1555, + "step": 7977 + }, + { + "epoch": 0.6320459496930085, + "grad_norm": 1.966331961576497, + "learning_rate": 6.2988650462232995e-06, + "loss": 0.2249, + "step": 7978 + }, + { + "epoch": 0.6321251733016439, + "grad_norm": 1.328642638467315, + "learning_rate": 6.296481359144587e-06, + "loss": 0.1313, + "step": 7979 + }, + { + "epoch": 0.6322043969102793, + "grad_norm": 1.7156812657943237, + "learning_rate": 6.29409791592476e-06, + "loss": 0.2576, + "step": 7980 + }, + { + "epoch": 0.6322836205189146, + "grad_norm": 1.4620505214140445, + "learning_rate": 6.2917147167207495e-06, + "loss": 0.1296, + "step": 7981 + }, + { + "epoch": 0.6323628441275501, + "grad_norm": 1.2551153104109596, + "learning_rate": 6.289331761689482e-06, + "loss": 0.1693, + "step": 7982 + }, + { + "epoch": 0.6324420677361854, + "grad_norm": 1.776775111513521, + "learning_rate": 6.286949050987868e-06, + "loss": 0.2249, + "step": 7983 + }, + { + "epoch": 0.6325212913448207, + "grad_norm": 1.6480028185670865, + "learning_rate": 6.284566584772791e-06, + "loss": 0.1856, + "step": 7984 + }, + { + "epoch": 0.6326005149534562, + "grad_norm": 1.229197666278799, + "learning_rate": 6.2821843632011245e-06, + "loss": 0.1475, + "step": 7985 + }, + { + "epoch": 0.6326797385620915, + "grad_norm": 2.0871086808008945, + "learning_rate": 6.2798023864297315e-06, + "loss": 0.2383, + "step": 7986 + }, + { + "epoch": 0.6327589621707269, + "grad_norm": 1.384433276064647, + "learning_rate": 6.277420654615449e-06, + "loss": 0.1619, + "step": 7987 + }, + { + "epoch": 0.6328381857793622, + "grad_norm": 1.2986283054057641, + "learning_rate": 6.275039167915103e-06, + "loss": 0.1619, + "step": 7988 + }, + { + "epoch": 0.6329174093879977, + "grad_norm": 1.1253483275005327, + "learning_rate": 6.2726579264855084e-06, + "loss": 0.1078, + "step": 7989 + }, + { + "epoch": 0.632996632996633, + "grad_norm": 1.647147818670608, + "learning_rate": 6.270276930483451e-06, + "loss": 0.1988, + "step": 7990 + }, + { + "epoch": 0.6330758566052683, + "grad_norm": 1.370382546463394, + "learning_rate": 6.267896180065711e-06, + "loss": 0.1607, + "step": 7991 + }, + { + "epoch": 0.6331550802139038, + "grad_norm": 1.3730958137614806, + "learning_rate": 6.265515675389053e-06, + "loss": 0.1264, + "step": 7992 + }, + { + "epoch": 0.6332343038225391, + "grad_norm": 1.714674195744542, + "learning_rate": 6.263135416610217e-06, + "loss": 0.2455, + "step": 7993 + }, + { + "epoch": 0.6333135274311745, + "grad_norm": 1.561534783085332, + "learning_rate": 6.260755403885934e-06, + "loss": 0.2773, + "step": 7994 + }, + { + "epoch": 0.6333927510398099, + "grad_norm": 1.6434314106560817, + "learning_rate": 6.258375637372914e-06, + "loss": 0.1878, + "step": 7995 + }, + { + "epoch": 0.6334719746484453, + "grad_norm": 1.5689953445907228, + "learning_rate": 6.2559961172278545e-06, + "loss": 0.1968, + "step": 7996 + }, + { + "epoch": 0.6335511982570806, + "grad_norm": 1.731654609938004, + "learning_rate": 6.253616843607439e-06, + "loss": 0.2052, + "step": 7997 + }, + { + "epoch": 0.6336304218657159, + "grad_norm": 1.4905339583145942, + "learning_rate": 6.251237816668324e-06, + "loss": 0.2378, + "step": 7998 + }, + { + "epoch": 0.6337096454743514, + "grad_norm": 1.439622356072016, + "learning_rate": 6.248859036567162e-06, + "loss": 0.2048, + "step": 7999 + }, + { + "epoch": 0.6337888690829867, + "grad_norm": 1.468302441482144, + "learning_rate": 6.246480503460585e-06, + "loss": 0.2264, + "step": 8000 + }, + { + "epoch": 0.6338680926916221, + "grad_norm": 1.6697032661549593, + "learning_rate": 6.2441022175052034e-06, + "loss": 0.2144, + "step": 8001 + }, + { + "epoch": 0.6339473163002575, + "grad_norm": 1.5476388956752491, + "learning_rate": 6.241724178857621e-06, + "loss": 0.1465, + "step": 8002 + }, + { + "epoch": 0.6340265399088929, + "grad_norm": 1.5791840926184573, + "learning_rate": 6.2393463876744165e-06, + "loss": 0.1702, + "step": 8003 + }, + { + "epoch": 0.6341057635175282, + "grad_norm": 1.4380500215576228, + "learning_rate": 6.236968844112157e-06, + "loss": 0.1442, + "step": 8004 + }, + { + "epoch": 0.6341849871261636, + "grad_norm": 1.763594671092584, + "learning_rate": 6.234591548327393e-06, + "loss": 0.2856, + "step": 8005 + }, + { + "epoch": 0.634264210734799, + "grad_norm": 1.8216817820410947, + "learning_rate": 6.232214500476657e-06, + "loss": 0.1784, + "step": 8006 + }, + { + "epoch": 0.6343434343434343, + "grad_norm": 1.5048471111398678, + "learning_rate": 6.229837700716465e-06, + "loss": 0.2304, + "step": 8007 + }, + { + "epoch": 0.6344226579520698, + "grad_norm": 1.2514229945530448, + "learning_rate": 6.227461149203324e-06, + "loss": 0.1691, + "step": 8008 + }, + { + "epoch": 0.6345018815607051, + "grad_norm": 1.4342496892213084, + "learning_rate": 6.225084846093711e-06, + "loss": 0.1324, + "step": 8009 + }, + { + "epoch": 0.6345811051693404, + "grad_norm": 1.5452281036492033, + "learning_rate": 6.222708791544098e-06, + "loss": 0.1848, + "step": 8010 + }, + { + "epoch": 0.6346603287779758, + "grad_norm": 1.6125328562744934, + "learning_rate": 6.220332985710936e-06, + "loss": 0.2322, + "step": 8011 + }, + { + "epoch": 0.6347395523866112, + "grad_norm": 1.6867538648031914, + "learning_rate": 6.21795742875066e-06, + "loss": 0.2695, + "step": 8012 + }, + { + "epoch": 0.6348187759952466, + "grad_norm": 1.542098298416949, + "learning_rate": 6.21558212081969e-06, + "loss": 0.2412, + "step": 8013 + }, + { + "epoch": 0.6348979996038819, + "grad_norm": 1.5604046403215577, + "learning_rate": 6.213207062074427e-06, + "loss": 0.1929, + "step": 8014 + }, + { + "epoch": 0.6349772232125174, + "grad_norm": 1.2822064583776942, + "learning_rate": 6.210832252671257e-06, + "loss": 0.1631, + "step": 8015 + }, + { + "epoch": 0.6350564468211527, + "grad_norm": 1.3393994963057834, + "learning_rate": 6.208457692766554e-06, + "loss": 0.1456, + "step": 8016 + }, + { + "epoch": 0.635135670429788, + "grad_norm": 1.4663882948028086, + "learning_rate": 6.206083382516665e-06, + "loss": 0.168, + "step": 8017 + }, + { + "epoch": 0.6352148940384235, + "grad_norm": 1.4450798951628312, + "learning_rate": 6.203709322077933e-06, + "loss": 0.1876, + "step": 8018 + }, + { + "epoch": 0.6352941176470588, + "grad_norm": 1.2051542041998966, + "learning_rate": 6.201335511606673e-06, + "loss": 0.1521, + "step": 8019 + }, + { + "epoch": 0.6353733412556942, + "grad_norm": 2.32793696185281, + "learning_rate": 6.198961951259193e-06, + "loss": 0.2423, + "step": 8020 + }, + { + "epoch": 0.6354525648643295, + "grad_norm": 1.4219563770923758, + "learning_rate": 6.196588641191778e-06, + "loss": 0.191, + "step": 8021 + }, + { + "epoch": 0.635531788472965, + "grad_norm": 1.3256285093862534, + "learning_rate": 6.194215581560701e-06, + "loss": 0.1817, + "step": 8022 + }, + { + "epoch": 0.6356110120816003, + "grad_norm": 1.598663212139321, + "learning_rate": 6.191842772522214e-06, + "loss": 0.1942, + "step": 8023 + }, + { + "epoch": 0.6356902356902356, + "grad_norm": 1.7049793762495458, + "learning_rate": 6.18947021423256e-06, + "loss": 0.2757, + "step": 8024 + }, + { + "epoch": 0.6357694592988711, + "grad_norm": 1.2680387316636024, + "learning_rate": 6.187097906847954e-06, + "loss": 0.1202, + "step": 8025 + }, + { + "epoch": 0.6358486829075064, + "grad_norm": 1.5761126784924702, + "learning_rate": 6.184725850524608e-06, + "loss": 0.1928, + "step": 8026 + }, + { + "epoch": 0.6359279065161418, + "grad_norm": 1.7719856018142073, + "learning_rate": 6.182354045418704e-06, + "loss": 0.2083, + "step": 8027 + }, + { + "epoch": 0.6360071301247772, + "grad_norm": 1.610537671700797, + "learning_rate": 6.179982491686416e-06, + "loss": 0.1607, + "step": 8028 + }, + { + "epoch": 0.6360863537334126, + "grad_norm": 1.3328018725701702, + "learning_rate": 6.177611189483903e-06, + "loss": 0.1896, + "step": 8029 + }, + { + "epoch": 0.6361655773420479, + "grad_norm": 1.5406390887106478, + "learning_rate": 6.175240138967299e-06, + "loss": 0.1942, + "step": 8030 + }, + { + "epoch": 0.6362448009506833, + "grad_norm": 1.4811725336319033, + "learning_rate": 6.172869340292729e-06, + "loss": 0.1503, + "step": 8031 + }, + { + "epoch": 0.6363240245593187, + "grad_norm": 1.183938542177892, + "learning_rate": 6.170498793616298e-06, + "loss": 0.1209, + "step": 8032 + }, + { + "epoch": 0.636403248167954, + "grad_norm": 1.3349842459706298, + "learning_rate": 6.168128499094095e-06, + "loss": 0.134, + "step": 8033 + }, + { + "epoch": 0.6364824717765895, + "grad_norm": 1.4687541232142616, + "learning_rate": 6.165758456882193e-06, + "loss": 0.2101, + "step": 8034 + }, + { + "epoch": 0.6365616953852248, + "grad_norm": 1.5803293593594072, + "learning_rate": 6.163388667136646e-06, + "loss": 0.2098, + "step": 8035 + }, + { + "epoch": 0.6366409189938602, + "grad_norm": 1.4815992085433343, + "learning_rate": 6.161019130013495e-06, + "loss": 0.1764, + "step": 8036 + }, + { + "epoch": 0.6367201426024955, + "grad_norm": 1.836571651703349, + "learning_rate": 6.158649845668764e-06, + "loss": 0.2195, + "step": 8037 + }, + { + "epoch": 0.6367993662111309, + "grad_norm": 1.0736019083955324, + "learning_rate": 6.156280814258455e-06, + "loss": 0.1194, + "step": 8038 + }, + { + "epoch": 0.6368785898197663, + "grad_norm": 1.3978079984978895, + "learning_rate": 6.153912035938559e-06, + "loss": 0.15, + "step": 8039 + }, + { + "epoch": 0.6369578134284016, + "grad_norm": 1.466119670831299, + "learning_rate": 6.151543510865053e-06, + "loss": 0.1445, + "step": 8040 + }, + { + "epoch": 0.6370370370370371, + "grad_norm": 1.4546352972210452, + "learning_rate": 6.149175239193887e-06, + "loss": 0.226, + "step": 8041 + }, + { + "epoch": 0.6371162606456724, + "grad_norm": 1.007920686287903, + "learning_rate": 6.1468072210810035e-06, + "loss": 0.0947, + "step": 8042 + }, + { + "epoch": 0.6371954842543078, + "grad_norm": 1.8470985737799304, + "learning_rate": 6.144439456682323e-06, + "loss": 0.2584, + "step": 8043 + }, + { + "epoch": 0.6372747078629432, + "grad_norm": 1.6944984526212388, + "learning_rate": 6.142071946153751e-06, + "loss": 0.2167, + "step": 8044 + }, + { + "epoch": 0.6373539314715785, + "grad_norm": 1.4051801216471371, + "learning_rate": 6.139704689651181e-06, + "loss": 0.1365, + "step": 8045 + }, + { + "epoch": 0.6374331550802139, + "grad_norm": 1.7739276664167622, + "learning_rate": 6.1373376873304814e-06, + "loss": 0.1506, + "step": 8046 + }, + { + "epoch": 0.6375123786888492, + "grad_norm": 1.2714198648717325, + "learning_rate": 6.134970939347511e-06, + "loss": 0.0867, + "step": 8047 + }, + { + "epoch": 0.6375916022974847, + "grad_norm": 1.0336402065408437, + "learning_rate": 6.132604445858104e-06, + "loss": 0.0899, + "step": 8048 + }, + { + "epoch": 0.63767082590612, + "grad_norm": 1.3218458711585934, + "learning_rate": 6.130238207018085e-06, + "loss": 0.1308, + "step": 8049 + }, + { + "epoch": 0.6377500495147554, + "grad_norm": 1.4970597557743586, + "learning_rate": 6.127872222983264e-06, + "loss": 0.2102, + "step": 8050 + }, + { + "epoch": 0.6378292731233908, + "grad_norm": 1.2775757732140645, + "learning_rate": 6.125506493909422e-06, + "loss": 0.143, + "step": 8051 + }, + { + "epoch": 0.6379084967320261, + "grad_norm": 1.7512105988245497, + "learning_rate": 6.123141019952334e-06, + "loss": 0.2026, + "step": 8052 + }, + { + "epoch": 0.6379877203406615, + "grad_norm": 1.5907695307831884, + "learning_rate": 6.1207758012677595e-06, + "loss": 0.1636, + "step": 8053 + }, + { + "epoch": 0.6380669439492969, + "grad_norm": 1.328670161964659, + "learning_rate": 6.11841083801143e-06, + "loss": 0.1514, + "step": 8054 + }, + { + "epoch": 0.6381461675579323, + "grad_norm": 1.8310762988425435, + "learning_rate": 6.116046130339073e-06, + "loss": 0.2295, + "step": 8055 + }, + { + "epoch": 0.6382253911665676, + "grad_norm": 1.9124203676510925, + "learning_rate": 6.1136816784063855e-06, + "loss": 0.2446, + "step": 8056 + }, + { + "epoch": 0.6383046147752031, + "grad_norm": 1.668881435707919, + "learning_rate": 6.1113174823690615e-06, + "loss": 0.1616, + "step": 8057 + }, + { + "epoch": 0.6383838383838384, + "grad_norm": 1.4796825705987922, + "learning_rate": 6.108953542382771e-06, + "loss": 0.1854, + "step": 8058 + }, + { + "epoch": 0.6384630619924737, + "grad_norm": 1.2890164689913686, + "learning_rate": 6.106589858603167e-06, + "loss": 0.1013, + "step": 8059 + }, + { + "epoch": 0.6385422856011091, + "grad_norm": 1.5697355718352175, + "learning_rate": 6.1042264311858845e-06, + "loss": 0.2417, + "step": 8060 + }, + { + "epoch": 0.6386215092097445, + "grad_norm": 2.177511507154298, + "learning_rate": 6.101863260286551e-06, + "loss": 0.2829, + "step": 8061 + }, + { + "epoch": 0.6387007328183799, + "grad_norm": 1.1668171544310744, + "learning_rate": 6.099500346060765e-06, + "loss": 0.0986, + "step": 8062 + }, + { + "epoch": 0.6387799564270152, + "grad_norm": 1.4728280492770907, + "learning_rate": 6.09713768866411e-06, + "loss": 0.168, + "step": 8063 + }, + { + "epoch": 0.6388591800356507, + "grad_norm": 1.3612029718774463, + "learning_rate": 6.094775288252157e-06, + "loss": 0.1492, + "step": 8064 + }, + { + "epoch": 0.638938403644286, + "grad_norm": 1.8434027969930993, + "learning_rate": 6.092413144980465e-06, + "loss": 0.2012, + "step": 8065 + }, + { + "epoch": 0.6390176272529213, + "grad_norm": 1.9581884779647414, + "learning_rate": 6.090051259004563e-06, + "loss": 0.1881, + "step": 8066 + }, + { + "epoch": 0.6390968508615568, + "grad_norm": 1.4612143064453014, + "learning_rate": 6.087689630479974e-06, + "loss": 0.2167, + "step": 8067 + }, + { + "epoch": 0.6391760744701921, + "grad_norm": 2.0995615313927676, + "learning_rate": 6.085328259562195e-06, + "loss": 0.2361, + "step": 8068 + }, + { + "epoch": 0.6392552980788275, + "grad_norm": 1.2713930323094393, + "learning_rate": 6.082967146406714e-06, + "loss": 0.1411, + "step": 8069 + }, + { + "epoch": 0.6393345216874629, + "grad_norm": 1.6035218399675333, + "learning_rate": 6.0806062911690025e-06, + "loss": 0.2151, + "step": 8070 + }, + { + "epoch": 0.6394137452960983, + "grad_norm": 1.5684345086926923, + "learning_rate": 6.078245694004503e-06, + "loss": 0.1732, + "step": 8071 + }, + { + "epoch": 0.6394929689047336, + "grad_norm": 1.5788360301344264, + "learning_rate": 6.075885355068658e-06, + "loss": 0.2223, + "step": 8072 + }, + { + "epoch": 0.6395721925133689, + "grad_norm": 1.896384841125591, + "learning_rate": 6.073525274516879e-06, + "loss": 0.2049, + "step": 8073 + }, + { + "epoch": 0.6396514161220044, + "grad_norm": 1.3144364732799731, + "learning_rate": 6.071165452504568e-06, + "loss": 0.1972, + "step": 8074 + }, + { + "epoch": 0.6397306397306397, + "grad_norm": 1.4977411743415656, + "learning_rate": 6.068805889187109e-06, + "loss": 0.1864, + "step": 8075 + }, + { + "epoch": 0.6398098633392751, + "grad_norm": 2.09234114149141, + "learning_rate": 6.066446584719864e-06, + "loss": 0.1632, + "step": 8076 + }, + { + "epoch": 0.6398890869479105, + "grad_norm": 1.4428675353555533, + "learning_rate": 6.064087539258186e-06, + "loss": 0.2058, + "step": 8077 + }, + { + "epoch": 0.6399683105565459, + "grad_norm": 1.7317110855492674, + "learning_rate": 6.061728752957406e-06, + "loss": 0.1766, + "step": 8078 + }, + { + "epoch": 0.6400475341651812, + "grad_norm": 1.715467553779848, + "learning_rate": 6.059370225972834e-06, + "loss": 0.2017, + "step": 8079 + }, + { + "epoch": 0.6401267577738166, + "grad_norm": 1.3981638790005932, + "learning_rate": 6.057011958459776e-06, + "loss": 0.1643, + "step": 8080 + }, + { + "epoch": 0.640205981382452, + "grad_norm": 1.9805349399470895, + "learning_rate": 6.0546539505735055e-06, + "loss": 0.1824, + "step": 8081 + }, + { + "epoch": 0.6402852049910873, + "grad_norm": 2.07334063393122, + "learning_rate": 6.052296202469288e-06, + "loss": 0.3186, + "step": 8082 + }, + { + "epoch": 0.6403644285997228, + "grad_norm": 1.8820150157041597, + "learning_rate": 6.049938714302372e-06, + "loss": 0.3467, + "step": 8083 + }, + { + "epoch": 0.6404436522083581, + "grad_norm": 1.494153780691814, + "learning_rate": 6.047581486227984e-06, + "loss": 0.1572, + "step": 8084 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 1.1695806773679274, + "learning_rate": 6.045224518401338e-06, + "loss": 0.1727, + "step": 8085 + }, + { + "epoch": 0.6406020994256288, + "grad_norm": 3.175382121935935, + "learning_rate": 6.04286781097763e-06, + "loss": 0.3162, + "step": 8086 + }, + { + "epoch": 0.6406813230342642, + "grad_norm": 1.5960691033404628, + "learning_rate": 6.040511364112034e-06, + "loss": 0.2031, + "step": 8087 + }, + { + "epoch": 0.6407605466428996, + "grad_norm": 1.8020629298657165, + "learning_rate": 6.038155177959715e-06, + "loss": 0.1975, + "step": 8088 + }, + { + "epoch": 0.6408397702515349, + "grad_norm": 1.3449893604145793, + "learning_rate": 6.035799252675811e-06, + "loss": 0.1279, + "step": 8089 + }, + { + "epoch": 0.6409189938601704, + "grad_norm": 1.388994974244062, + "learning_rate": 6.0334435884154526e-06, + "loss": 0.142, + "step": 8090 + }, + { + "epoch": 0.6409982174688057, + "grad_norm": 1.9092794094730143, + "learning_rate": 6.031088185333751e-06, + "loss": 0.2786, + "step": 8091 + }, + { + "epoch": 0.641077441077441, + "grad_norm": 1.6587613465913738, + "learning_rate": 6.028733043585793e-06, + "loss": 0.178, + "step": 8092 + }, + { + "epoch": 0.6411566646860765, + "grad_norm": 1.6876684821454455, + "learning_rate": 6.026378163326654e-06, + "loss": 0.2329, + "step": 8093 + }, + { + "epoch": 0.6412358882947118, + "grad_norm": 1.3562833297180532, + "learning_rate": 6.024023544711396e-06, + "loss": 0.1408, + "step": 8094 + }, + { + "epoch": 0.6413151119033472, + "grad_norm": 1.563157825770762, + "learning_rate": 6.021669187895054e-06, + "loss": 0.1734, + "step": 8095 + }, + { + "epoch": 0.6413943355119825, + "grad_norm": 1.7509772789149751, + "learning_rate": 6.019315093032656e-06, + "loss": 0.2235, + "step": 8096 + }, + { + "epoch": 0.641473559120618, + "grad_norm": 1.8134298364919954, + "learning_rate": 6.016961260279204e-06, + "loss": 0.2151, + "step": 8097 + }, + { + "epoch": 0.6415527827292533, + "grad_norm": 1.2975379844934873, + "learning_rate": 6.0146076897896865e-06, + "loss": 0.1656, + "step": 8098 + }, + { + "epoch": 0.6416320063378886, + "grad_norm": 2.1523475691966545, + "learning_rate": 6.012254381719078e-06, + "loss": 0.2399, + "step": 8099 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 1.3792555952901389, + "learning_rate": 6.0099013362223305e-06, + "loss": 0.1523, + "step": 8100 + }, + { + "epoch": 0.6417904535551594, + "grad_norm": 1.8247424909559966, + "learning_rate": 6.007548553454379e-06, + "loss": 0.2029, + "step": 8101 + }, + { + "epoch": 0.6418696771637948, + "grad_norm": 1.7282070409500825, + "learning_rate": 6.005196033570147e-06, + "loss": 0.1895, + "step": 8102 + }, + { + "epoch": 0.6419489007724302, + "grad_norm": 1.703459325217148, + "learning_rate": 6.002843776724534e-06, + "loss": 0.2743, + "step": 8103 + }, + { + "epoch": 0.6420281243810656, + "grad_norm": 1.4141519713853017, + "learning_rate": 6.000491783072426e-06, + "loss": 0.1719, + "step": 8104 + }, + { + "epoch": 0.6421073479897009, + "grad_norm": 1.5040055911682912, + "learning_rate": 5.998140052768687e-06, + "loss": 0.1692, + "step": 8105 + }, + { + "epoch": 0.6421865715983363, + "grad_norm": 1.396176221166053, + "learning_rate": 5.995788585968171e-06, + "loss": 0.1717, + "step": 8106 + }, + { + "epoch": 0.6422657952069717, + "grad_norm": 1.3728313497012354, + "learning_rate": 5.993437382825711e-06, + "loss": 0.1443, + "step": 8107 + }, + { + "epoch": 0.642345018815607, + "grad_norm": 1.6668970586745686, + "learning_rate": 5.991086443496119e-06, + "loss": 0.1624, + "step": 8108 + }, + { + "epoch": 0.6424242424242425, + "grad_norm": 1.6886237236606727, + "learning_rate": 5.9887357681341955e-06, + "loss": 0.1811, + "step": 8109 + }, + { + "epoch": 0.6425034660328778, + "grad_norm": 1.8403077008745683, + "learning_rate": 5.9863853568947215e-06, + "loss": 0.2747, + "step": 8110 + }, + { + "epoch": 0.6425826896415132, + "grad_norm": 1.3107299689935732, + "learning_rate": 5.9840352099324595e-06, + "loss": 0.161, + "step": 8111 + }, + { + "epoch": 0.6426619132501485, + "grad_norm": 1.3216407299508082, + "learning_rate": 5.981685327402156e-06, + "loss": 0.1133, + "step": 8112 + }, + { + "epoch": 0.6427411368587839, + "grad_norm": 1.4087810415080189, + "learning_rate": 5.9793357094585365e-06, + "loss": 0.1648, + "step": 8113 + }, + { + "epoch": 0.6428203604674193, + "grad_norm": 1.4012284831722943, + "learning_rate": 5.976986356256316e-06, + "loss": 0.1856, + "step": 8114 + }, + { + "epoch": 0.6428995840760546, + "grad_norm": 1.3091420768442874, + "learning_rate": 5.974637267950187e-06, + "loss": 0.1695, + "step": 8115 + }, + { + "epoch": 0.6429788076846901, + "grad_norm": 1.332204940460913, + "learning_rate": 5.972288444694822e-06, + "loss": 0.1816, + "step": 8116 + }, + { + "epoch": 0.6430580312933254, + "grad_norm": 1.719432836983353, + "learning_rate": 5.9699398866448846e-06, + "loss": 0.2237, + "step": 8117 + }, + { + "epoch": 0.6431372549019608, + "grad_norm": 1.6451888621811877, + "learning_rate": 5.967591593955016e-06, + "loss": 0.2473, + "step": 8118 + }, + { + "epoch": 0.6432164785105962, + "grad_norm": 1.2462144379853615, + "learning_rate": 5.965243566779837e-06, + "loss": 0.1873, + "step": 8119 + }, + { + "epoch": 0.6432957021192315, + "grad_norm": 1.6682313182268105, + "learning_rate": 5.962895805273956e-06, + "loss": 0.2454, + "step": 8120 + }, + { + "epoch": 0.6433749257278669, + "grad_norm": 1.522359786660533, + "learning_rate": 5.960548309591958e-06, + "loss": 0.1661, + "step": 8121 + }, + { + "epoch": 0.6434541493365022, + "grad_norm": 1.8856366859578277, + "learning_rate": 5.958201079888419e-06, + "loss": 0.2052, + "step": 8122 + }, + { + "epoch": 0.6435333729451377, + "grad_norm": 1.6425198675212893, + "learning_rate": 5.9558541163178915e-06, + "loss": 0.1978, + "step": 8123 + }, + { + "epoch": 0.643612596553773, + "grad_norm": 1.8107131526831806, + "learning_rate": 5.953507419034911e-06, + "loss": 0.2065, + "step": 8124 + }, + { + "epoch": 0.6436918201624084, + "grad_norm": 1.7537881617961941, + "learning_rate": 5.951160988193998e-06, + "loss": 0.296, + "step": 8125 + }, + { + "epoch": 0.6437710437710438, + "grad_norm": 1.4564610174228458, + "learning_rate": 5.948814823949649e-06, + "loss": 0.1765, + "step": 8126 + }, + { + "epoch": 0.6438502673796791, + "grad_norm": 1.5708870506658843, + "learning_rate": 5.946468926456352e-06, + "loss": 0.1873, + "step": 8127 + }, + { + "epoch": 0.6439294909883145, + "grad_norm": 0.9833438986469489, + "learning_rate": 5.944123295868574e-06, + "loss": 0.0986, + "step": 8128 + }, + { + "epoch": 0.6440087145969499, + "grad_norm": 1.55627241612504, + "learning_rate": 5.9417779323407576e-06, + "loss": 0.1905, + "step": 8129 + }, + { + "epoch": 0.6440879382055853, + "grad_norm": 1.65050525282498, + "learning_rate": 5.939432836027339e-06, + "loss": 0.2036, + "step": 8130 + }, + { + "epoch": 0.6441671618142206, + "grad_norm": 1.527701089722438, + "learning_rate": 5.937088007082731e-06, + "loss": 0.1863, + "step": 8131 + }, + { + "epoch": 0.6442463854228561, + "grad_norm": 1.619678841610621, + "learning_rate": 5.934743445661326e-06, + "loss": 0.1869, + "step": 8132 + }, + { + "epoch": 0.6443256090314914, + "grad_norm": 1.5934641414297468, + "learning_rate": 5.932399151917507e-06, + "loss": 0.2128, + "step": 8133 + }, + { + "epoch": 0.6444048326401267, + "grad_norm": 1.4272613283253828, + "learning_rate": 5.93005512600563e-06, + "loss": 0.2056, + "step": 8134 + }, + { + "epoch": 0.6444840562487621, + "grad_norm": 1.4895738772604328, + "learning_rate": 5.92771136808004e-06, + "loss": 0.1895, + "step": 8135 + }, + { + "epoch": 0.6445632798573975, + "grad_norm": 2.090757004547409, + "learning_rate": 5.925367878295063e-06, + "loss": 0.1864, + "step": 8136 + }, + { + "epoch": 0.6446425034660329, + "grad_norm": 1.5269085503875042, + "learning_rate": 5.9230246568050035e-06, + "loss": 0.1647, + "step": 8137 + }, + { + "epoch": 0.6447217270746682, + "grad_norm": 1.2712634497428263, + "learning_rate": 5.920681703764153e-06, + "loss": 0.1466, + "step": 8138 + }, + { + "epoch": 0.6448009506833037, + "grad_norm": 1.7711467342966756, + "learning_rate": 5.918339019326789e-06, + "loss": 0.2039, + "step": 8139 + }, + { + "epoch": 0.644880174291939, + "grad_norm": 1.6106787105529103, + "learning_rate": 5.915996603647157e-06, + "loss": 0.2296, + "step": 8140 + }, + { + "epoch": 0.6449593979005743, + "grad_norm": 1.9063362943262643, + "learning_rate": 5.913654456879496e-06, + "loss": 0.2585, + "step": 8141 + }, + { + "epoch": 0.6450386215092098, + "grad_norm": 1.7339440527619234, + "learning_rate": 5.911312579178028e-06, + "loss": 0.2452, + "step": 8142 + }, + { + "epoch": 0.6451178451178451, + "grad_norm": 1.5062218026206124, + "learning_rate": 5.908970970696955e-06, + "loss": 0.2279, + "step": 8143 + }, + { + "epoch": 0.6451970687264805, + "grad_norm": 1.3909630211376605, + "learning_rate": 5.906629631590457e-06, + "loss": 0.1188, + "step": 8144 + }, + { + "epoch": 0.6452762923351159, + "grad_norm": 1.3228113989492796, + "learning_rate": 5.904288562012703e-06, + "loss": 0.1277, + "step": 8145 + }, + { + "epoch": 0.6453555159437513, + "grad_norm": 1.3890436089938607, + "learning_rate": 5.901947762117838e-06, + "loss": 0.1785, + "step": 8146 + }, + { + "epoch": 0.6454347395523866, + "grad_norm": 1.4704428395392795, + "learning_rate": 5.899607232059994e-06, + "loss": 0.174, + "step": 8147 + }, + { + "epoch": 0.6455139631610219, + "grad_norm": 1.3975944557489626, + "learning_rate": 5.897266971993286e-06, + "loss": 0.2414, + "step": 8148 + }, + { + "epoch": 0.6455931867696574, + "grad_norm": 1.5378497945727905, + "learning_rate": 5.894926982071805e-06, + "loss": 0.1695, + "step": 8149 + }, + { + "epoch": 0.6456724103782927, + "grad_norm": 1.7136073115435613, + "learning_rate": 5.892587262449631e-06, + "loss": 0.2202, + "step": 8150 + }, + { + "epoch": 0.6457516339869281, + "grad_norm": 1.9443667063843741, + "learning_rate": 5.890247813280822e-06, + "loss": 0.2088, + "step": 8151 + }, + { + "epoch": 0.6458308575955635, + "grad_norm": 1.9345781076500688, + "learning_rate": 5.8879086347194196e-06, + "loss": 0.193, + "step": 8152 + }, + { + "epoch": 0.6459100812041989, + "grad_norm": 1.5134019496722813, + "learning_rate": 5.885569726919449e-06, + "loss": 0.1522, + "step": 8153 + }, + { + "epoch": 0.6459893048128342, + "grad_norm": 1.3538760965517709, + "learning_rate": 5.883231090034911e-06, + "loss": 0.1808, + "step": 8154 + }, + { + "epoch": 0.6460685284214696, + "grad_norm": 1.685558222770563, + "learning_rate": 5.8808927242197984e-06, + "loss": 0.2095, + "step": 8155 + }, + { + "epoch": 0.646147752030105, + "grad_norm": 1.4062584319448033, + "learning_rate": 5.878554629628081e-06, + "loss": 0.1904, + "step": 8156 + }, + { + "epoch": 0.6462269756387403, + "grad_norm": 1.4476301404899559, + "learning_rate": 5.87621680641371e-06, + "loss": 0.1975, + "step": 8157 + }, + { + "epoch": 0.6463061992473758, + "grad_norm": 1.1391212925893133, + "learning_rate": 5.873879254730621e-06, + "loss": 0.1301, + "step": 8158 + }, + { + "epoch": 0.6463854228560111, + "grad_norm": 1.7083001896441035, + "learning_rate": 5.871541974732727e-06, + "loss": 0.1384, + "step": 8159 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 1.7312239132605354, + "learning_rate": 5.869204966573929e-06, + "loss": 0.1969, + "step": 8160 + }, + { + "epoch": 0.6465438700732818, + "grad_norm": 1.4122443107824323, + "learning_rate": 5.866868230408111e-06, + "loss": 0.192, + "step": 8161 + }, + { + "epoch": 0.6466230936819172, + "grad_norm": 1.7282599416588857, + "learning_rate": 5.86453176638913e-06, + "loss": 0.1697, + "step": 8162 + }, + { + "epoch": 0.6467023172905526, + "grad_norm": 1.528172939160641, + "learning_rate": 5.862195574670834e-06, + "loss": 0.1729, + "step": 8163 + }, + { + "epoch": 0.6467815408991879, + "grad_norm": 1.8040622991011739, + "learning_rate": 5.85985965540705e-06, + "loss": 0.2788, + "step": 8164 + }, + { + "epoch": 0.6468607645078234, + "grad_norm": 1.5492634542794503, + "learning_rate": 5.857524008751586e-06, + "loss": 0.131, + "step": 8165 + }, + { + "epoch": 0.6469399881164587, + "grad_norm": 1.647262123542445, + "learning_rate": 5.855188634858235e-06, + "loss": 0.2611, + "step": 8166 + }, + { + "epoch": 0.647019211725094, + "grad_norm": 1.7564404844206631, + "learning_rate": 5.852853533880768e-06, + "loss": 0.1636, + "step": 8167 + }, + { + "epoch": 0.6470984353337295, + "grad_norm": 1.6331654529510657, + "learning_rate": 5.850518705972941e-06, + "loss": 0.2051, + "step": 8168 + }, + { + "epoch": 0.6471776589423648, + "grad_norm": 1.539374640159121, + "learning_rate": 5.848184151288492e-06, + "loss": 0.209, + "step": 8169 + }, + { + "epoch": 0.6472568825510002, + "grad_norm": 1.5721915772426165, + "learning_rate": 5.845849869981137e-06, + "loss": 0.1562, + "step": 8170 + }, + { + "epoch": 0.6473361061596355, + "grad_norm": 1.9839548267302927, + "learning_rate": 5.843515862204581e-06, + "loss": 0.2349, + "step": 8171 + }, + { + "epoch": 0.647415329768271, + "grad_norm": 1.4924276726996406, + "learning_rate": 5.841182128112506e-06, + "loss": 0.1834, + "step": 8172 + }, + { + "epoch": 0.6474945533769063, + "grad_norm": 1.4482647137077493, + "learning_rate": 5.838848667858577e-06, + "loss": 0.1912, + "step": 8173 + }, + { + "epoch": 0.6475737769855416, + "grad_norm": 1.2887753077490391, + "learning_rate": 5.83651548159644e-06, + "loss": 0.1492, + "step": 8174 + }, + { + "epoch": 0.6476530005941771, + "grad_norm": 1.8080667023935875, + "learning_rate": 5.834182569479727e-06, + "loss": 0.1839, + "step": 8175 + }, + { + "epoch": 0.6477322242028124, + "grad_norm": 1.0766910119065871, + "learning_rate": 5.831849931662047e-06, + "loss": 0.0994, + "step": 8176 + }, + { + "epoch": 0.6478114478114478, + "grad_norm": 1.359796154319617, + "learning_rate": 5.829517568296989e-06, + "loss": 0.1411, + "step": 8177 + }, + { + "epoch": 0.6478906714200832, + "grad_norm": 1.895470911194082, + "learning_rate": 5.827185479538138e-06, + "loss": 0.2375, + "step": 8178 + }, + { + "epoch": 0.6479698950287186, + "grad_norm": 1.950384073594469, + "learning_rate": 5.824853665539043e-06, + "loss": 0.1913, + "step": 8179 + }, + { + "epoch": 0.6480491186373539, + "grad_norm": 1.5935832675858823, + "learning_rate": 5.82252212645324e-06, + "loss": 0.1389, + "step": 8180 + }, + { + "epoch": 0.6481283422459893, + "grad_norm": 1.48907075446388, + "learning_rate": 5.820190862434259e-06, + "loss": 0.2118, + "step": 8181 + }, + { + "epoch": 0.6482075658546247, + "grad_norm": 1.6304056787421364, + "learning_rate": 5.8178598736355985e-06, + "loss": 0.1525, + "step": 8182 + }, + { + "epoch": 0.64828678946326, + "grad_norm": 1.7683509037124783, + "learning_rate": 5.815529160210738e-06, + "loss": 0.1849, + "step": 8183 + }, + { + "epoch": 0.6483660130718955, + "grad_norm": 1.597123957088113, + "learning_rate": 5.813198722313151e-06, + "loss": 0.2211, + "step": 8184 + }, + { + "epoch": 0.6484452366805308, + "grad_norm": 1.536023959144539, + "learning_rate": 5.810868560096283e-06, + "loss": 0.1865, + "step": 8185 + }, + { + "epoch": 0.6485244602891662, + "grad_norm": 1.7736708834620623, + "learning_rate": 5.808538673713564e-06, + "loss": 0.1155, + "step": 8186 + }, + { + "epoch": 0.6486036838978015, + "grad_norm": 1.8009006907956115, + "learning_rate": 5.8062090633184e-06, + "loss": 0.1892, + "step": 8187 + }, + { + "epoch": 0.6486829075064369, + "grad_norm": 1.6267354181361393, + "learning_rate": 5.803879729064195e-06, + "loss": 0.1243, + "step": 8188 + }, + { + "epoch": 0.6487621311150723, + "grad_norm": 1.92601166141222, + "learning_rate": 5.801550671104319e-06, + "loss": 0.2215, + "step": 8189 + }, + { + "epoch": 0.6488413547237076, + "grad_norm": 0.9616011594442138, + "learning_rate": 5.7992218895921256e-06, + "loss": 0.0962, + "step": 8190 + }, + { + "epoch": 0.6489205783323431, + "grad_norm": 1.2962036900258442, + "learning_rate": 5.796893384680964e-06, + "loss": 0.1686, + "step": 8191 + }, + { + "epoch": 0.6489998019409784, + "grad_norm": 1.3376939103189183, + "learning_rate": 5.7945651565241455e-06, + "loss": 0.1626, + "step": 8192 + }, + { + "epoch": 0.6490790255496138, + "grad_norm": 1.7175677058121748, + "learning_rate": 5.792237205274974e-06, + "loss": 0.1887, + "step": 8193 + }, + { + "epoch": 0.6491582491582492, + "grad_norm": 1.7276408709061848, + "learning_rate": 5.789909531086741e-06, + "loss": 0.262, + "step": 8194 + }, + { + "epoch": 0.6492374727668845, + "grad_norm": 1.5565591296273464, + "learning_rate": 5.787582134112706e-06, + "loss": 0.1948, + "step": 8195 + }, + { + "epoch": 0.6493166963755199, + "grad_norm": 1.4046768935070544, + "learning_rate": 5.785255014506115e-06, + "loss": 0.1559, + "step": 8196 + }, + { + "epoch": 0.6493959199841552, + "grad_norm": 1.884874702634769, + "learning_rate": 5.782928172420206e-06, + "loss": 0.1955, + "step": 8197 + }, + { + "epoch": 0.6494751435927907, + "grad_norm": 1.7392460271953147, + "learning_rate": 5.780601608008185e-06, + "loss": 0.2122, + "step": 8198 + }, + { + "epoch": 0.649554367201426, + "grad_norm": 1.547346264372615, + "learning_rate": 5.778275321423241e-06, + "loss": 0.2563, + "step": 8199 + }, + { + "epoch": 0.6496335908100614, + "grad_norm": 1.71903547514637, + "learning_rate": 5.7759493128185584e-06, + "loss": 0.2214, + "step": 8200 + }, + { + "epoch": 0.6497128144186968, + "grad_norm": 2.024562339691979, + "learning_rate": 5.773623582347289e-06, + "loss": 0.2489, + "step": 8201 + }, + { + "epoch": 0.6497920380273321, + "grad_norm": 1.7269643699186403, + "learning_rate": 5.77129813016257e-06, + "loss": 0.2683, + "step": 8202 + }, + { + "epoch": 0.6498712616359675, + "grad_norm": 1.4449158929677628, + "learning_rate": 5.768972956417518e-06, + "loss": 0.2212, + "step": 8203 + }, + { + "epoch": 0.6499504852446029, + "grad_norm": 1.4062302772332789, + "learning_rate": 5.766648061265242e-06, + "loss": 0.1284, + "step": 8204 + }, + { + "epoch": 0.6500297088532383, + "grad_norm": 1.5568032544909487, + "learning_rate": 5.764323444858823e-06, + "loss": 0.1682, + "step": 8205 + }, + { + "epoch": 0.6501089324618736, + "grad_norm": 2.0939044448197035, + "learning_rate": 5.761999107351319e-06, + "loss": 0.2166, + "step": 8206 + }, + { + "epoch": 0.6501881560705091, + "grad_norm": 1.535057755453949, + "learning_rate": 5.759675048895785e-06, + "loss": 0.2084, + "step": 8207 + }, + { + "epoch": 0.6502673796791444, + "grad_norm": 2.0202311168534464, + "learning_rate": 5.757351269645248e-06, + "loss": 0.3489, + "step": 8208 + }, + { + "epoch": 0.6503466032877797, + "grad_norm": 1.3078476275634898, + "learning_rate": 5.75502776975271e-06, + "loss": 0.1575, + "step": 8209 + }, + { + "epoch": 0.6504258268964151, + "grad_norm": 1.5249387928311076, + "learning_rate": 5.752704549371173e-06, + "loss": 0.1745, + "step": 8210 + }, + { + "epoch": 0.6505050505050505, + "grad_norm": 1.6182043081195674, + "learning_rate": 5.750381608653605e-06, + "loss": 0.2483, + "step": 8211 + }, + { + "epoch": 0.6505842741136859, + "grad_norm": 1.3265089391079636, + "learning_rate": 5.748058947752955e-06, + "loss": 0.1242, + "step": 8212 + }, + { + "epoch": 0.6506634977223212, + "grad_norm": 1.1294755049317864, + "learning_rate": 5.745736566822169e-06, + "loss": 0.1405, + "step": 8213 + }, + { + "epoch": 0.6507427213309567, + "grad_norm": 1.5363640615073666, + "learning_rate": 5.743414466014159e-06, + "loss": 0.2634, + "step": 8214 + }, + { + "epoch": 0.650821944939592, + "grad_norm": 1.7286602224365233, + "learning_rate": 5.7410926454818265e-06, + "loss": 0.2692, + "step": 8215 + }, + { + "epoch": 0.6509011685482273, + "grad_norm": 1.4254908976240517, + "learning_rate": 5.738771105378046e-06, + "loss": 0.1405, + "step": 8216 + }, + { + "epoch": 0.6509803921568628, + "grad_norm": 1.7378379782088345, + "learning_rate": 5.7364498458556914e-06, + "loss": 0.2609, + "step": 8217 + }, + { + "epoch": 0.6510596157654981, + "grad_norm": 1.7205630807578267, + "learning_rate": 5.734128867067593e-06, + "loss": 0.2305, + "step": 8218 + }, + { + "epoch": 0.6511388393741335, + "grad_norm": 1.4647885330041155, + "learning_rate": 5.731808169166586e-06, + "loss": 0.1838, + "step": 8219 + }, + { + "epoch": 0.6512180629827689, + "grad_norm": 1.6481536694466383, + "learning_rate": 5.7294877523054735e-06, + "loss": 0.16, + "step": 8220 + }, + { + "epoch": 0.6512972865914043, + "grad_norm": 1.589465223200507, + "learning_rate": 5.727167616637042e-06, + "loss": 0.2151, + "step": 8221 + }, + { + "epoch": 0.6513765102000396, + "grad_norm": 1.8164892706488056, + "learning_rate": 5.7248477623140655e-06, + "loss": 0.2106, + "step": 8222 + }, + { + "epoch": 0.6514557338086749, + "grad_norm": 1.326792198538502, + "learning_rate": 5.722528189489294e-06, + "loss": 0.144, + "step": 8223 + }, + { + "epoch": 0.6515349574173104, + "grad_norm": 1.505013969938144, + "learning_rate": 5.720208898315454e-06, + "loss": 0.2383, + "step": 8224 + }, + { + "epoch": 0.6516141810259457, + "grad_norm": 2.0294958874903655, + "learning_rate": 5.717889888945271e-06, + "loss": 0.2747, + "step": 8225 + }, + { + "epoch": 0.6516934046345811, + "grad_norm": 1.2296416167317286, + "learning_rate": 5.715571161531433e-06, + "loss": 0.1342, + "step": 8226 + }, + { + "epoch": 0.6517726282432165, + "grad_norm": 1.6248808580840808, + "learning_rate": 5.7132527162266194e-06, + "loss": 0.2235, + "step": 8227 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 1.3883865823436097, + "learning_rate": 5.710934553183484e-06, + "loss": 0.2057, + "step": 8228 + }, + { + "epoch": 0.6519310754604872, + "grad_norm": 1.5659566720195393, + "learning_rate": 5.708616672554675e-06, + "loss": 0.2167, + "step": 8229 + }, + { + "epoch": 0.6520102990691226, + "grad_norm": 1.5946119236074876, + "learning_rate": 5.7062990744928086e-06, + "loss": 0.2286, + "step": 8230 + }, + { + "epoch": 0.652089522677758, + "grad_norm": 1.6035955072047159, + "learning_rate": 5.703981759150483e-06, + "loss": 0.1704, + "step": 8231 + }, + { + "epoch": 0.6521687462863933, + "grad_norm": 1.2107460366565461, + "learning_rate": 5.701664726680294e-06, + "loss": 0.1186, + "step": 8232 + }, + { + "epoch": 0.6522479698950288, + "grad_norm": 1.5346477418657334, + "learning_rate": 5.699347977234799e-06, + "loss": 0.1731, + "step": 8233 + }, + { + "epoch": 0.6523271935036641, + "grad_norm": 1.2114196624198978, + "learning_rate": 5.697031510966542e-06, + "loss": 0.1235, + "step": 8234 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 1.5176144109473357, + "learning_rate": 5.69471532802806e-06, + "loss": 0.1862, + "step": 8235 + }, + { + "epoch": 0.6524856407209348, + "grad_norm": 2.258782924325125, + "learning_rate": 5.692399428571857e-06, + "loss": 0.2012, + "step": 8236 + }, + { + "epoch": 0.6525648643295702, + "grad_norm": 2.051295655870663, + "learning_rate": 5.690083812750422e-06, + "loss": 0.158, + "step": 8237 + }, + { + "epoch": 0.6526440879382056, + "grad_norm": 1.4422505800086367, + "learning_rate": 5.687768480716233e-06, + "loss": 0.2302, + "step": 8238 + }, + { + "epoch": 0.6527233115468409, + "grad_norm": 1.4627107526879832, + "learning_rate": 5.685453432621741e-06, + "loss": 0.1629, + "step": 8239 + }, + { + "epoch": 0.6528025351554764, + "grad_norm": 1.8691078174129367, + "learning_rate": 5.683138668619381e-06, + "loss": 0.1945, + "step": 8240 + }, + { + "epoch": 0.6528817587641117, + "grad_norm": 1.5677957642832823, + "learning_rate": 5.680824188861564e-06, + "loss": 0.1172, + "step": 8241 + }, + { + "epoch": 0.6529609823727471, + "grad_norm": 1.7956218253747118, + "learning_rate": 5.678509993500695e-06, + "loss": 0.1411, + "step": 8242 + }, + { + "epoch": 0.6530402059813825, + "grad_norm": 1.8814500390160878, + "learning_rate": 5.676196082689149e-06, + "loss": 0.2051, + "step": 8243 + }, + { + "epoch": 0.6531194295900178, + "grad_norm": 1.6182176307545273, + "learning_rate": 5.673882456579282e-06, + "loss": 0.1673, + "step": 8244 + }, + { + "epoch": 0.6531986531986532, + "grad_norm": 1.4898042752183114, + "learning_rate": 5.6715691153234445e-06, + "loss": 0.2146, + "step": 8245 + }, + { + "epoch": 0.6532778768072885, + "grad_norm": 1.2612371101175517, + "learning_rate": 5.669256059073953e-06, + "loss": 0.113, + "step": 8246 + }, + { + "epoch": 0.653357100415924, + "grad_norm": 1.501050117884269, + "learning_rate": 5.666943287983106e-06, + "loss": 0.219, + "step": 8247 + }, + { + "epoch": 0.6534363240245593, + "grad_norm": 2.567253359423979, + "learning_rate": 5.664630802203201e-06, + "loss": 0.2652, + "step": 8248 + }, + { + "epoch": 0.6535155476331946, + "grad_norm": 1.7313811344018724, + "learning_rate": 5.662318601886496e-06, + "loss": 0.2632, + "step": 8249 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 1.8437192287823179, + "learning_rate": 5.660006687185235e-06, + "loss": 0.2027, + "step": 8250 + }, + { + "epoch": 0.6536739948504654, + "grad_norm": 1.7258103204043658, + "learning_rate": 5.657695058251656e-06, + "loss": 0.2712, + "step": 8251 + }, + { + "epoch": 0.6537532184591008, + "grad_norm": 1.7009843071513742, + "learning_rate": 5.655383715237963e-06, + "loss": 0.2087, + "step": 8252 + }, + { + "epoch": 0.6538324420677362, + "grad_norm": 1.3926597375631904, + "learning_rate": 5.653072658296344e-06, + "loss": 0.1345, + "step": 8253 + }, + { + "epoch": 0.6539116656763716, + "grad_norm": 1.680716453565513, + "learning_rate": 5.650761887578977e-06, + "loss": 0.2294, + "step": 8254 + }, + { + "epoch": 0.6539908892850069, + "grad_norm": 1.6853990228446765, + "learning_rate": 5.648451403238013e-06, + "loss": 0.2854, + "step": 8255 + }, + { + "epoch": 0.6540701128936423, + "grad_norm": 1.5846803940244052, + "learning_rate": 5.646141205425586e-06, + "loss": 0.1508, + "step": 8256 + }, + { + "epoch": 0.6541493365022777, + "grad_norm": 1.426951628329609, + "learning_rate": 5.643831294293808e-06, + "loss": 0.1719, + "step": 8257 + }, + { + "epoch": 0.654228560110913, + "grad_norm": 1.5752766927968165, + "learning_rate": 5.641521669994782e-06, + "loss": 0.185, + "step": 8258 + }, + { + "epoch": 0.6543077837195485, + "grad_norm": 1.7943956435528206, + "learning_rate": 5.639212332680581e-06, + "loss": 0.1895, + "step": 8259 + }, + { + "epoch": 0.6543870073281838, + "grad_norm": 1.443419387285084, + "learning_rate": 5.636903282503263e-06, + "loss": 0.1647, + "step": 8260 + }, + { + "epoch": 0.6544662309368192, + "grad_norm": 1.6627029591077733, + "learning_rate": 5.6345945196148734e-06, + "loss": 0.2123, + "step": 8261 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 1.2083913394737136, + "learning_rate": 5.63228604416743e-06, + "loss": 0.0895, + "step": 8262 + }, + { + "epoch": 0.6546246781540899, + "grad_norm": 1.7234141931210247, + "learning_rate": 5.62997785631293e-06, + "loss": 0.1781, + "step": 8263 + }, + { + "epoch": 0.6547039017627253, + "grad_norm": 1.3367523194438058, + "learning_rate": 5.627669956203365e-06, + "loss": 0.1369, + "step": 8264 + }, + { + "epoch": 0.6547831253713606, + "grad_norm": 1.7990404996814022, + "learning_rate": 5.6253623439906955e-06, + "loss": 0.3385, + "step": 8265 + }, + { + "epoch": 0.6548623489799961, + "grad_norm": 1.5872115950775798, + "learning_rate": 5.623055019826862e-06, + "loss": 0.2337, + "step": 8266 + }, + { + "epoch": 0.6549415725886314, + "grad_norm": 2.0680215129891235, + "learning_rate": 5.6207479838637995e-06, + "loss": 0.155, + "step": 8267 + }, + { + "epoch": 0.6550207961972668, + "grad_norm": 1.4252105373700081, + "learning_rate": 5.618441236253411e-06, + "loss": 0.1609, + "step": 8268 + }, + { + "epoch": 0.6551000198059022, + "grad_norm": 1.6338568759994834, + "learning_rate": 5.616134777147578e-06, + "loss": 0.2116, + "step": 8269 + }, + { + "epoch": 0.6551792434145375, + "grad_norm": 1.8941688066899198, + "learning_rate": 5.6138286066981815e-06, + "loss": 0.2034, + "step": 8270 + }, + { + "epoch": 0.6552584670231729, + "grad_norm": 1.718581445993617, + "learning_rate": 5.611522725057067e-06, + "loss": 0.2025, + "step": 8271 + }, + { + "epoch": 0.6553376906318082, + "grad_norm": 1.4140951727805453, + "learning_rate": 5.6092171323760635e-06, + "loss": 0.2221, + "step": 8272 + }, + { + "epoch": 0.6554169142404437, + "grad_norm": 1.4434099805442253, + "learning_rate": 5.6069118288069824e-06, + "loss": 0.1456, + "step": 8273 + }, + { + "epoch": 0.655496137849079, + "grad_norm": 1.554167909055817, + "learning_rate": 5.604606814501623e-06, + "loss": 0.1654, + "step": 8274 + }, + { + "epoch": 0.6555753614577144, + "grad_norm": 1.308334470915961, + "learning_rate": 5.602302089611755e-06, + "loss": 0.108, + "step": 8275 + }, + { + "epoch": 0.6556545850663498, + "grad_norm": 1.360120676151067, + "learning_rate": 5.599997654289129e-06, + "loss": 0.1583, + "step": 8276 + }, + { + "epoch": 0.6557338086749851, + "grad_norm": 1.8632749560870947, + "learning_rate": 5.5976935086854914e-06, + "loss": 0.262, + "step": 8277 + }, + { + "epoch": 0.6558130322836205, + "grad_norm": 1.3594113781132353, + "learning_rate": 5.595389652952555e-06, + "loss": 0.1823, + "step": 8278 + }, + { + "epoch": 0.6558922558922559, + "grad_norm": 1.262093241615721, + "learning_rate": 5.59308608724201e-06, + "loss": 0.1383, + "step": 8279 + }, + { + "epoch": 0.6559714795008913, + "grad_norm": 1.6542865620802023, + "learning_rate": 5.590782811705547e-06, + "loss": 0.1484, + "step": 8280 + }, + { + "epoch": 0.6560507031095266, + "grad_norm": 1.7750792881244633, + "learning_rate": 5.588479826494817e-06, + "loss": 0.1872, + "step": 8281 + }, + { + "epoch": 0.6561299267181621, + "grad_norm": 1.381334562936964, + "learning_rate": 5.5861771317614624e-06, + "loss": 0.1633, + "step": 8282 + }, + { + "epoch": 0.6562091503267974, + "grad_norm": 1.5196390700062017, + "learning_rate": 5.583874727657109e-06, + "loss": 0.1739, + "step": 8283 + }, + { + "epoch": 0.6562883739354327, + "grad_norm": 1.226429081274975, + "learning_rate": 5.581572614333356e-06, + "loss": 0.1241, + "step": 8284 + }, + { + "epoch": 0.6563675975440681, + "grad_norm": 1.538723332684617, + "learning_rate": 5.579270791941787e-06, + "loss": 0.1281, + "step": 8285 + }, + { + "epoch": 0.6564468211527035, + "grad_norm": 1.862194889943743, + "learning_rate": 5.5769692606339584e-06, + "loss": 0.2415, + "step": 8286 + }, + { + "epoch": 0.6565260447613389, + "grad_norm": 1.6825072547604767, + "learning_rate": 5.574668020561428e-06, + "loss": 0.1878, + "step": 8287 + }, + { + "epoch": 0.6566052683699742, + "grad_norm": 1.5372704747108141, + "learning_rate": 5.572367071875715e-06, + "loss": 0.1954, + "step": 8288 + }, + { + "epoch": 0.6566844919786097, + "grad_norm": 2.1053733020299146, + "learning_rate": 5.570066414728321e-06, + "loss": 0.2638, + "step": 8289 + }, + { + "epoch": 0.656763715587245, + "grad_norm": 1.1772582933802798, + "learning_rate": 5.567766049270742e-06, + "loss": 0.1342, + "step": 8290 + }, + { + "epoch": 0.6568429391958803, + "grad_norm": 1.2301156329721834, + "learning_rate": 5.5654659756544425e-06, + "loss": 0.1527, + "step": 8291 + }, + { + "epoch": 0.6569221628045158, + "grad_norm": 1.6200880575345542, + "learning_rate": 5.563166194030868e-06, + "loss": 0.1969, + "step": 8292 + }, + { + "epoch": 0.6570013864131511, + "grad_norm": 2.293687925661075, + "learning_rate": 5.560866704551454e-06, + "loss": 0.3003, + "step": 8293 + }, + { + "epoch": 0.6570806100217865, + "grad_norm": 1.6941611386529323, + "learning_rate": 5.5585675073676085e-06, + "loss": 0.127, + "step": 8294 + }, + { + "epoch": 0.6571598336304219, + "grad_norm": 1.2594594997771908, + "learning_rate": 5.556268602630721e-06, + "loss": 0.1857, + "step": 8295 + }, + { + "epoch": 0.6572390572390573, + "grad_norm": 1.3723528282572734, + "learning_rate": 5.553969990492164e-06, + "loss": 0.1511, + "step": 8296 + }, + { + "epoch": 0.6573182808476926, + "grad_norm": 1.3414673081936097, + "learning_rate": 5.5516716711032906e-06, + "loss": 0.1842, + "step": 8297 + }, + { + "epoch": 0.6573975044563279, + "grad_norm": 1.7230731338225382, + "learning_rate": 5.54937364461543e-06, + "loss": 0.2817, + "step": 8298 + }, + { + "epoch": 0.6574767280649634, + "grad_norm": 1.7529165224556071, + "learning_rate": 5.547075911179902e-06, + "loss": 0.2023, + "step": 8299 + }, + { + "epoch": 0.6575559516735987, + "grad_norm": 1.4044143302093148, + "learning_rate": 5.544778470948001e-06, + "loss": 0.1699, + "step": 8300 + }, + { + "epoch": 0.6576351752822341, + "grad_norm": 1.529663986852082, + "learning_rate": 5.542481324070996e-06, + "loss": 0.2502, + "step": 8301 + }, + { + "epoch": 0.6577143988908695, + "grad_norm": 1.281111555356486, + "learning_rate": 5.540184470700152e-06, + "loss": 0.1496, + "step": 8302 + }, + { + "epoch": 0.6577936224995049, + "grad_norm": 1.4695873417340555, + "learning_rate": 5.537887910986701e-06, + "loss": 0.2307, + "step": 8303 + }, + { + "epoch": 0.6578728461081402, + "grad_norm": 2.2157231188358044, + "learning_rate": 5.535591645081857e-06, + "loss": 0.2534, + "step": 8304 + }, + { + "epoch": 0.6579520697167756, + "grad_norm": 1.5163328286495172, + "learning_rate": 5.5332956731368245e-06, + "loss": 0.1811, + "step": 8305 + }, + { + "epoch": 0.658031293325411, + "grad_norm": 1.2490733694337601, + "learning_rate": 5.530999995302781e-06, + "loss": 0.1601, + "step": 8306 + }, + { + "epoch": 0.6581105169340463, + "grad_norm": 1.2071918252900604, + "learning_rate": 5.528704611730879e-06, + "loss": 0.1427, + "step": 8307 + }, + { + "epoch": 0.6581897405426818, + "grad_norm": 1.4515794317796984, + "learning_rate": 5.5264095225722705e-06, + "loss": 0.2078, + "step": 8308 + }, + { + "epoch": 0.6582689641513171, + "grad_norm": 1.3364392852096108, + "learning_rate": 5.524114727978067e-06, + "loss": 0.155, + "step": 8309 + }, + { + "epoch": 0.6583481877599525, + "grad_norm": 1.0463251475146411, + "learning_rate": 5.5218202280993725e-06, + "loss": 0.095, + "step": 8310 + }, + { + "epoch": 0.6584274113685878, + "grad_norm": 1.2098217240018057, + "learning_rate": 5.519526023087265e-06, + "loss": 0.1147, + "step": 8311 + }, + { + "epoch": 0.6585066349772232, + "grad_norm": 1.5521991050920934, + "learning_rate": 5.517232113092814e-06, + "loss": 0.1669, + "step": 8312 + }, + { + "epoch": 0.6585858585858586, + "grad_norm": 1.7277931955754604, + "learning_rate": 5.5149384982670585e-06, + "loss": 0.2507, + "step": 8313 + }, + { + "epoch": 0.6586650821944939, + "grad_norm": 1.398993685418058, + "learning_rate": 5.512645178761018e-06, + "loss": 0.1573, + "step": 8314 + }, + { + "epoch": 0.6587443058031294, + "grad_norm": 2.4732740187214763, + "learning_rate": 5.5103521547257045e-06, + "loss": 0.1874, + "step": 8315 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 1.3552063616391397, + "learning_rate": 5.508059426312099e-06, + "loss": 0.1867, + "step": 8316 + }, + { + "epoch": 0.6589027530204001, + "grad_norm": 1.2871837519881852, + "learning_rate": 5.5057669936711625e-06, + "loss": 0.1181, + "step": 8317 + }, + { + "epoch": 0.6589819766290355, + "grad_norm": 1.26733278893884, + "learning_rate": 5.503474856953849e-06, + "loss": 0.1507, + "step": 8318 + }, + { + "epoch": 0.6590612002376708, + "grad_norm": 1.8799595411547216, + "learning_rate": 5.50118301631108e-06, + "loss": 0.2174, + "step": 8319 + }, + { + "epoch": 0.6591404238463062, + "grad_norm": 1.4596648033533681, + "learning_rate": 5.498891471893758e-06, + "loss": 0.2096, + "step": 8320 + }, + { + "epoch": 0.6592196474549415, + "grad_norm": 1.5792234032728056, + "learning_rate": 5.49660022385278e-06, + "loss": 0.2104, + "step": 8321 + }, + { + "epoch": 0.659298871063577, + "grad_norm": 1.5745039753729293, + "learning_rate": 5.494309272339007e-06, + "loss": 0.1684, + "step": 8322 + }, + { + "epoch": 0.6593780946722123, + "grad_norm": 1.066572703120052, + "learning_rate": 5.492018617503284e-06, + "loss": 0.1083, + "step": 8323 + }, + { + "epoch": 0.6594573182808476, + "grad_norm": 1.3584827431389204, + "learning_rate": 5.48972825949645e-06, + "loss": 0.1688, + "step": 8324 + }, + { + "epoch": 0.6595365418894831, + "grad_norm": 1.6090902305800052, + "learning_rate": 5.487438198469306e-06, + "loss": 0.2315, + "step": 8325 + }, + { + "epoch": 0.6596157654981184, + "grad_norm": 1.9324663753762454, + "learning_rate": 5.485148434572645e-06, + "loss": 0.2677, + "step": 8326 + }, + { + "epoch": 0.6596949891067538, + "grad_norm": 1.7636580760046552, + "learning_rate": 5.48285896795723e-06, + "loss": 0.3376, + "step": 8327 + }, + { + "epoch": 0.6597742127153892, + "grad_norm": 1.7047806139347148, + "learning_rate": 5.480569798773822e-06, + "loss": 0.2314, + "step": 8328 + }, + { + "epoch": 0.6598534363240246, + "grad_norm": 1.4507597322000743, + "learning_rate": 5.478280927173145e-06, + "loss": 0.1693, + "step": 8329 + }, + { + "epoch": 0.6599326599326599, + "grad_norm": 1.4116253261484475, + "learning_rate": 5.4759923533059105e-06, + "loss": 0.2657, + "step": 8330 + }, + { + "epoch": 0.6600118835412953, + "grad_norm": 1.3597514182091908, + "learning_rate": 5.473704077322814e-06, + "loss": 0.1797, + "step": 8331 + }, + { + "epoch": 0.6600911071499307, + "grad_norm": 1.5648248443606272, + "learning_rate": 5.471416099374525e-06, + "loss": 0.1506, + "step": 8332 + }, + { + "epoch": 0.660170330758566, + "grad_norm": 1.534842050700167, + "learning_rate": 5.469128419611691e-06, + "loss": 0.1571, + "step": 8333 + }, + { + "epoch": 0.6602495543672015, + "grad_norm": 1.298555509051071, + "learning_rate": 5.466841038184954e-06, + "loss": 0.126, + "step": 8334 + }, + { + "epoch": 0.6603287779758368, + "grad_norm": 1.5335731138737194, + "learning_rate": 5.464553955244922e-06, + "loss": 0.1817, + "step": 8335 + }, + { + "epoch": 0.6604080015844722, + "grad_norm": 2.0217865644026936, + "learning_rate": 5.4622671709421856e-06, + "loss": 0.2089, + "step": 8336 + }, + { + "epoch": 0.6604872251931075, + "grad_norm": 1.5634586385554383, + "learning_rate": 5.459980685427326e-06, + "loss": 0.2003, + "step": 8337 + }, + { + "epoch": 0.6605664488017429, + "grad_norm": 1.4022026133342103, + "learning_rate": 5.457694498850892e-06, + "loss": 0.1585, + "step": 8338 + }, + { + "epoch": 0.6606456724103783, + "grad_norm": 1.4575779322529085, + "learning_rate": 5.455408611363416e-06, + "loss": 0.1778, + "step": 8339 + }, + { + "epoch": 0.6607248960190136, + "grad_norm": 1.5424267678030532, + "learning_rate": 5.45312302311542e-06, + "loss": 0.1827, + "step": 8340 + }, + { + "epoch": 0.6608041196276491, + "grad_norm": 1.5607616711418228, + "learning_rate": 5.450837734257395e-06, + "loss": 0.177, + "step": 8341 + }, + { + "epoch": 0.6608833432362844, + "grad_norm": 1.3096820485429626, + "learning_rate": 5.448552744939815e-06, + "loss": 0.1646, + "step": 8342 + }, + { + "epoch": 0.6609625668449198, + "grad_norm": 1.364758389756013, + "learning_rate": 5.446268055313132e-06, + "loss": 0.1554, + "step": 8343 + }, + { + "epoch": 0.6610417904535552, + "grad_norm": 1.26670137609939, + "learning_rate": 5.443983665527792e-06, + "loss": 0.2004, + "step": 8344 + }, + { + "epoch": 0.6611210140621905, + "grad_norm": 1.5742824408510516, + "learning_rate": 5.441699575734204e-06, + "loss": 0.2143, + "step": 8345 + }, + { + "epoch": 0.6612002376708259, + "grad_norm": 1.4884088168282303, + "learning_rate": 5.439415786082762e-06, + "loss": 0.1784, + "step": 8346 + }, + { + "epoch": 0.6612794612794612, + "grad_norm": 1.5024291816741382, + "learning_rate": 5.437132296723852e-06, + "loss": 0.1437, + "step": 8347 + }, + { + "epoch": 0.6613586848880967, + "grad_norm": 1.3772197022212198, + "learning_rate": 5.434849107807823e-06, + "loss": 0.1261, + "step": 8348 + }, + { + "epoch": 0.661437908496732, + "grad_norm": 1.5384801302512507, + "learning_rate": 5.432566219485012e-06, + "loss": 0.2087, + "step": 8349 + }, + { + "epoch": 0.6615171321053674, + "grad_norm": 1.4540796280258301, + "learning_rate": 5.430283631905742e-06, + "loss": 0.2003, + "step": 8350 + }, + { + "epoch": 0.6615963557140028, + "grad_norm": 1.4486425732561794, + "learning_rate": 5.428001345220306e-06, + "loss": 0.1287, + "step": 8351 + }, + { + "epoch": 0.6616755793226381, + "grad_norm": 1.4328318261930029, + "learning_rate": 5.425719359578978e-06, + "loss": 0.1559, + "step": 8352 + }, + { + "epoch": 0.6617548029312735, + "grad_norm": 1.8826867500867754, + "learning_rate": 5.423437675132025e-06, + "loss": 0.2616, + "step": 8353 + }, + { + "epoch": 0.6618340265399089, + "grad_norm": 1.5010281861733883, + "learning_rate": 5.42115629202968e-06, + "loss": 0.211, + "step": 8354 + }, + { + "epoch": 0.6619132501485443, + "grad_norm": 1.467573700362327, + "learning_rate": 5.4188752104221565e-06, + "loss": 0.21, + "step": 8355 + }, + { + "epoch": 0.6619924737571796, + "grad_norm": 1.4197586019676525, + "learning_rate": 5.416594430459663e-06, + "loss": 0.2292, + "step": 8356 + }, + { + "epoch": 0.6620716973658151, + "grad_norm": 2.009264206716873, + "learning_rate": 5.41431395229237e-06, + "loss": 0.2649, + "step": 8357 + }, + { + "epoch": 0.6621509209744504, + "grad_norm": 1.331529473981928, + "learning_rate": 5.41203377607044e-06, + "loss": 0.1584, + "step": 8358 + }, + { + "epoch": 0.6622301445830857, + "grad_norm": 1.8404023642541216, + "learning_rate": 5.409753901944006e-06, + "loss": 0.2253, + "step": 8359 + }, + { + "epoch": 0.6623093681917211, + "grad_norm": 1.5647360236328731, + "learning_rate": 5.407474330063194e-06, + "loss": 0.1901, + "step": 8360 + }, + { + "epoch": 0.6623885918003565, + "grad_norm": 1.6595562334548706, + "learning_rate": 5.4051950605781e-06, + "loss": 0.1998, + "step": 8361 + }, + { + "epoch": 0.6624678154089919, + "grad_norm": 1.5990882896869136, + "learning_rate": 5.402916093638798e-06, + "loss": 0.2028, + "step": 8362 + }, + { + "epoch": 0.6625470390176272, + "grad_norm": 1.6378309146709, + "learning_rate": 5.400637429395357e-06, + "loss": 0.1957, + "step": 8363 + }, + { + "epoch": 0.6626262626262627, + "grad_norm": 1.6828487593144763, + "learning_rate": 5.398359067997808e-06, + "loss": 0.2299, + "step": 8364 + }, + { + "epoch": 0.662705486234898, + "grad_norm": 1.4290561703713485, + "learning_rate": 5.3960810095961705e-06, + "loss": 0.1934, + "step": 8365 + }, + { + "epoch": 0.6627847098435333, + "grad_norm": 1.6431002619883153, + "learning_rate": 5.39380325434045e-06, + "loss": 0.1656, + "step": 8366 + }, + { + "epoch": 0.6628639334521688, + "grad_norm": 1.5931842727217678, + "learning_rate": 5.3915258023806195e-06, + "loss": 0.2323, + "step": 8367 + }, + { + "epoch": 0.6629431570608041, + "grad_norm": 1.4720821005836915, + "learning_rate": 5.3892486538666386e-06, + "loss": 0.1248, + "step": 8368 + }, + { + "epoch": 0.6630223806694395, + "grad_norm": 1.4884125986257433, + "learning_rate": 5.386971808948451e-06, + "loss": 0.183, + "step": 8369 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 6.621177502333374, + "learning_rate": 5.384695267775975e-06, + "loss": 0.2898, + "step": 8370 + }, + { + "epoch": 0.6631808278867103, + "grad_norm": 1.8678937727996634, + "learning_rate": 5.382419030499107e-06, + "loss": 0.2177, + "step": 8371 + }, + { + "epoch": 0.6632600514953456, + "grad_norm": 1.4177163884251203, + "learning_rate": 5.380143097267723e-06, + "loss": 0.162, + "step": 8372 + }, + { + "epoch": 0.6633392751039809, + "grad_norm": 1.6491514939417578, + "learning_rate": 5.377867468231695e-06, + "loss": 0.1517, + "step": 8373 + }, + { + "epoch": 0.6634184987126164, + "grad_norm": 1.3102944757942518, + "learning_rate": 5.3755921435408464e-06, + "loss": 0.1488, + "step": 8374 + }, + { + "epoch": 0.6634977223212517, + "grad_norm": 1.3297498932773755, + "learning_rate": 5.373317123345008e-06, + "loss": 0.1735, + "step": 8375 + }, + { + "epoch": 0.6635769459298871, + "grad_norm": 1.3075060533436977, + "learning_rate": 5.371042407793974e-06, + "loss": 0.1232, + "step": 8376 + }, + { + "epoch": 0.6636561695385225, + "grad_norm": 1.6246313316613459, + "learning_rate": 5.368767997037521e-06, + "loss": 0.2386, + "step": 8377 + }, + { + "epoch": 0.6637353931471579, + "grad_norm": 1.2445499720680815, + "learning_rate": 5.366493891225415e-06, + "loss": 0.1381, + "step": 8378 + }, + { + "epoch": 0.6638146167557932, + "grad_norm": 1.2485231242478856, + "learning_rate": 5.3642200905073914e-06, + "loss": 0.1775, + "step": 8379 + }, + { + "epoch": 0.6638938403644286, + "grad_norm": 1.429042663051072, + "learning_rate": 5.361946595033165e-06, + "loss": 0.1709, + "step": 8380 + }, + { + "epoch": 0.663973063973064, + "grad_norm": 1.4968637694075029, + "learning_rate": 5.359673404952442e-06, + "loss": 0.183, + "step": 8381 + }, + { + "epoch": 0.6640522875816993, + "grad_norm": 1.3586940217272707, + "learning_rate": 5.357400520414898e-06, + "loss": 0.221, + "step": 8382 + }, + { + "epoch": 0.6641315111903348, + "grad_norm": 1.44246050573437, + "learning_rate": 5.355127941570191e-06, + "loss": 0.1813, + "step": 8383 + }, + { + "epoch": 0.6642107347989701, + "grad_norm": 1.1793195336546596, + "learning_rate": 5.352855668567956e-06, + "loss": 0.0759, + "step": 8384 + }, + { + "epoch": 0.6642899584076055, + "grad_norm": 1.8065011852859472, + "learning_rate": 5.350583701557816e-06, + "loss": 0.2097, + "step": 8385 + }, + { + "epoch": 0.6643691820162408, + "grad_norm": 1.4303164112575943, + "learning_rate": 5.348312040689369e-06, + "loss": 0.1464, + "step": 8386 + }, + { + "epoch": 0.6644484056248762, + "grad_norm": 1.3858884267602292, + "learning_rate": 5.346040686112189e-06, + "loss": 0.1678, + "step": 8387 + }, + { + "epoch": 0.6645276292335116, + "grad_norm": 1.7496808836607711, + "learning_rate": 5.34376963797584e-06, + "loss": 0.1958, + "step": 8388 + }, + { + "epoch": 0.6646068528421469, + "grad_norm": 1.632181402126805, + "learning_rate": 5.3414988964298555e-06, + "loss": 0.2187, + "step": 8389 + }, + { + "epoch": 0.6646860764507824, + "grad_norm": 1.6491838932869427, + "learning_rate": 5.3392284616237486e-06, + "loss": 0.1495, + "step": 8390 + }, + { + "epoch": 0.6647653000594177, + "grad_norm": 1.366183215607451, + "learning_rate": 5.336958333707026e-06, + "loss": 0.1416, + "step": 8391 + }, + { + "epoch": 0.6648445236680531, + "grad_norm": 1.8062614768417506, + "learning_rate": 5.33468851282916e-06, + "loss": 0.194, + "step": 8392 + }, + { + "epoch": 0.6649237472766885, + "grad_norm": 1.7222648244174699, + "learning_rate": 5.332418999139604e-06, + "loss": 0.196, + "step": 8393 + }, + { + "epoch": 0.6650029708853238, + "grad_norm": 1.3896866463691977, + "learning_rate": 5.330149792787801e-06, + "loss": 0.1467, + "step": 8394 + }, + { + "epoch": 0.6650821944939592, + "grad_norm": 2.078525204593878, + "learning_rate": 5.3278808939231654e-06, + "loss": 0.2262, + "step": 8395 + }, + { + "epoch": 0.6651614181025945, + "grad_norm": 1.8430189050471089, + "learning_rate": 5.32561230269509e-06, + "loss": 0.1865, + "step": 8396 + }, + { + "epoch": 0.66524064171123, + "grad_norm": 1.6271318672443138, + "learning_rate": 5.32334401925295e-06, + "loss": 0.1958, + "step": 8397 + }, + { + "epoch": 0.6653198653198653, + "grad_norm": 1.5949078852329102, + "learning_rate": 5.321076043746108e-06, + "loss": 0.2202, + "step": 8398 + }, + { + "epoch": 0.6653990889285007, + "grad_norm": 1.7590541902497947, + "learning_rate": 5.318808376323895e-06, + "loss": 0.2365, + "step": 8399 + }, + { + "epoch": 0.6654783125371361, + "grad_norm": 1.8846674449433654, + "learning_rate": 5.316541017135622e-06, + "loss": 0.1719, + "step": 8400 + }, + { + "epoch": 0.6655575361457714, + "grad_norm": 1.5738869492393905, + "learning_rate": 5.314273966330591e-06, + "loss": 0.1782, + "step": 8401 + }, + { + "epoch": 0.6656367597544068, + "grad_norm": 1.808684709339619, + "learning_rate": 5.3120072240580735e-06, + "loss": 0.2737, + "step": 8402 + }, + { + "epoch": 0.6657159833630422, + "grad_norm": 1.429516849546963, + "learning_rate": 5.309740790467319e-06, + "loss": 0.1813, + "step": 8403 + }, + { + "epoch": 0.6657952069716776, + "grad_norm": 1.6111987538637318, + "learning_rate": 5.307474665707569e-06, + "loss": 0.1756, + "step": 8404 + }, + { + "epoch": 0.6658744305803129, + "grad_norm": 1.237369458151828, + "learning_rate": 5.305208849928034e-06, + "loss": 0.1032, + "step": 8405 + }, + { + "epoch": 0.6659536541889483, + "grad_norm": 1.9310984511875209, + "learning_rate": 5.302943343277902e-06, + "loss": 0.2288, + "step": 8406 + }, + { + "epoch": 0.6660328777975837, + "grad_norm": 1.8574180080224556, + "learning_rate": 5.300678145906354e-06, + "loss": 0.2493, + "step": 8407 + }, + { + "epoch": 0.666112101406219, + "grad_norm": 1.5272791552720995, + "learning_rate": 5.298413257962538e-06, + "loss": 0.1952, + "step": 8408 + }, + { + "epoch": 0.6661913250148545, + "grad_norm": 1.4365339418974972, + "learning_rate": 5.296148679595583e-06, + "loss": 0.1676, + "step": 8409 + }, + { + "epoch": 0.6662705486234898, + "grad_norm": 1.373422826464152, + "learning_rate": 5.293884410954608e-06, + "loss": 0.1653, + "step": 8410 + }, + { + "epoch": 0.6663497722321252, + "grad_norm": 1.2286778427976257, + "learning_rate": 5.291620452188699e-06, + "loss": 0.1301, + "step": 8411 + }, + { + "epoch": 0.6664289958407605, + "grad_norm": 1.7126344463740855, + "learning_rate": 5.28935680344693e-06, + "loss": 0.1142, + "step": 8412 + }, + { + "epoch": 0.6665082194493959, + "grad_norm": 1.4778085301208934, + "learning_rate": 5.287093464878343e-06, + "loss": 0.1761, + "step": 8413 + }, + { + "epoch": 0.6665874430580313, + "grad_norm": 1.6638324173461492, + "learning_rate": 5.28483043663198e-06, + "loss": 0.221, + "step": 8414 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.4632197780577147, + "learning_rate": 5.282567718856845e-06, + "loss": 0.1946, + "step": 8415 + }, + { + "epoch": 0.6667458902753021, + "grad_norm": 2.0202293446375528, + "learning_rate": 5.280305311701921e-06, + "loss": 0.2782, + "step": 8416 + }, + { + "epoch": 0.6668251138839374, + "grad_norm": 1.6224498219175085, + "learning_rate": 5.278043215316189e-06, + "loss": 0.1863, + "step": 8417 + }, + { + "epoch": 0.6669043374925728, + "grad_norm": 1.758034308953991, + "learning_rate": 5.275781429848589e-06, + "loss": 0.2892, + "step": 8418 + }, + { + "epoch": 0.6669835611012082, + "grad_norm": 1.5396529438966589, + "learning_rate": 5.273519955448047e-06, + "loss": 0.1406, + "step": 8419 + }, + { + "epoch": 0.6670627847098435, + "grad_norm": 1.3633056986651022, + "learning_rate": 5.271258792263476e-06, + "loss": 0.1103, + "step": 8420 + }, + { + "epoch": 0.6671420083184789, + "grad_norm": 1.3332009559296645, + "learning_rate": 5.268997940443762e-06, + "loss": 0.1674, + "step": 8421 + }, + { + "epoch": 0.6672212319271142, + "grad_norm": 1.1836574719363169, + "learning_rate": 5.266737400137765e-06, + "loss": 0.1451, + "step": 8422 + }, + { + "epoch": 0.6673004555357497, + "grad_norm": 1.7315160507975944, + "learning_rate": 5.26447717149434e-06, + "loss": 0.2652, + "step": 8423 + }, + { + "epoch": 0.667379679144385, + "grad_norm": 1.5924275843617288, + "learning_rate": 5.2622172546623055e-06, + "loss": 0.2263, + "step": 8424 + }, + { + "epoch": 0.6674589027530204, + "grad_norm": 1.294274316531856, + "learning_rate": 5.259957649790466e-06, + "loss": 0.1423, + "step": 8425 + }, + { + "epoch": 0.6675381263616558, + "grad_norm": 1.6395766360027566, + "learning_rate": 5.257698357027609e-06, + "loss": 0.1958, + "step": 8426 + }, + { + "epoch": 0.6676173499702911, + "grad_norm": 1.576687506269694, + "learning_rate": 5.2554393765225e-06, + "loss": 0.1833, + "step": 8427 + }, + { + "epoch": 0.6676965735789265, + "grad_norm": 1.3230966278515903, + "learning_rate": 5.253180708423877e-06, + "loss": 0.1677, + "step": 8428 + }, + { + "epoch": 0.6677757971875619, + "grad_norm": 1.9005386966393554, + "learning_rate": 5.25092235288046e-06, + "loss": 0.2465, + "step": 8429 + }, + { + "epoch": 0.6678550207961973, + "grad_norm": 1.9547205849324198, + "learning_rate": 5.248664310040958e-06, + "loss": 0.3169, + "step": 8430 + }, + { + "epoch": 0.6679342444048326, + "grad_norm": 1.7870250985226812, + "learning_rate": 5.246406580054051e-06, + "loss": 0.2031, + "step": 8431 + }, + { + "epoch": 0.6680134680134681, + "grad_norm": 1.4253189701582614, + "learning_rate": 5.244149163068394e-06, + "loss": 0.2081, + "step": 8432 + }, + { + "epoch": 0.6680926916221034, + "grad_norm": 1.803340127607196, + "learning_rate": 5.241892059232634e-06, + "loss": 0.2729, + "step": 8433 + }, + { + "epoch": 0.6681719152307387, + "grad_norm": 1.4727999636960147, + "learning_rate": 5.239635268695386e-06, + "loss": 0.2445, + "step": 8434 + }, + { + "epoch": 0.6682511388393741, + "grad_norm": 1.326679382717561, + "learning_rate": 5.237378791605249e-06, + "loss": 0.1238, + "step": 8435 + }, + { + "epoch": 0.6683303624480095, + "grad_norm": 1.6343449249181874, + "learning_rate": 5.235122628110805e-06, + "loss": 0.1741, + "step": 8436 + }, + { + "epoch": 0.6684095860566449, + "grad_norm": 1.6066480354377857, + "learning_rate": 5.232866778360608e-06, + "loss": 0.1658, + "step": 8437 + }, + { + "epoch": 0.6684888096652802, + "grad_norm": 1.248186060559615, + "learning_rate": 5.230611242503193e-06, + "loss": 0.2099, + "step": 8438 + }, + { + "epoch": 0.6685680332739157, + "grad_norm": 1.4626188130594877, + "learning_rate": 5.228356020687082e-06, + "loss": 0.1338, + "step": 8439 + }, + { + "epoch": 0.668647256882551, + "grad_norm": 1.9456100702607746, + "learning_rate": 5.226101113060769e-06, + "loss": 0.3776, + "step": 8440 + }, + { + "epoch": 0.6687264804911863, + "grad_norm": 1.358269659326838, + "learning_rate": 5.223846519772722e-06, + "loss": 0.1433, + "step": 8441 + }, + { + "epoch": 0.6688057040998218, + "grad_norm": 1.1021769097165868, + "learning_rate": 5.221592240971403e-06, + "loss": 0.0847, + "step": 8442 + }, + { + "epoch": 0.6688849277084571, + "grad_norm": 1.482518461292099, + "learning_rate": 5.219338276805243e-06, + "loss": 0.2636, + "step": 8443 + }, + { + "epoch": 0.6689641513170925, + "grad_norm": 1.474678263545345, + "learning_rate": 5.217084627422656e-06, + "loss": 0.1746, + "step": 8444 + }, + { + "epoch": 0.6690433749257279, + "grad_norm": 1.7854770861797706, + "learning_rate": 5.214831292972027e-06, + "loss": 0.2393, + "step": 8445 + }, + { + "epoch": 0.6691225985343633, + "grad_norm": 1.571451543787154, + "learning_rate": 5.212578273601738e-06, + "loss": 0.2033, + "step": 8446 + }, + { + "epoch": 0.6692018221429986, + "grad_norm": 1.4525476942226887, + "learning_rate": 5.210325569460133e-06, + "loss": 0.1512, + "step": 8447 + }, + { + "epoch": 0.6692810457516339, + "grad_norm": 1.6151916703297589, + "learning_rate": 5.208073180695538e-06, + "loss": 0.2012, + "step": 8448 + }, + { + "epoch": 0.6693602693602694, + "grad_norm": 1.5837921310521381, + "learning_rate": 5.205821107456273e-06, + "loss": 0.2181, + "step": 8449 + }, + { + "epoch": 0.6694394929689047, + "grad_norm": 1.4985629769247983, + "learning_rate": 5.203569349890618e-06, + "loss": 0.1749, + "step": 8450 + }, + { + "epoch": 0.6695187165775401, + "grad_norm": 1.4477125860789766, + "learning_rate": 5.201317908146843e-06, + "loss": 0.14, + "step": 8451 + }, + { + "epoch": 0.6695979401861755, + "grad_norm": 1.2128356644954386, + "learning_rate": 5.199066782373194e-06, + "loss": 0.1041, + "step": 8452 + }, + { + "epoch": 0.6696771637948109, + "grad_norm": 1.7380161149990438, + "learning_rate": 5.196815972717897e-06, + "loss": 0.2273, + "step": 8453 + }, + { + "epoch": 0.6697563874034462, + "grad_norm": 1.320456483444575, + "learning_rate": 5.194565479329154e-06, + "loss": 0.1702, + "step": 8454 + }, + { + "epoch": 0.6698356110120816, + "grad_norm": 1.2760780946377717, + "learning_rate": 5.192315302355153e-06, + "loss": 0.121, + "step": 8455 + }, + { + "epoch": 0.669914834620717, + "grad_norm": 1.9602400991161062, + "learning_rate": 5.190065441944059e-06, + "loss": 0.2023, + "step": 8456 + }, + { + "epoch": 0.6699940582293523, + "grad_norm": 1.6601163865410746, + "learning_rate": 5.187815898244006e-06, + "loss": 0.1687, + "step": 8457 + }, + { + "epoch": 0.6700732818379878, + "grad_norm": 1.7714548939190813, + "learning_rate": 5.185566671403126e-06, + "loss": 0.2346, + "step": 8458 + }, + { + "epoch": 0.6701525054466231, + "grad_norm": 1.963645970252486, + "learning_rate": 5.183317761569515e-06, + "loss": 0.2416, + "step": 8459 + }, + { + "epoch": 0.6702317290552585, + "grad_norm": 1.6347264964106687, + "learning_rate": 5.181069168891248e-06, + "loss": 0.1938, + "step": 8460 + }, + { + "epoch": 0.6703109526638938, + "grad_norm": 1.316388528324005, + "learning_rate": 5.178820893516394e-06, + "loss": 0.1166, + "step": 8461 + }, + { + "epoch": 0.6703901762725292, + "grad_norm": 1.7660386645387534, + "learning_rate": 5.176572935592986e-06, + "loss": 0.179, + "step": 8462 + }, + { + "epoch": 0.6704693998811646, + "grad_norm": 1.692958187418363, + "learning_rate": 5.1743252952690385e-06, + "loss": 0.2476, + "step": 8463 + }, + { + "epoch": 0.6705486234897999, + "grad_norm": 1.455119160419921, + "learning_rate": 5.172077972692553e-06, + "loss": 0.2149, + "step": 8464 + }, + { + "epoch": 0.6706278470984354, + "grad_norm": 1.5222541645852785, + "learning_rate": 5.1698309680115024e-06, + "loss": 0.231, + "step": 8465 + }, + { + "epoch": 0.6707070707070707, + "grad_norm": 1.8655684011704334, + "learning_rate": 5.167584281373838e-06, + "loss": 0.2481, + "step": 8466 + }, + { + "epoch": 0.6707862943157061, + "grad_norm": 0.9371901567203602, + "learning_rate": 5.165337912927502e-06, + "loss": 0.0795, + "step": 8467 + }, + { + "epoch": 0.6708655179243415, + "grad_norm": 1.9303799189343307, + "learning_rate": 5.1630918628204e-06, + "loss": 0.1398, + "step": 8468 + }, + { + "epoch": 0.6709447415329768, + "grad_norm": 1.237869139683717, + "learning_rate": 5.1608461312004245e-06, + "loss": 0.1686, + "step": 8469 + }, + { + "epoch": 0.6710239651416122, + "grad_norm": 1.330205690766006, + "learning_rate": 5.158600718215443e-06, + "loss": 0.1524, + "step": 8470 + }, + { + "epoch": 0.6711031887502475, + "grad_norm": 1.4282553567661662, + "learning_rate": 5.156355624013314e-06, + "loss": 0.2584, + "step": 8471 + }, + { + "epoch": 0.671182412358883, + "grad_norm": 1.5205653402761046, + "learning_rate": 5.15411084874186e-06, + "loss": 0.2449, + "step": 8472 + }, + { + "epoch": 0.6712616359675183, + "grad_norm": 1.7682987176796745, + "learning_rate": 5.151866392548886e-06, + "loss": 0.1976, + "step": 8473 + }, + { + "epoch": 0.6713408595761537, + "grad_norm": 1.2274999670131204, + "learning_rate": 5.149622255582185e-06, + "loss": 0.1577, + "step": 8474 + }, + { + "epoch": 0.6714200831847891, + "grad_norm": 1.761366219411446, + "learning_rate": 5.147378437989522e-06, + "loss": 0.1403, + "step": 8475 + }, + { + "epoch": 0.6714993067934244, + "grad_norm": 1.6078488067989742, + "learning_rate": 5.145134939918634e-06, + "loss": 0.1449, + "step": 8476 + }, + { + "epoch": 0.6715785304020598, + "grad_norm": 1.522582411569965, + "learning_rate": 5.1428917615172555e-06, + "loss": 0.1931, + "step": 8477 + }, + { + "epoch": 0.6716577540106952, + "grad_norm": 1.3261246424459898, + "learning_rate": 5.140648902933083e-06, + "loss": 0.1422, + "step": 8478 + }, + { + "epoch": 0.6717369776193306, + "grad_norm": 1.2471759697094842, + "learning_rate": 5.138406364313795e-06, + "loss": 0.1292, + "step": 8479 + }, + { + "epoch": 0.6718162012279659, + "grad_norm": 1.4296797541151514, + "learning_rate": 5.136164145807059e-06, + "loss": 0.1702, + "step": 8480 + }, + { + "epoch": 0.6718954248366014, + "grad_norm": 2.0731282130675117, + "learning_rate": 5.13392224756051e-06, + "loss": 0.2812, + "step": 8481 + }, + { + "epoch": 0.6719746484452367, + "grad_norm": 1.8748077553442462, + "learning_rate": 5.131680669721768e-06, + "loss": 0.1908, + "step": 8482 + }, + { + "epoch": 0.672053872053872, + "grad_norm": 1.4970895713835946, + "learning_rate": 5.129439412438424e-06, + "loss": 0.2165, + "step": 8483 + }, + { + "epoch": 0.6721330956625075, + "grad_norm": 2.3464362368497955, + "learning_rate": 5.127198475858064e-06, + "loss": 0.338, + "step": 8484 + }, + { + "epoch": 0.6722123192711428, + "grad_norm": 1.442024160308719, + "learning_rate": 5.124957860128237e-06, + "loss": 0.1624, + "step": 8485 + }, + { + "epoch": 0.6722915428797782, + "grad_norm": 1.445072924192, + "learning_rate": 5.122717565396474e-06, + "loss": 0.2108, + "step": 8486 + }, + { + "epoch": 0.6723707664884135, + "grad_norm": 1.2247059397253373, + "learning_rate": 5.1204775918102955e-06, + "loss": 0.1416, + "step": 8487 + }, + { + "epoch": 0.6724499900970489, + "grad_norm": 1.536056124793062, + "learning_rate": 5.11823793951719e-06, + "loss": 0.133, + "step": 8488 + }, + { + "epoch": 0.6725292137056843, + "grad_norm": 1.95175779781337, + "learning_rate": 5.115998608664621e-06, + "loss": 0.2876, + "step": 8489 + }, + { + "epoch": 0.6726084373143196, + "grad_norm": 1.095147510776463, + "learning_rate": 5.1137595994000475e-06, + "loss": 0.124, + "step": 8490 + }, + { + "epoch": 0.6726876609229551, + "grad_norm": 1.3561832574624626, + "learning_rate": 5.111520911870894e-06, + "loss": 0.1466, + "step": 8491 + }, + { + "epoch": 0.6727668845315904, + "grad_norm": 1.6023852398879184, + "learning_rate": 5.109282546224563e-06, + "loss": 0.197, + "step": 8492 + }, + { + "epoch": 0.6728461081402258, + "grad_norm": 1.7465342089412041, + "learning_rate": 5.107044502608447e-06, + "loss": 0.2545, + "step": 8493 + }, + { + "epoch": 0.6729253317488612, + "grad_norm": 1.6874387232615442, + "learning_rate": 5.104806781169906e-06, + "loss": 0.1764, + "step": 8494 + }, + { + "epoch": 0.6730045553574965, + "grad_norm": 2.1778429413109133, + "learning_rate": 5.102569382056281e-06, + "loss": 0.2201, + "step": 8495 + }, + { + "epoch": 0.6730837789661319, + "grad_norm": 1.3040181587514807, + "learning_rate": 5.100332305414902e-06, + "loss": 0.1437, + "step": 8496 + }, + { + "epoch": 0.6731630025747672, + "grad_norm": 1.2939534298064392, + "learning_rate": 5.098095551393066e-06, + "loss": 0.1794, + "step": 8497 + }, + { + "epoch": 0.6732422261834027, + "grad_norm": 1.73128537738855, + "learning_rate": 5.095859120138049e-06, + "loss": 0.1837, + "step": 8498 + }, + { + "epoch": 0.673321449792038, + "grad_norm": 1.4551562330444414, + "learning_rate": 5.093623011797108e-06, + "loss": 0.1329, + "step": 8499 + }, + { + "epoch": 0.6734006734006734, + "grad_norm": 1.7354653375431057, + "learning_rate": 5.091387226517489e-06, + "loss": 0.2266, + "step": 8500 + }, + { + "epoch": 0.6734798970093088, + "grad_norm": 1.287523177847614, + "learning_rate": 5.089151764446403e-06, + "loss": 0.1332, + "step": 8501 + }, + { + "epoch": 0.6735591206179441, + "grad_norm": 1.1666422224273876, + "learning_rate": 5.086916625731038e-06, + "loss": 0.0948, + "step": 8502 + }, + { + "epoch": 0.6736383442265795, + "grad_norm": 1.9109298025577046, + "learning_rate": 5.084681810518577e-06, + "loss": 0.155, + "step": 8503 + }, + { + "epoch": 0.6737175678352149, + "grad_norm": 1.6270810382742522, + "learning_rate": 5.0824473189561695e-06, + "loss": 0.2308, + "step": 8504 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 1.4288671467595624, + "learning_rate": 5.080213151190938e-06, + "loss": 0.1473, + "step": 8505 + }, + { + "epoch": 0.6738760150524856, + "grad_norm": 1.870443493909983, + "learning_rate": 5.077979307370004e-06, + "loss": 0.2177, + "step": 8506 + }, + { + "epoch": 0.6739552386611211, + "grad_norm": 2.0361578428045224, + "learning_rate": 5.075745787640448e-06, + "loss": 0.3307, + "step": 8507 + }, + { + "epoch": 0.6740344622697564, + "grad_norm": 1.3653574228417908, + "learning_rate": 5.073512592149334e-06, + "loss": 0.1854, + "step": 8508 + }, + { + "epoch": 0.6741136858783917, + "grad_norm": 1.5414280084285639, + "learning_rate": 5.071279721043716e-06, + "loss": 0.1735, + "step": 8509 + }, + { + "epoch": 0.6741929094870271, + "grad_norm": 2.0740311219640004, + "learning_rate": 5.069047174470613e-06, + "loss": 0.2222, + "step": 8510 + }, + { + "epoch": 0.6742721330956625, + "grad_norm": 1.6227142469765012, + "learning_rate": 5.066814952577021e-06, + "loss": 0.2294, + "step": 8511 + }, + { + "epoch": 0.6743513567042979, + "grad_norm": 1.6920823613896925, + "learning_rate": 5.064583055509935e-06, + "loss": 0.2339, + "step": 8512 + }, + { + "epoch": 0.6744305803129332, + "grad_norm": 1.4006838480031687, + "learning_rate": 5.062351483416304e-06, + "loss": 0.196, + "step": 8513 + }, + { + "epoch": 0.6745098039215687, + "grad_norm": 1.4031934165158546, + "learning_rate": 5.060120236443071e-06, + "loss": 0.1596, + "step": 8514 + }, + { + "epoch": 0.674589027530204, + "grad_norm": 1.5377744173701284, + "learning_rate": 5.057889314737148e-06, + "loss": 0.1583, + "step": 8515 + }, + { + "epoch": 0.6746682511388393, + "grad_norm": 1.668182620156662, + "learning_rate": 5.055658718445435e-06, + "loss": 0.1983, + "step": 8516 + }, + { + "epoch": 0.6747474747474748, + "grad_norm": 2.1344458873589143, + "learning_rate": 5.053428447714806e-06, + "loss": 0.2957, + "step": 8517 + }, + { + "epoch": 0.6748266983561101, + "grad_norm": 1.413463487704143, + "learning_rate": 5.05119850269211e-06, + "loss": 0.1639, + "step": 8518 + }, + { + "epoch": 0.6749059219647455, + "grad_norm": 1.55141391647744, + "learning_rate": 5.048968883524182e-06, + "loss": 0.1378, + "step": 8519 + }, + { + "epoch": 0.6749851455733809, + "grad_norm": 1.3625622993554278, + "learning_rate": 5.046739590357832e-06, + "loss": 0.1699, + "step": 8520 + }, + { + "epoch": 0.6750643691820163, + "grad_norm": 1.3445522321860088, + "learning_rate": 5.044510623339842e-06, + "loss": 0.1691, + "step": 8521 + }, + { + "epoch": 0.6751435927906516, + "grad_norm": 1.8681283864517408, + "learning_rate": 5.042281982616986e-06, + "loss": 0.3032, + "step": 8522 + }, + { + "epoch": 0.6752228163992869, + "grad_norm": 1.3255191787263767, + "learning_rate": 5.0400536683360064e-06, + "loss": 0.1072, + "step": 8523 + }, + { + "epoch": 0.6753020400079224, + "grad_norm": 1.8354589583669219, + "learning_rate": 5.037825680643624e-06, + "loss": 0.14, + "step": 8524 + }, + { + "epoch": 0.6753812636165577, + "grad_norm": 1.589199015619873, + "learning_rate": 5.035598019686549e-06, + "loss": 0.1979, + "step": 8525 + }, + { + "epoch": 0.6754604872251931, + "grad_norm": 1.3600072006946753, + "learning_rate": 5.033370685611456e-06, + "loss": 0.1658, + "step": 8526 + }, + { + "epoch": 0.6755397108338285, + "grad_norm": 1.7568432837746937, + "learning_rate": 5.031143678565005e-06, + "loss": 0.2276, + "step": 8527 + }, + { + "epoch": 0.6756189344424639, + "grad_norm": 1.5257621077406234, + "learning_rate": 5.028916998693831e-06, + "loss": 0.1879, + "step": 8528 + }, + { + "epoch": 0.6756981580510992, + "grad_norm": 1.4386596301967385, + "learning_rate": 5.02669064614456e-06, + "loss": 0.1382, + "step": 8529 + }, + { + "epoch": 0.6757773816597346, + "grad_norm": 1.8160107495596027, + "learning_rate": 5.024464621063773e-06, + "loss": 0.2048, + "step": 8530 + }, + { + "epoch": 0.67585660526837, + "grad_norm": 1.5467975569183061, + "learning_rate": 5.022238923598055e-06, + "loss": 0.1968, + "step": 8531 + }, + { + "epoch": 0.6759358288770053, + "grad_norm": 1.3747405622906388, + "learning_rate": 5.020013553893952e-06, + "loss": 0.1485, + "step": 8532 + }, + { + "epoch": 0.6760150524856408, + "grad_norm": 1.6200214007104616, + "learning_rate": 5.017788512097989e-06, + "loss": 0.2157, + "step": 8533 + }, + { + "epoch": 0.6760942760942761, + "grad_norm": 1.6361064107047978, + "learning_rate": 5.015563798356684e-06, + "loss": 0.2593, + "step": 8534 + }, + { + "epoch": 0.6761734997029115, + "grad_norm": 1.347862985898063, + "learning_rate": 5.0133394128165204e-06, + "loss": 0.1423, + "step": 8535 + }, + { + "epoch": 0.6762527233115468, + "grad_norm": 1.3549275562574086, + "learning_rate": 5.011115355623957e-06, + "loss": 0.1891, + "step": 8536 + }, + { + "epoch": 0.6763319469201822, + "grad_norm": 1.5234051982373102, + "learning_rate": 5.008891626925447e-06, + "loss": 0.1835, + "step": 8537 + }, + { + "epoch": 0.6764111705288176, + "grad_norm": 1.3754411236015436, + "learning_rate": 5.006668226867407e-06, + "loss": 0.1301, + "step": 8538 + }, + { + "epoch": 0.6764903941374529, + "grad_norm": 1.6860678976739774, + "learning_rate": 5.004445155596238e-06, + "loss": 0.1941, + "step": 8539 + }, + { + "epoch": 0.6765696177460884, + "grad_norm": 1.1545212715442095, + "learning_rate": 5.0022224132583154e-06, + "loss": 0.1302, + "step": 8540 + }, + { + "epoch": 0.6766488413547237, + "grad_norm": 1.2298888040285771, + "learning_rate": 5.000000000000003e-06, + "loss": 0.1129, + "step": 8541 + }, + { + "epoch": 0.6767280649633591, + "grad_norm": 1.324481207116287, + "learning_rate": 4.997777915967631e-06, + "loss": 0.1719, + "step": 8542 + }, + { + "epoch": 0.6768072885719945, + "grad_norm": 1.5778495961484131, + "learning_rate": 4.995556161307511e-06, + "loss": 0.2469, + "step": 8543 + }, + { + "epoch": 0.6768865121806298, + "grad_norm": 1.3587851286887267, + "learning_rate": 4.993334736165941e-06, + "loss": 0.1485, + "step": 8544 + }, + { + "epoch": 0.6769657357892652, + "grad_norm": 1.63471823791878, + "learning_rate": 4.991113640689189e-06, + "loss": 0.1794, + "step": 8545 + }, + { + "epoch": 0.6770449593979005, + "grad_norm": 1.6904132118066113, + "learning_rate": 4.988892875023499e-06, + "loss": 0.2388, + "step": 8546 + }, + { + "epoch": 0.677124183006536, + "grad_norm": 1.7335100884559311, + "learning_rate": 4.9866724393151044e-06, + "loss": 0.2165, + "step": 8547 + }, + { + "epoch": 0.6772034066151713, + "grad_norm": 1.2058727937874647, + "learning_rate": 4.984452333710207e-06, + "loss": 0.0942, + "step": 8548 + }, + { + "epoch": 0.6772826302238067, + "grad_norm": 1.5124110442235879, + "learning_rate": 4.982232558354986e-06, + "loss": 0.1408, + "step": 8549 + }, + { + "epoch": 0.6773618538324421, + "grad_norm": 1.282638828124292, + "learning_rate": 4.980013113395612e-06, + "loss": 0.133, + "step": 8550 + }, + { + "epoch": 0.6774410774410774, + "grad_norm": 1.5428026279784988, + "learning_rate": 4.9777939989782185e-06, + "loss": 0.2498, + "step": 8551 + }, + { + "epoch": 0.6775203010497128, + "grad_norm": 1.5973135123251054, + "learning_rate": 4.975575215248926e-06, + "loss": 0.2104, + "step": 8552 + }, + { + "epoch": 0.6775995246583482, + "grad_norm": 2.1667810637505824, + "learning_rate": 4.9733567623538245e-06, + "loss": 0.3219, + "step": 8553 + }, + { + "epoch": 0.6776787482669836, + "grad_norm": 1.40473252150783, + "learning_rate": 4.9711386404389995e-06, + "loss": 0.1391, + "step": 8554 + }, + { + "epoch": 0.6777579718756189, + "grad_norm": 2.131269419652134, + "learning_rate": 4.968920849650496e-06, + "loss": 0.2287, + "step": 8555 + }, + { + "epoch": 0.6778371954842544, + "grad_norm": 1.7496739998635797, + "learning_rate": 4.966703390134343e-06, + "loss": 0.1975, + "step": 8556 + }, + { + "epoch": 0.6779164190928897, + "grad_norm": 1.3402601225762008, + "learning_rate": 4.964486262036557e-06, + "loss": 0.1646, + "step": 8557 + }, + { + "epoch": 0.677995642701525, + "grad_norm": 1.5155336456082882, + "learning_rate": 4.962269465503121e-06, + "loss": 0.2325, + "step": 8558 + }, + { + "epoch": 0.6780748663101605, + "grad_norm": 1.596258719887241, + "learning_rate": 4.960053000679997e-06, + "loss": 0.2032, + "step": 8559 + }, + { + "epoch": 0.6781540899187958, + "grad_norm": 1.1478528809983357, + "learning_rate": 4.957836867713138e-06, + "loss": 0.1526, + "step": 8560 + }, + { + "epoch": 0.6782333135274312, + "grad_norm": 1.258742314957348, + "learning_rate": 4.955621066748457e-06, + "loss": 0.1128, + "step": 8561 + }, + { + "epoch": 0.6783125371360665, + "grad_norm": 1.2923005314200162, + "learning_rate": 4.953405597931854e-06, + "loss": 0.2014, + "step": 8562 + }, + { + "epoch": 0.6783917607447019, + "grad_norm": 1.540768290421965, + "learning_rate": 4.951190461409214e-06, + "loss": 0.2474, + "step": 8563 + }, + { + "epoch": 0.6784709843533373, + "grad_norm": 1.0842009487271456, + "learning_rate": 4.948975657326388e-06, + "loss": 0.1095, + "step": 8564 + }, + { + "epoch": 0.6785502079619726, + "grad_norm": 1.4855075907221102, + "learning_rate": 4.946761185829208e-06, + "loss": 0.197, + "step": 8565 + }, + { + "epoch": 0.6786294315706081, + "grad_norm": 1.4319057122684191, + "learning_rate": 4.944547047063493e-06, + "loss": 0.152, + "step": 8566 + }, + { + "epoch": 0.6787086551792434, + "grad_norm": 1.3974249544087434, + "learning_rate": 4.942333241175029e-06, + "loss": 0.2162, + "step": 8567 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 1.4998666889143208, + "learning_rate": 4.940119768309585e-06, + "loss": 0.2442, + "step": 8568 + }, + { + "epoch": 0.6788671023965142, + "grad_norm": 1.8821341372241676, + "learning_rate": 4.937906628612905e-06, + "loss": 0.2019, + "step": 8569 + }, + { + "epoch": 0.6789463260051495, + "grad_norm": 1.2868391946904214, + "learning_rate": 4.93569382223072e-06, + "loss": 0.1351, + "step": 8570 + }, + { + "epoch": 0.6790255496137849, + "grad_norm": 1.367902006728048, + "learning_rate": 4.933481349308728e-06, + "loss": 0.115, + "step": 8571 + }, + { + "epoch": 0.6791047732224202, + "grad_norm": 1.385837211257772, + "learning_rate": 4.931269209992607e-06, + "loss": 0.1499, + "step": 8572 + }, + { + "epoch": 0.6791839968310557, + "grad_norm": 1.7607382267167446, + "learning_rate": 4.929057404428023e-06, + "loss": 0.1897, + "step": 8573 + }, + { + "epoch": 0.679263220439691, + "grad_norm": 1.4065539419935775, + "learning_rate": 4.926845932760609e-06, + "loss": 0.1896, + "step": 8574 + }, + { + "epoch": 0.6793424440483264, + "grad_norm": 1.7514918908737478, + "learning_rate": 4.924634795135976e-06, + "loss": 0.2262, + "step": 8575 + }, + { + "epoch": 0.6794216676569618, + "grad_norm": 1.2586453017349515, + "learning_rate": 4.922423991699725e-06, + "loss": 0.1622, + "step": 8576 + }, + { + "epoch": 0.6795008912655971, + "grad_norm": 1.3165998443886482, + "learning_rate": 4.920213522597422e-06, + "loss": 0.1308, + "step": 8577 + }, + { + "epoch": 0.6795801148742325, + "grad_norm": 1.7436076082684013, + "learning_rate": 4.918003387974614e-06, + "loss": 0.1847, + "step": 8578 + }, + { + "epoch": 0.6796593384828679, + "grad_norm": 1.5991089520432948, + "learning_rate": 4.915793587976832e-06, + "loss": 0.1879, + "step": 8579 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 1.2925535099600607, + "learning_rate": 4.913584122749578e-06, + "loss": 0.1473, + "step": 8580 + }, + { + "epoch": 0.6798177857001386, + "grad_norm": 1.4197447927759301, + "learning_rate": 4.911374992438334e-06, + "loss": 0.1463, + "step": 8581 + }, + { + "epoch": 0.6798970093087741, + "grad_norm": 1.6233574438733154, + "learning_rate": 4.909166197188563e-06, + "loss": 0.1858, + "step": 8582 + }, + { + "epoch": 0.6799762329174094, + "grad_norm": 1.845523476573834, + "learning_rate": 4.906957737145703e-06, + "loss": 0.182, + "step": 8583 + }, + { + "epoch": 0.6800554565260447, + "grad_norm": 1.6731709411467286, + "learning_rate": 4.904749612455171e-06, + "loss": 0.2157, + "step": 8584 + }, + { + "epoch": 0.6801346801346801, + "grad_norm": 1.641858619300692, + "learning_rate": 4.902541823262356e-06, + "loss": 0.161, + "step": 8585 + }, + { + "epoch": 0.6802139037433155, + "grad_norm": 1.8308752657918839, + "learning_rate": 4.900334369712637e-06, + "loss": 0.2199, + "step": 8586 + }, + { + "epoch": 0.6802931273519509, + "grad_norm": 1.5417994791697767, + "learning_rate": 4.898127251951363e-06, + "loss": 0.2204, + "step": 8587 + }, + { + "epoch": 0.6803723509605862, + "grad_norm": 1.447278852663485, + "learning_rate": 4.895920470123857e-06, + "loss": 0.1726, + "step": 8588 + }, + { + "epoch": 0.6804515745692217, + "grad_norm": 1.6376487142919485, + "learning_rate": 4.893714024375432e-06, + "loss": 0.1854, + "step": 8589 + }, + { + "epoch": 0.680530798177857, + "grad_norm": 1.3599365330770072, + "learning_rate": 4.89150791485137e-06, + "loss": 0.1419, + "step": 8590 + }, + { + "epoch": 0.6806100217864923, + "grad_norm": 1.5969374573522095, + "learning_rate": 4.889302141696925e-06, + "loss": 0.1779, + "step": 8591 + }, + { + "epoch": 0.6806892453951278, + "grad_norm": 1.3955223042255005, + "learning_rate": 4.88709670505735e-06, + "loss": 0.1384, + "step": 8592 + }, + { + "epoch": 0.6807684690037631, + "grad_norm": 1.6557109219727237, + "learning_rate": 4.884891605077853e-06, + "loss": 0.1707, + "step": 8593 + }, + { + "epoch": 0.6808476926123985, + "grad_norm": 1.6475801155131908, + "learning_rate": 4.882686841903627e-06, + "loss": 0.212, + "step": 8594 + }, + { + "epoch": 0.6809269162210339, + "grad_norm": 1.503630840263843, + "learning_rate": 4.8804824156798544e-06, + "loss": 0.1634, + "step": 8595 + }, + { + "epoch": 0.6810061398296693, + "grad_norm": 1.5612340649170864, + "learning_rate": 4.878278326551682e-06, + "loss": 0.1869, + "step": 8596 + }, + { + "epoch": 0.6810853634383046, + "grad_norm": 1.2960560024780041, + "learning_rate": 4.876074574664232e-06, + "loss": 0.1259, + "step": 8597 + }, + { + "epoch": 0.6811645870469399, + "grad_norm": 1.393538759623158, + "learning_rate": 4.873871160162622e-06, + "loss": 0.158, + "step": 8598 + }, + { + "epoch": 0.6812438106555754, + "grad_norm": 1.6361564763009502, + "learning_rate": 4.871668083191931e-06, + "loss": 0.1977, + "step": 8599 + }, + { + "epoch": 0.6813230342642107, + "grad_norm": 1.9741867703390905, + "learning_rate": 4.8694653438972195e-06, + "loss": 0.1673, + "step": 8600 + }, + { + "epoch": 0.6814022578728461, + "grad_norm": 1.4387389722285093, + "learning_rate": 4.867262942423525e-06, + "loss": 0.161, + "step": 8601 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 1.701930408154907, + "learning_rate": 4.865060878915873e-06, + "loss": 0.1642, + "step": 8602 + }, + { + "epoch": 0.6815607050901169, + "grad_norm": 2.0021225121916526, + "learning_rate": 4.862859153519252e-06, + "loss": 0.2728, + "step": 8603 + }, + { + "epoch": 0.6816399286987522, + "grad_norm": 1.4913843276190428, + "learning_rate": 4.860657766378637e-06, + "loss": 0.1853, + "step": 8604 + }, + { + "epoch": 0.6817191523073876, + "grad_norm": 1.1807206607478606, + "learning_rate": 4.858456717638981e-06, + "loss": 0.1164, + "step": 8605 + }, + { + "epoch": 0.681798375916023, + "grad_norm": 1.3498192464577399, + "learning_rate": 4.856256007445211e-06, + "loss": 0.1281, + "step": 8606 + }, + { + "epoch": 0.6818775995246583, + "grad_norm": 1.2528592144148911, + "learning_rate": 4.8540556359422335e-06, + "loss": 0.1629, + "step": 8607 + }, + { + "epoch": 0.6819568231332938, + "grad_norm": 1.1578562032052755, + "learning_rate": 4.85185560327493e-06, + "loss": 0.143, + "step": 8608 + }, + { + "epoch": 0.6820360467419291, + "grad_norm": 1.5195694080293936, + "learning_rate": 4.849655909588165e-06, + "loss": 0.1908, + "step": 8609 + }, + { + "epoch": 0.6821152703505645, + "grad_norm": 1.838130817530145, + "learning_rate": 4.847456555026773e-06, + "loss": 0.2922, + "step": 8610 + }, + { + "epoch": 0.6821944939591998, + "grad_norm": 1.3583243790507356, + "learning_rate": 4.845257539735577e-06, + "loss": 0.1323, + "step": 8611 + }, + { + "epoch": 0.6822737175678352, + "grad_norm": 1.9146706970827245, + "learning_rate": 4.843058863859369e-06, + "loss": 0.2281, + "step": 8612 + }, + { + "epoch": 0.6823529411764706, + "grad_norm": 1.2631825782674775, + "learning_rate": 4.840860527542919e-06, + "loss": 0.1454, + "step": 8613 + }, + { + "epoch": 0.6824321647851059, + "grad_norm": 1.5780932205699636, + "learning_rate": 4.838662530930981e-06, + "loss": 0.1387, + "step": 8614 + }, + { + "epoch": 0.6825113883937414, + "grad_norm": 1.3333092449744257, + "learning_rate": 4.836464874168282e-06, + "loss": 0.1836, + "step": 8615 + }, + { + "epoch": 0.6825906120023767, + "grad_norm": 1.769202557702352, + "learning_rate": 4.834267557399521e-06, + "loss": 0.1831, + "step": 8616 + }, + { + "epoch": 0.6826698356110121, + "grad_norm": 1.346387300277102, + "learning_rate": 4.832070580769389e-06, + "loss": 0.1849, + "step": 8617 + }, + { + "epoch": 0.6827490592196475, + "grad_norm": 1.4092969389998287, + "learning_rate": 4.829873944422544e-06, + "loss": 0.1446, + "step": 8618 + }, + { + "epoch": 0.6828282828282828, + "grad_norm": 1.7339774517482605, + "learning_rate": 4.8276776485036185e-06, + "loss": 0.2278, + "step": 8619 + }, + { + "epoch": 0.6829075064369182, + "grad_norm": 1.6955699006646225, + "learning_rate": 4.825481693157235e-06, + "loss": 0.1879, + "step": 8620 + }, + { + "epoch": 0.6829867300455535, + "grad_norm": 1.3629165159761114, + "learning_rate": 4.823286078527984e-06, + "loss": 0.1189, + "step": 8621 + }, + { + "epoch": 0.683065953654189, + "grad_norm": 2.206024746466242, + "learning_rate": 4.8210908047604336e-06, + "loss": 0.2435, + "step": 8622 + }, + { + "epoch": 0.6831451772628243, + "grad_norm": 1.650706399093415, + "learning_rate": 4.818895871999136e-06, + "loss": 0.1769, + "step": 8623 + }, + { + "epoch": 0.6832244008714597, + "grad_norm": 1.4927354432691475, + "learning_rate": 4.816701280388617e-06, + "loss": 0.1381, + "step": 8624 + }, + { + "epoch": 0.6833036244800951, + "grad_norm": 1.342977216804004, + "learning_rate": 4.814507030073377e-06, + "loss": 0.1258, + "step": 8625 + }, + { + "epoch": 0.6833828480887304, + "grad_norm": 1.8638109438553099, + "learning_rate": 4.812313121197896e-06, + "loss": 0.2157, + "step": 8626 + }, + { + "epoch": 0.6834620716973658, + "grad_norm": 1.983207606433488, + "learning_rate": 4.810119553906637e-06, + "loss": 0.2239, + "step": 8627 + }, + { + "epoch": 0.6835412953060012, + "grad_norm": 1.5859256273035767, + "learning_rate": 4.807926328344033e-06, + "loss": 0.1384, + "step": 8628 + }, + { + "epoch": 0.6836205189146366, + "grad_norm": 1.5810360359678104, + "learning_rate": 4.805733444654496e-06, + "loss": 0.1854, + "step": 8629 + }, + { + "epoch": 0.6836997425232719, + "grad_norm": 1.4085303112832175, + "learning_rate": 4.8035409029824195e-06, + "loss": 0.168, + "step": 8630 + }, + { + "epoch": 0.6837789661319074, + "grad_norm": 1.4163183407444602, + "learning_rate": 4.801348703472173e-06, + "loss": 0.1802, + "step": 8631 + }, + { + "epoch": 0.6838581897405427, + "grad_norm": 2.0585092565426057, + "learning_rate": 4.7991568462680945e-06, + "loss": 0.1672, + "step": 8632 + }, + { + "epoch": 0.683937413349178, + "grad_norm": 1.2640581576220085, + "learning_rate": 4.796965331514517e-06, + "loss": 0.1103, + "step": 8633 + }, + { + "epoch": 0.6840166369578135, + "grad_norm": 1.333843195416572, + "learning_rate": 4.794774159355737e-06, + "loss": 0.1165, + "step": 8634 + }, + { + "epoch": 0.6840958605664488, + "grad_norm": 1.4288040296992421, + "learning_rate": 4.79258332993603e-06, + "loss": 0.2208, + "step": 8635 + }, + { + "epoch": 0.6841750841750842, + "grad_norm": 1.3548945775697205, + "learning_rate": 4.7903928433996576e-06, + "loss": 0.1256, + "step": 8636 + }, + { + "epoch": 0.6842543077837195, + "grad_norm": 1.3128577428498032, + "learning_rate": 4.788202699890848e-06, + "loss": 0.1582, + "step": 8637 + }, + { + "epoch": 0.684333531392355, + "grad_norm": 2.064313624811447, + "learning_rate": 4.786012899553815e-06, + "loss": 0.2567, + "step": 8638 + }, + { + "epoch": 0.6844127550009903, + "grad_norm": 1.3536405598336756, + "learning_rate": 4.783823442532739e-06, + "loss": 0.2065, + "step": 8639 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 1.4525240588117532, + "learning_rate": 4.781634328971796e-06, + "loss": 0.1703, + "step": 8640 + }, + { + "epoch": 0.6845712022182611, + "grad_norm": 1.5287689114225305, + "learning_rate": 4.779445559015122e-06, + "loss": 0.1837, + "step": 8641 + }, + { + "epoch": 0.6846504258268964, + "grad_norm": 1.9253531908599657, + "learning_rate": 4.777257132806835e-06, + "loss": 0.1983, + "step": 8642 + }, + { + "epoch": 0.6847296494355318, + "grad_norm": 1.3591657661896646, + "learning_rate": 4.775069050491039e-06, + "loss": 0.1243, + "step": 8643 + }, + { + "epoch": 0.6848088730441672, + "grad_norm": 1.519596700431314, + "learning_rate": 4.772881312211805e-06, + "loss": 0.2302, + "step": 8644 + }, + { + "epoch": 0.6848880966528025, + "grad_norm": 1.5135885297213851, + "learning_rate": 4.770693918113183e-06, + "loss": 0.1793, + "step": 8645 + }, + { + "epoch": 0.6849673202614379, + "grad_norm": 1.1440790841583879, + "learning_rate": 4.768506868339206e-06, + "loss": 0.1275, + "step": 8646 + }, + { + "epoch": 0.6850465438700732, + "grad_norm": 1.7197533491395491, + "learning_rate": 4.766320163033882e-06, + "loss": 0.2059, + "step": 8647 + }, + { + "epoch": 0.6851257674787087, + "grad_norm": 1.2826680838873479, + "learning_rate": 4.764133802341188e-06, + "loss": 0.1473, + "step": 8648 + }, + { + "epoch": 0.685204991087344, + "grad_norm": 1.5301814973492749, + "learning_rate": 4.761947786405092e-06, + "loss": 0.248, + "step": 8649 + }, + { + "epoch": 0.6852842146959794, + "grad_norm": 1.4016813442635385, + "learning_rate": 4.759762115369531e-06, + "loss": 0.1664, + "step": 8650 + }, + { + "epoch": 0.6853634383046148, + "grad_norm": 1.671388275849299, + "learning_rate": 4.7575767893784174e-06, + "loss": 0.1807, + "step": 8651 + }, + { + "epoch": 0.6854426619132501, + "grad_norm": 1.3489819520227455, + "learning_rate": 4.755391808575651e-06, + "loss": 0.143, + "step": 8652 + }, + { + "epoch": 0.6855218855218855, + "grad_norm": 1.9152517469065016, + "learning_rate": 4.7532071731050975e-06, + "loss": 0.2727, + "step": 8653 + }, + { + "epoch": 0.6856011091305209, + "grad_norm": 1.24174475158412, + "learning_rate": 4.7510228831106064e-06, + "loss": 0.134, + "step": 8654 + }, + { + "epoch": 0.6856803327391563, + "grad_norm": 1.6081747422509234, + "learning_rate": 4.748838938735999e-06, + "loss": 0.2238, + "step": 8655 + }, + { + "epoch": 0.6857595563477916, + "grad_norm": 1.2428472170982199, + "learning_rate": 4.746655340125082e-06, + "loss": 0.1564, + "step": 8656 + }, + { + "epoch": 0.6858387799564271, + "grad_norm": 1.6532301274027756, + "learning_rate": 4.744472087421635e-06, + "loss": 0.1467, + "step": 8657 + }, + { + "epoch": 0.6859180035650624, + "grad_norm": 1.2173960770815595, + "learning_rate": 4.74228918076941e-06, + "loss": 0.1144, + "step": 8658 + }, + { + "epoch": 0.6859972271736977, + "grad_norm": 1.4581783599010332, + "learning_rate": 4.740106620312147e-06, + "loss": 0.1811, + "step": 8659 + }, + { + "epoch": 0.6860764507823331, + "grad_norm": 1.4110417666553414, + "learning_rate": 4.737924406193554e-06, + "loss": 0.1634, + "step": 8660 + }, + { + "epoch": 0.6861556743909685, + "grad_norm": 1.3954298236239187, + "learning_rate": 4.735742538557316e-06, + "loss": 0.1551, + "step": 8661 + }, + { + "epoch": 0.6862348979996039, + "grad_norm": 1.7224920943508635, + "learning_rate": 4.733561017547104e-06, + "loss": 0.2154, + "step": 8662 + }, + { + "epoch": 0.6863141216082392, + "grad_norm": 1.4352759584573156, + "learning_rate": 4.73137984330656e-06, + "loss": 0.1982, + "step": 8663 + }, + { + "epoch": 0.6863933452168747, + "grad_norm": 1.1239504535986455, + "learning_rate": 4.729199015979298e-06, + "loss": 0.1372, + "step": 8664 + }, + { + "epoch": 0.68647256882551, + "grad_norm": 1.2280103386989374, + "learning_rate": 4.727018535708922e-06, + "loss": 0.1188, + "step": 8665 + }, + { + "epoch": 0.6865517924341453, + "grad_norm": 1.8473294865599927, + "learning_rate": 4.724838402639006e-06, + "loss": 0.2499, + "step": 8666 + }, + { + "epoch": 0.6866310160427808, + "grad_norm": 1.1661714225758504, + "learning_rate": 4.7226586169130925e-06, + "loss": 0.1323, + "step": 8667 + }, + { + "epoch": 0.6867102396514161, + "grad_norm": 1.5509361208713552, + "learning_rate": 4.7204791786747215e-06, + "loss": 0.1886, + "step": 8668 + }, + { + "epoch": 0.6867894632600515, + "grad_norm": 1.8026075803806696, + "learning_rate": 4.718300088067392e-06, + "loss": 0.1883, + "step": 8669 + }, + { + "epoch": 0.6868686868686869, + "grad_norm": 1.532212530360673, + "learning_rate": 4.716121345234589e-06, + "loss": 0.1784, + "step": 8670 + }, + { + "epoch": 0.6869479104773223, + "grad_norm": 1.7311533499141019, + "learning_rate": 4.713942950319767e-06, + "loss": 0.177, + "step": 8671 + }, + { + "epoch": 0.6870271340859576, + "grad_norm": 1.6446493445822377, + "learning_rate": 4.71176490346637e-06, + "loss": 0.1756, + "step": 8672 + }, + { + "epoch": 0.6871063576945929, + "grad_norm": 1.1700753689368821, + "learning_rate": 4.709587204817809e-06, + "loss": 0.0979, + "step": 8673 + }, + { + "epoch": 0.6871855813032284, + "grad_norm": 1.371687825816755, + "learning_rate": 4.707409854517471e-06, + "loss": 0.181, + "step": 8674 + }, + { + "epoch": 0.6872648049118637, + "grad_norm": 1.788528039065579, + "learning_rate": 4.705232852708732e-06, + "loss": 0.2303, + "step": 8675 + }, + { + "epoch": 0.6873440285204991, + "grad_norm": 1.6632465599959885, + "learning_rate": 4.703056199534933e-06, + "loss": 0.1777, + "step": 8676 + }, + { + "epoch": 0.6874232521291345, + "grad_norm": 1.1073877635271137, + "learning_rate": 4.700879895139391e-06, + "loss": 0.1267, + "step": 8677 + }, + { + "epoch": 0.6875024757377699, + "grad_norm": 1.5096000596143704, + "learning_rate": 4.698703939665414e-06, + "loss": 0.1709, + "step": 8678 + }, + { + "epoch": 0.6875816993464052, + "grad_norm": 1.5486701488331398, + "learning_rate": 4.696528333256275e-06, + "loss": 0.1769, + "step": 8679 + }, + { + "epoch": 0.6876609229550406, + "grad_norm": 1.5936474109007752, + "learning_rate": 4.694353076055222e-06, + "loss": 0.1784, + "step": 8680 + }, + { + "epoch": 0.687740146563676, + "grad_norm": 1.1665492738772993, + "learning_rate": 4.6921781682054954e-06, + "loss": 0.1231, + "step": 8681 + }, + { + "epoch": 0.6878193701723113, + "grad_norm": 1.2100819032032752, + "learning_rate": 4.6900036098502956e-06, + "loss": 0.1366, + "step": 8682 + }, + { + "epoch": 0.6878985937809468, + "grad_norm": 1.422333271504052, + "learning_rate": 4.687829401132804e-06, + "loss": 0.1838, + "step": 8683 + }, + { + "epoch": 0.6879778173895821, + "grad_norm": 0.9749413880131753, + "learning_rate": 4.685655542196194e-06, + "loss": 0.1383, + "step": 8684 + }, + { + "epoch": 0.6880570409982175, + "grad_norm": 1.3402066322109363, + "learning_rate": 4.6834820331835915e-06, + "loss": 0.1824, + "step": 8685 + }, + { + "epoch": 0.6881362646068528, + "grad_norm": 0.9367352461671669, + "learning_rate": 4.681308874238112e-06, + "loss": 0.0655, + "step": 8686 + }, + { + "epoch": 0.6882154882154882, + "grad_norm": 1.6035176560395976, + "learning_rate": 4.679136065502855e-06, + "loss": 0.2201, + "step": 8687 + }, + { + "epoch": 0.6882947118241236, + "grad_norm": 1.6486245767304224, + "learning_rate": 4.676963607120886e-06, + "loss": 0.2086, + "step": 8688 + }, + { + "epoch": 0.6883739354327589, + "grad_norm": 1.6928345935177183, + "learning_rate": 4.674791499235246e-06, + "loss": 0.2289, + "step": 8689 + }, + { + "epoch": 0.6884531590413944, + "grad_norm": 1.5480049299610399, + "learning_rate": 4.672619741988966e-06, + "loss": 0.1614, + "step": 8690 + }, + { + "epoch": 0.6885323826500297, + "grad_norm": 1.257892792112472, + "learning_rate": 4.670448335525043e-06, + "loss": 0.1631, + "step": 8691 + }, + { + "epoch": 0.6886116062586651, + "grad_norm": 1.1952894179582885, + "learning_rate": 4.66827727998645e-06, + "loss": 0.1308, + "step": 8692 + }, + { + "epoch": 0.6886908298673005, + "grad_norm": 1.971716570449774, + "learning_rate": 4.666106575516146e-06, + "loss": 0.3032, + "step": 8693 + }, + { + "epoch": 0.6887700534759358, + "grad_norm": 1.4230007737961512, + "learning_rate": 4.663936222257059e-06, + "loss": 0.1155, + "step": 8694 + }, + { + "epoch": 0.6888492770845712, + "grad_norm": 1.874785931223803, + "learning_rate": 4.661766220352098e-06, + "loss": 0.2348, + "step": 8695 + }, + { + "epoch": 0.6889285006932065, + "grad_norm": 1.4325628356186775, + "learning_rate": 4.659596569944139e-06, + "loss": 0.1523, + "step": 8696 + }, + { + "epoch": 0.689007724301842, + "grad_norm": 1.325834981926651, + "learning_rate": 4.657427271176055e-06, + "loss": 0.1668, + "step": 8697 + }, + { + "epoch": 0.6890869479104773, + "grad_norm": 2.0312528835403527, + "learning_rate": 4.655258324190678e-06, + "loss": 0.353, + "step": 8698 + }, + { + "epoch": 0.6891661715191127, + "grad_norm": 1.6178334823322276, + "learning_rate": 4.65308972913082e-06, + "loss": 0.2728, + "step": 8699 + }, + { + "epoch": 0.6892453951277481, + "grad_norm": 1.3541334824730633, + "learning_rate": 4.6509214861392785e-06, + "loss": 0.205, + "step": 8700 + }, + { + "epoch": 0.6893246187363834, + "grad_norm": 1.4349157628828468, + "learning_rate": 4.648753595358818e-06, + "loss": 0.2138, + "step": 8701 + }, + { + "epoch": 0.6894038423450188, + "grad_norm": 1.432057521051746, + "learning_rate": 4.646586056932183e-06, + "loss": 0.1773, + "step": 8702 + }, + { + "epoch": 0.6894830659536542, + "grad_norm": 1.4339160875258519, + "learning_rate": 4.6444188710021e-06, + "loss": 0.2104, + "step": 8703 + }, + { + "epoch": 0.6895622895622896, + "grad_norm": 1.7248934291141744, + "learning_rate": 4.6422520377112646e-06, + "loss": 0.1958, + "step": 8704 + }, + { + "epoch": 0.6896415131709249, + "grad_norm": 1.3388525654957464, + "learning_rate": 4.640085557202349e-06, + "loss": 0.1354, + "step": 8705 + }, + { + "epoch": 0.6897207367795604, + "grad_norm": 1.581499144453623, + "learning_rate": 4.637919429618014e-06, + "loss": 0.1607, + "step": 8706 + }, + { + "epoch": 0.6897999603881957, + "grad_norm": 0.8982538474255346, + "learning_rate": 4.635753655100883e-06, + "loss": 0.1012, + "step": 8707 + }, + { + "epoch": 0.689879183996831, + "grad_norm": 1.4749880481329436, + "learning_rate": 4.633588233793559e-06, + "loss": 0.1473, + "step": 8708 + }, + { + "epoch": 0.6899584076054665, + "grad_norm": 1.1470822206467024, + "learning_rate": 4.631423165838632e-06, + "loss": 0.0726, + "step": 8709 + }, + { + "epoch": 0.6900376312141018, + "grad_norm": 1.6982771812494444, + "learning_rate": 4.629258451378658e-06, + "loss": 0.2228, + "step": 8710 + }, + { + "epoch": 0.6901168548227372, + "grad_norm": 1.9688465047513715, + "learning_rate": 4.6270940905561725e-06, + "loss": 0.214, + "step": 8711 + }, + { + "epoch": 0.6901960784313725, + "grad_norm": 1.3314539680265636, + "learning_rate": 4.624930083513684e-06, + "loss": 0.109, + "step": 8712 + }, + { + "epoch": 0.690275302040008, + "grad_norm": 1.3986515785934175, + "learning_rate": 4.62276643039369e-06, + "loss": 0.2048, + "step": 8713 + }, + { + "epoch": 0.6903545256486433, + "grad_norm": 1.5742571587693621, + "learning_rate": 4.620603131338655e-06, + "loss": 0.1813, + "step": 8714 + }, + { + "epoch": 0.6904337492572786, + "grad_norm": 1.7884500502894105, + "learning_rate": 4.6184401864910136e-06, + "loss": 0.1735, + "step": 8715 + }, + { + "epoch": 0.6905129728659141, + "grad_norm": 1.660343164917738, + "learning_rate": 4.616277595993196e-06, + "loss": 0.1885, + "step": 8716 + }, + { + "epoch": 0.6905921964745494, + "grad_norm": 1.4109575409332926, + "learning_rate": 4.614115359987595e-06, + "loss": 0.1306, + "step": 8717 + }, + { + "epoch": 0.6906714200831848, + "grad_norm": 1.7199965159552877, + "learning_rate": 4.6119534786165765e-06, + "loss": 0.1405, + "step": 8718 + }, + { + "epoch": 0.6907506436918202, + "grad_norm": 1.5895875125245018, + "learning_rate": 4.609791952022501e-06, + "loss": 0.1988, + "step": 8719 + }, + { + "epoch": 0.6908298673004556, + "grad_norm": 1.8749744340682035, + "learning_rate": 4.607630780347689e-06, + "loss": 0.2327, + "step": 8720 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 1.7836794920850219, + "learning_rate": 4.60546996373444e-06, + "loss": 0.1699, + "step": 8721 + }, + { + "epoch": 0.6909883145177262, + "grad_norm": 1.7214650559801976, + "learning_rate": 4.603309502325041e-06, + "loss": 0.2047, + "step": 8722 + }, + { + "epoch": 0.6910675381263617, + "grad_norm": 1.8908823606983562, + "learning_rate": 4.601149396261744e-06, + "loss": 0.2338, + "step": 8723 + }, + { + "epoch": 0.691146761734997, + "grad_norm": 1.4957853564365362, + "learning_rate": 4.598989645686782e-06, + "loss": 0.098, + "step": 8724 + }, + { + "epoch": 0.6912259853436324, + "grad_norm": 1.7823556390883566, + "learning_rate": 4.596830250742359e-06, + "loss": 0.2294, + "step": 8725 + }, + { + "epoch": 0.6913052089522678, + "grad_norm": 1.2385164046684465, + "learning_rate": 4.594671211570671e-06, + "loss": 0.1321, + "step": 8726 + }, + { + "epoch": 0.6913844325609031, + "grad_norm": 1.828158162834079, + "learning_rate": 4.592512528313874e-06, + "loss": 0.1689, + "step": 8727 + }, + { + "epoch": 0.6914636561695385, + "grad_norm": 1.7926999264677321, + "learning_rate": 4.590354201114103e-06, + "loss": 0.2378, + "step": 8728 + }, + { + "epoch": 0.6915428797781739, + "grad_norm": 1.610186300693577, + "learning_rate": 4.588196230113483e-06, + "loss": 0.2109, + "step": 8729 + }, + { + "epoch": 0.6916221033868093, + "grad_norm": 1.5223618774951608, + "learning_rate": 4.586038615454102e-06, + "loss": 0.1677, + "step": 8730 + }, + { + "epoch": 0.6917013269954446, + "grad_norm": 1.3886398562031925, + "learning_rate": 4.583881357278023e-06, + "loss": 0.1972, + "step": 8731 + }, + { + "epoch": 0.6917805506040801, + "grad_norm": 1.7596511414833698, + "learning_rate": 4.5817244557273e-06, + "loss": 0.1948, + "step": 8732 + }, + { + "epoch": 0.6918597742127154, + "grad_norm": 1.1345967722354966, + "learning_rate": 4.5795679109439505e-06, + "loss": 0.1432, + "step": 8733 + }, + { + "epoch": 0.6919389978213507, + "grad_norm": 1.8069484754960743, + "learning_rate": 4.57741172306997e-06, + "loss": 0.2403, + "step": 8734 + }, + { + "epoch": 0.6920182214299861, + "grad_norm": 1.7172458869331075, + "learning_rate": 4.5752558922473376e-06, + "loss": 0.2964, + "step": 8735 + }, + { + "epoch": 0.6920974450386215, + "grad_norm": 1.631047794247156, + "learning_rate": 4.573100418618004e-06, + "loss": 0.1682, + "step": 8736 + }, + { + "epoch": 0.6921766686472569, + "grad_norm": 1.8455462208712523, + "learning_rate": 4.57094530232389e-06, + "loss": 0.2753, + "step": 8737 + }, + { + "epoch": 0.6922558922558922, + "grad_norm": 1.1787348654968008, + "learning_rate": 4.5687905435069106e-06, + "loss": 0.1247, + "step": 8738 + }, + { + "epoch": 0.6923351158645277, + "grad_norm": 2.195912253818307, + "learning_rate": 4.566636142308939e-06, + "loss": 0.1579, + "step": 8739 + }, + { + "epoch": 0.692414339473163, + "grad_norm": 3.091964749142207, + "learning_rate": 4.564482098871834e-06, + "loss": 0.2239, + "step": 8740 + }, + { + "epoch": 0.6924935630817983, + "grad_norm": 1.3567211670479813, + "learning_rate": 4.562328413337426e-06, + "loss": 0.1428, + "step": 8741 + }, + { + "epoch": 0.6925727866904338, + "grad_norm": 1.080283629361243, + "learning_rate": 4.56017508584753e-06, + "loss": 0.1262, + "step": 8742 + }, + { + "epoch": 0.6926520102990691, + "grad_norm": 1.606299752985764, + "learning_rate": 4.558022116543931e-06, + "loss": 0.1625, + "step": 8743 + }, + { + "epoch": 0.6927312339077045, + "grad_norm": 1.2212029780701115, + "learning_rate": 4.555869505568386e-06, + "loss": 0.1477, + "step": 8744 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 1.5689616085354774, + "learning_rate": 4.553717253062643e-06, + "loss": 0.2156, + "step": 8745 + }, + { + "epoch": 0.6928896811249753, + "grad_norm": 1.3995143891766861, + "learning_rate": 4.551565359168411e-06, + "loss": 0.1878, + "step": 8746 + }, + { + "epoch": 0.6929689047336106, + "grad_norm": 1.3027589775296664, + "learning_rate": 4.549413824027382e-06, + "loss": 0.1739, + "step": 8747 + }, + { + "epoch": 0.6930481283422459, + "grad_norm": 1.5205078437727644, + "learning_rate": 4.54726264778123e-06, + "loss": 0.2141, + "step": 8748 + }, + { + "epoch": 0.6931273519508814, + "grad_norm": 1.7471575973709945, + "learning_rate": 4.5451118305715954e-06, + "loss": 0.201, + "step": 8749 + }, + { + "epoch": 0.6932065755595167, + "grad_norm": 1.6023460534782106, + "learning_rate": 4.542961372540096e-06, + "loss": 0.1829, + "step": 8750 + }, + { + "epoch": 0.6932857991681521, + "grad_norm": 1.6900977030189235, + "learning_rate": 4.540811273828336e-06, + "loss": 0.1763, + "step": 8751 + }, + { + "epoch": 0.6933650227767875, + "grad_norm": 1.2231750768791714, + "learning_rate": 4.538661534577886e-06, + "loss": 0.1514, + "step": 8752 + }, + { + "epoch": 0.6934442463854229, + "grad_norm": 1.455701378263639, + "learning_rate": 4.5365121549302916e-06, + "loss": 0.1377, + "step": 8753 + }, + { + "epoch": 0.6935234699940582, + "grad_norm": 1.5172526079868556, + "learning_rate": 4.534363135027086e-06, + "loss": 0.1059, + "step": 8754 + }, + { + "epoch": 0.6936026936026936, + "grad_norm": 1.4068100376808945, + "learning_rate": 4.532214475009771e-06, + "loss": 0.1586, + "step": 8755 + }, + { + "epoch": 0.693681917211329, + "grad_norm": 1.510338022782643, + "learning_rate": 4.530066175019823e-06, + "loss": 0.1976, + "step": 8756 + }, + { + "epoch": 0.6937611408199643, + "grad_norm": 1.3681933467887286, + "learning_rate": 4.527918235198692e-06, + "loss": 0.1736, + "step": 8757 + }, + { + "epoch": 0.6938403644285998, + "grad_norm": 1.5002105724287578, + "learning_rate": 4.525770655687821e-06, + "loss": 0.1559, + "step": 8758 + }, + { + "epoch": 0.6939195880372351, + "grad_norm": 1.6395539335526241, + "learning_rate": 4.523623436628611e-06, + "loss": 0.1948, + "step": 8759 + }, + { + "epoch": 0.6939988116458705, + "grad_norm": 1.4844249947243022, + "learning_rate": 4.521476578162445e-06, + "loss": 0.1707, + "step": 8760 + }, + { + "epoch": 0.6940780352545058, + "grad_norm": 1.660084412834462, + "learning_rate": 4.519330080430687e-06, + "loss": 0.1619, + "step": 8761 + }, + { + "epoch": 0.6941572588631412, + "grad_norm": 1.5930209903055617, + "learning_rate": 4.517183943574673e-06, + "loss": 0.1126, + "step": 8762 + }, + { + "epoch": 0.6942364824717766, + "grad_norm": 1.53277951821348, + "learning_rate": 4.515038167735715e-06, + "loss": 0.2223, + "step": 8763 + }, + { + "epoch": 0.6943157060804119, + "grad_norm": 1.5668923502121344, + "learning_rate": 4.5128927530551e-06, + "loss": 0.1554, + "step": 8764 + }, + { + "epoch": 0.6943949296890474, + "grad_norm": 1.0271379523963158, + "learning_rate": 4.510747699674096e-06, + "loss": 0.0723, + "step": 8765 + }, + { + "epoch": 0.6944741532976827, + "grad_norm": 1.659729203208413, + "learning_rate": 4.50860300773394e-06, + "loss": 0.1685, + "step": 8766 + }, + { + "epoch": 0.6945533769063181, + "grad_norm": 1.52005897124796, + "learning_rate": 4.506458677375856e-06, + "loss": 0.2384, + "step": 8767 + }, + { + "epoch": 0.6946326005149535, + "grad_norm": 1.5540184120262797, + "learning_rate": 4.504314708741037e-06, + "loss": 0.2161, + "step": 8768 + }, + { + "epoch": 0.6947118241235888, + "grad_norm": 1.5411222086441576, + "learning_rate": 4.502171101970645e-06, + "loss": 0.1397, + "step": 8769 + }, + { + "epoch": 0.6947910477322242, + "grad_norm": 1.5807101510254558, + "learning_rate": 4.5000278572058365e-06, + "loss": 0.155, + "step": 8770 + }, + { + "epoch": 0.6948702713408595, + "grad_norm": 1.327370532286676, + "learning_rate": 4.497884974587729e-06, + "loss": 0.1607, + "step": 8771 + }, + { + "epoch": 0.694949494949495, + "grad_norm": 1.6274011071040737, + "learning_rate": 4.495742454257418e-06, + "loss": 0.1787, + "step": 8772 + }, + { + "epoch": 0.6950287185581303, + "grad_norm": 1.7330720916192606, + "learning_rate": 4.493600296355986e-06, + "loss": 0.2272, + "step": 8773 + }, + { + "epoch": 0.6951079421667657, + "grad_norm": 1.6835440092590113, + "learning_rate": 4.491458501024479e-06, + "loss": 0.2284, + "step": 8774 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 1.4897429817901569, + "learning_rate": 4.489317068403919e-06, + "loss": 0.134, + "step": 8775 + }, + { + "epoch": 0.6952663893840364, + "grad_norm": 1.4167617005001414, + "learning_rate": 4.487175998635319e-06, + "loss": 0.1939, + "step": 8776 + }, + { + "epoch": 0.6953456129926718, + "grad_norm": 1.7576690089550147, + "learning_rate": 4.485035291859654e-06, + "loss": 0.1967, + "step": 8777 + }, + { + "epoch": 0.6954248366013072, + "grad_norm": 1.5485858735484967, + "learning_rate": 4.482894948217875e-06, + "loss": 0.1281, + "step": 8778 + }, + { + "epoch": 0.6955040602099426, + "grad_norm": 1.1978771136500905, + "learning_rate": 4.48075496785092e-06, + "loss": 0.1168, + "step": 8779 + }, + { + "epoch": 0.6955832838185779, + "grad_norm": 1.6552400910180027, + "learning_rate": 4.4786153508996944e-06, + "loss": 0.1665, + "step": 8780 + }, + { + "epoch": 0.6956625074272134, + "grad_norm": 1.4449352306810848, + "learning_rate": 4.47647609750508e-06, + "loss": 0.1752, + "step": 8781 + }, + { + "epoch": 0.6957417310358487, + "grad_norm": 1.4604329747333165, + "learning_rate": 4.4743372078079335e-06, + "loss": 0.2001, + "step": 8782 + }, + { + "epoch": 0.695820954644484, + "grad_norm": 1.6249618198063909, + "learning_rate": 4.472198681949098e-06, + "loss": 0.1974, + "step": 8783 + }, + { + "epoch": 0.6959001782531195, + "grad_norm": 1.6229216171237462, + "learning_rate": 4.470060520069381e-06, + "loss": 0.2059, + "step": 8784 + }, + { + "epoch": 0.6959794018617548, + "grad_norm": 1.5001605629559505, + "learning_rate": 4.467922722309567e-06, + "loss": 0.2107, + "step": 8785 + }, + { + "epoch": 0.6960586254703902, + "grad_norm": 1.5417537946201236, + "learning_rate": 4.465785288810427e-06, + "loss": 0.1873, + "step": 8786 + }, + { + "epoch": 0.6961378490790255, + "grad_norm": 1.5249782011005326, + "learning_rate": 4.4636482197126965e-06, + "loss": 0.1825, + "step": 8787 + }, + { + "epoch": 0.696217072687661, + "grad_norm": 1.27441986603953, + "learning_rate": 4.461511515157087e-06, + "loss": 0.1317, + "step": 8788 + }, + { + "epoch": 0.6962962962962963, + "grad_norm": 1.5797578812923285, + "learning_rate": 4.459375175284299e-06, + "loss": 0.1731, + "step": 8789 + }, + { + "epoch": 0.6963755199049316, + "grad_norm": 1.6589594867140478, + "learning_rate": 4.457239200234996e-06, + "loss": 0.2071, + "step": 8790 + }, + { + "epoch": 0.6964547435135671, + "grad_norm": 1.2242560138144052, + "learning_rate": 4.4551035901498186e-06, + "loss": 0.1612, + "step": 8791 + }, + { + "epoch": 0.6965339671222024, + "grad_norm": 1.2752944417907712, + "learning_rate": 4.4529683451693916e-06, + "loss": 0.1648, + "step": 8792 + }, + { + "epoch": 0.6966131907308378, + "grad_norm": 1.559373174148234, + "learning_rate": 4.45083346543431e-06, + "loss": 0.1804, + "step": 8793 + }, + { + "epoch": 0.6966924143394732, + "grad_norm": 1.514190432817593, + "learning_rate": 4.448698951085143e-06, + "loss": 0.1849, + "step": 8794 + }, + { + "epoch": 0.6967716379481086, + "grad_norm": 1.2216284628743566, + "learning_rate": 4.446564802262435e-06, + "loss": 0.0752, + "step": 8795 + }, + { + "epoch": 0.6968508615567439, + "grad_norm": 1.6983014512321104, + "learning_rate": 4.444431019106718e-06, + "loss": 0.1408, + "step": 8796 + }, + { + "epoch": 0.6969300851653792, + "grad_norm": 1.7940706976307725, + "learning_rate": 4.4422976017584866e-06, + "loss": 0.2423, + "step": 8797 + }, + { + "epoch": 0.6970093087740147, + "grad_norm": 1.4713389715624672, + "learning_rate": 4.440164550358212e-06, + "loss": 0.1736, + "step": 8798 + }, + { + "epoch": 0.69708853238265, + "grad_norm": 1.4864484111845657, + "learning_rate": 4.438031865046353e-06, + "loss": 0.1406, + "step": 8799 + }, + { + "epoch": 0.6971677559912854, + "grad_norm": 1.4426463147987603, + "learning_rate": 4.435899545963333e-06, + "loss": 0.1687, + "step": 8800 + }, + { + "epoch": 0.6972469795999208, + "grad_norm": 1.2427763308582487, + "learning_rate": 4.4337675932495515e-06, + "loss": 0.1252, + "step": 8801 + }, + { + "epoch": 0.6973262032085561, + "grad_norm": 1.4530691095101038, + "learning_rate": 4.431636007045396e-06, + "loss": 0.1211, + "step": 8802 + }, + { + "epoch": 0.6974054268171915, + "grad_norm": 1.5475078520014272, + "learning_rate": 4.429504787491214e-06, + "loss": 0.1457, + "step": 8803 + }, + { + "epoch": 0.6974846504258269, + "grad_norm": 1.662190831622774, + "learning_rate": 4.427373934727337e-06, + "loss": 0.3278, + "step": 8804 + }, + { + "epoch": 0.6975638740344623, + "grad_norm": 1.306433587450784, + "learning_rate": 4.425243448894074e-06, + "loss": 0.1169, + "step": 8805 + }, + { + "epoch": 0.6976430976430976, + "grad_norm": 1.4739453168690189, + "learning_rate": 4.423113330131708e-06, + "loss": 0.1825, + "step": 8806 + }, + { + "epoch": 0.6977223212517331, + "grad_norm": 2.0608129315474155, + "learning_rate": 4.42098357858049e-06, + "loss": 0.2356, + "step": 8807 + }, + { + "epoch": 0.6978015448603684, + "grad_norm": 1.3136278113741746, + "learning_rate": 4.418854194380663e-06, + "loss": 0.1831, + "step": 8808 + }, + { + "epoch": 0.6978807684690037, + "grad_norm": 1.1972763349322344, + "learning_rate": 4.416725177672432e-06, + "loss": 0.119, + "step": 8809 + }, + { + "epoch": 0.6979599920776391, + "grad_norm": 1.2721603945797595, + "learning_rate": 4.4145965285959836e-06, + "loss": 0.1668, + "step": 8810 + }, + { + "epoch": 0.6980392156862745, + "grad_norm": 1.4178797548286315, + "learning_rate": 4.412468247291474e-06, + "loss": 0.1585, + "step": 8811 + }, + { + "epoch": 0.6981184392949099, + "grad_norm": 1.4284786010511097, + "learning_rate": 4.410340333899049e-06, + "loss": 0.1732, + "step": 8812 + }, + { + "epoch": 0.6981976629035452, + "grad_norm": 1.7700032727310824, + "learning_rate": 4.408212788558818e-06, + "loss": 0.2066, + "step": 8813 + }, + { + "epoch": 0.6982768865121807, + "grad_norm": 1.9077384061794467, + "learning_rate": 4.406085611410864e-06, + "loss": 0.2425, + "step": 8814 + }, + { + "epoch": 0.698356110120816, + "grad_norm": 1.7435868726619532, + "learning_rate": 4.403958802595261e-06, + "loss": 0.2269, + "step": 8815 + }, + { + "epoch": 0.6984353337294513, + "grad_norm": 1.59591571426409, + "learning_rate": 4.401832362252044e-06, + "loss": 0.1892, + "step": 8816 + }, + { + "epoch": 0.6985145573380868, + "grad_norm": 1.5245462242490293, + "learning_rate": 4.399706290521225e-06, + "loss": 0.1577, + "step": 8817 + }, + { + "epoch": 0.6985937809467221, + "grad_norm": 1.6348777969074537, + "learning_rate": 4.397580587542805e-06, + "loss": 0.18, + "step": 8818 + }, + { + "epoch": 0.6986730045553575, + "grad_norm": 1.3656603707151018, + "learning_rate": 4.3954552534567455e-06, + "loss": 0.1871, + "step": 8819 + }, + { + "epoch": 0.6987522281639929, + "grad_norm": 1.5013686411109204, + "learning_rate": 4.393330288402986e-06, + "loss": 0.2102, + "step": 8820 + }, + { + "epoch": 0.6988314517726283, + "grad_norm": 1.7795540555044727, + "learning_rate": 4.391205692521453e-06, + "loss": 0.2148, + "step": 8821 + }, + { + "epoch": 0.6989106753812636, + "grad_norm": 1.544563262211698, + "learning_rate": 4.389081465952039e-06, + "loss": 0.2224, + "step": 8822 + }, + { + "epoch": 0.6989898989898989, + "grad_norm": 1.185014011067305, + "learning_rate": 4.386957608834607e-06, + "loss": 0.1303, + "step": 8823 + }, + { + "epoch": 0.6990691225985344, + "grad_norm": 1.7588568938329712, + "learning_rate": 4.384834121309013e-06, + "loss": 0.2658, + "step": 8824 + }, + { + "epoch": 0.6991483462071697, + "grad_norm": 1.7644065661209543, + "learning_rate": 4.382711003515072e-06, + "loss": 0.2251, + "step": 8825 + }, + { + "epoch": 0.6992275698158051, + "grad_norm": 1.4340255292466626, + "learning_rate": 4.3805882555925846e-06, + "loss": 0.1406, + "step": 8826 + }, + { + "epoch": 0.6993067934244405, + "grad_norm": 1.6475867037658725, + "learning_rate": 4.378465877681317e-06, + "loss": 0.179, + "step": 8827 + }, + { + "epoch": 0.6993860170330759, + "grad_norm": 1.5567330383733182, + "learning_rate": 4.376343869921027e-06, + "loss": 0.1759, + "step": 8828 + }, + { + "epoch": 0.6994652406417112, + "grad_norm": 1.6587662626714532, + "learning_rate": 4.374222232451433e-06, + "loss": 0.2468, + "step": 8829 + }, + { + "epoch": 0.6995444642503466, + "grad_norm": 1.3883333374420803, + "learning_rate": 4.3721009654122315e-06, + "loss": 0.1909, + "step": 8830 + }, + { + "epoch": 0.699623687858982, + "grad_norm": 1.046845583936344, + "learning_rate": 4.369980068943106e-06, + "loss": 0.1085, + "step": 8831 + }, + { + "epoch": 0.6997029114676173, + "grad_norm": 1.3935156296815707, + "learning_rate": 4.367859543183702e-06, + "loss": 0.1833, + "step": 8832 + }, + { + "epoch": 0.6997821350762528, + "grad_norm": 1.3237658160684287, + "learning_rate": 4.3657393882736456e-06, + "loss": 0.1298, + "step": 8833 + }, + { + "epoch": 0.6998613586848881, + "grad_norm": 1.5712109424456409, + "learning_rate": 4.3636196043525415e-06, + "loss": 0.2106, + "step": 8834 + }, + { + "epoch": 0.6999405822935235, + "grad_norm": 1.3725337887334645, + "learning_rate": 4.361500191559967e-06, + "loss": 0.1747, + "step": 8835 + }, + { + "epoch": 0.7000198059021588, + "grad_norm": 1.5623309104647334, + "learning_rate": 4.35938115003547e-06, + "loss": 0.216, + "step": 8836 + }, + { + "epoch": 0.7000990295107942, + "grad_norm": 1.2808268527345514, + "learning_rate": 4.357262479918587e-06, + "loss": 0.1374, + "step": 8837 + }, + { + "epoch": 0.7001782531194296, + "grad_norm": 1.4413433223481251, + "learning_rate": 4.355144181348819e-06, + "loss": 0.1567, + "step": 8838 + }, + { + "epoch": 0.7002574767280649, + "grad_norm": 1.3885586702137263, + "learning_rate": 4.353026254465642e-06, + "loss": 0.1537, + "step": 8839 + }, + { + "epoch": 0.7003367003367004, + "grad_norm": 1.3189938295933838, + "learning_rate": 4.350908699408521e-06, + "loss": 0.1915, + "step": 8840 + }, + { + "epoch": 0.7004159239453357, + "grad_norm": 1.5574676213052678, + "learning_rate": 4.348791516316878e-06, + "loss": 0.1929, + "step": 8841 + }, + { + "epoch": 0.7004951475539711, + "grad_norm": 1.519744105590134, + "learning_rate": 4.346674705330117e-06, + "loss": 0.211, + "step": 8842 + }, + { + "epoch": 0.7005743711626065, + "grad_norm": 1.4923351191068697, + "learning_rate": 4.344558266587628e-06, + "loss": 0.1728, + "step": 8843 + }, + { + "epoch": 0.7006535947712418, + "grad_norm": 1.2777758524075027, + "learning_rate": 4.342442200228766e-06, + "loss": 0.1165, + "step": 8844 + }, + { + "epoch": 0.7007328183798772, + "grad_norm": 1.4407727984525156, + "learning_rate": 4.340326506392859e-06, + "loss": 0.1399, + "step": 8845 + }, + { + "epoch": 0.7008120419885125, + "grad_norm": 1.408086677255607, + "learning_rate": 4.338211185219222e-06, + "loss": 0.1806, + "step": 8846 + }, + { + "epoch": 0.700891265597148, + "grad_norm": 1.8640505525064341, + "learning_rate": 4.336096236847136e-06, + "loss": 0.2228, + "step": 8847 + }, + { + "epoch": 0.7009704892057833, + "grad_norm": 1.4004136210739297, + "learning_rate": 4.333981661415856e-06, + "loss": 0.0971, + "step": 8848 + }, + { + "epoch": 0.7010497128144187, + "grad_norm": 1.9819534699666377, + "learning_rate": 4.331867459064623e-06, + "loss": 0.164, + "step": 8849 + }, + { + "epoch": 0.7011289364230541, + "grad_norm": 1.4553427709485196, + "learning_rate": 4.329753629932646e-06, + "loss": 0.2207, + "step": 8850 + }, + { + "epoch": 0.7012081600316894, + "grad_norm": 1.3298618348286386, + "learning_rate": 4.327640174159109e-06, + "loss": 0.1999, + "step": 8851 + }, + { + "epoch": 0.7012873836403248, + "grad_norm": 2.6685826773151295, + "learning_rate": 4.325527091883168e-06, + "loss": 0.1119, + "step": 8852 + }, + { + "epoch": 0.7013666072489602, + "grad_norm": 1.644816105366692, + "learning_rate": 4.323414383243969e-06, + "loss": 0.1971, + "step": 8853 + }, + { + "epoch": 0.7014458308575956, + "grad_norm": 1.311762122734592, + "learning_rate": 4.321302048380619e-06, + "loss": 0.1716, + "step": 8854 + }, + { + "epoch": 0.7015250544662309, + "grad_norm": 1.9886884479696556, + "learning_rate": 4.319190087432201e-06, + "loss": 0.2093, + "step": 8855 + }, + { + "epoch": 0.7016042780748664, + "grad_norm": 1.531879407864641, + "learning_rate": 4.317078500537785e-06, + "loss": 0.242, + "step": 8856 + }, + { + "epoch": 0.7016835016835017, + "grad_norm": 1.6220269552376552, + "learning_rate": 4.314967287836405e-06, + "loss": 0.1758, + "step": 8857 + }, + { + "epoch": 0.701762725292137, + "grad_norm": 1.8019874746318434, + "learning_rate": 4.3128564494670715e-06, + "loss": 0.1991, + "step": 8858 + }, + { + "epoch": 0.7018419489007724, + "grad_norm": 1.5588080158695856, + "learning_rate": 4.310745985568779e-06, + "loss": 0.192, + "step": 8859 + }, + { + "epoch": 0.7019211725094078, + "grad_norm": 1.76137450276114, + "learning_rate": 4.3086358962804885e-06, + "loss": 0.1973, + "step": 8860 + }, + { + "epoch": 0.7020003961180432, + "grad_norm": 1.6124635917507928, + "learning_rate": 4.306526181741135e-06, + "loss": 0.1804, + "step": 8861 + }, + { + "epoch": 0.7020796197266785, + "grad_norm": 1.635892995978041, + "learning_rate": 4.304416842089641e-06, + "loss": 0.1845, + "step": 8862 + }, + { + "epoch": 0.702158843335314, + "grad_norm": 1.9615192963171089, + "learning_rate": 4.302307877464893e-06, + "loss": 0.1946, + "step": 8863 + }, + { + "epoch": 0.7022380669439493, + "grad_norm": 1.4449076665617835, + "learning_rate": 4.300199288005753e-06, + "loss": 0.1803, + "step": 8864 + }, + { + "epoch": 0.7023172905525846, + "grad_norm": 1.3918651887048008, + "learning_rate": 4.298091073851066e-06, + "loss": 0.1124, + "step": 8865 + }, + { + "epoch": 0.7023965141612201, + "grad_norm": 1.095485170651712, + "learning_rate": 4.295983235139647e-06, + "loss": 0.109, + "step": 8866 + }, + { + "epoch": 0.7024757377698554, + "grad_norm": 2.058633734473713, + "learning_rate": 4.293875772010287e-06, + "loss": 0.1866, + "step": 8867 + }, + { + "epoch": 0.7025549613784908, + "grad_norm": 1.9308386793385959, + "learning_rate": 4.291768684601746e-06, + "loss": 0.2198, + "step": 8868 + }, + { + "epoch": 0.7026341849871262, + "grad_norm": 1.5543563999701846, + "learning_rate": 4.289661973052774e-06, + "loss": 0.1633, + "step": 8869 + }, + { + "epoch": 0.7027134085957616, + "grad_norm": 1.5453341596645978, + "learning_rate": 4.287555637502086e-06, + "loss": 0.1829, + "step": 8870 + }, + { + "epoch": 0.7027926322043969, + "grad_norm": 1.693391313464157, + "learning_rate": 4.285449678088369e-06, + "loss": 0.2424, + "step": 8871 + }, + { + "epoch": 0.7028718558130322, + "grad_norm": 1.4568989201158453, + "learning_rate": 4.283344094950297e-06, + "loss": 0.1886, + "step": 8872 + }, + { + "epoch": 0.7029510794216677, + "grad_norm": 1.6322315242858605, + "learning_rate": 4.2812388882265095e-06, + "loss": 0.1764, + "step": 8873 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 1.3878895168664522, + "learning_rate": 4.279134058055622e-06, + "loss": 0.1587, + "step": 8874 + }, + { + "epoch": 0.7031095266389384, + "grad_norm": 2.111072501513597, + "learning_rate": 4.2770296045762315e-06, + "loss": 0.1631, + "step": 8875 + }, + { + "epoch": 0.7031887502475738, + "grad_norm": 1.299754877623927, + "learning_rate": 4.274925527926907e-06, + "loss": 0.1962, + "step": 8876 + }, + { + "epoch": 0.7032679738562092, + "grad_norm": 1.373546269350586, + "learning_rate": 4.272821828246183e-06, + "loss": 0.1201, + "step": 8877 + }, + { + "epoch": 0.7033471974648445, + "grad_norm": 1.5419916617151102, + "learning_rate": 4.270718505672588e-06, + "loss": 0.1939, + "step": 8878 + }, + { + "epoch": 0.7034264210734799, + "grad_norm": 1.5567728449586737, + "learning_rate": 4.2686155603446134e-06, + "loss": 0.1811, + "step": 8879 + }, + { + "epoch": 0.7035056446821153, + "grad_norm": 1.702245443561457, + "learning_rate": 4.266512992400726e-06, + "loss": 0.2542, + "step": 8880 + }, + { + "epoch": 0.7035848682907506, + "grad_norm": 1.2994612534873151, + "learning_rate": 4.2644108019793665e-06, + "loss": 0.182, + "step": 8881 + }, + { + "epoch": 0.7036640918993861, + "grad_norm": 1.3977597901480356, + "learning_rate": 4.262308989218961e-06, + "loss": 0.1705, + "step": 8882 + }, + { + "epoch": 0.7037433155080214, + "grad_norm": 1.3800724286233983, + "learning_rate": 4.2602075542579e-06, + "loss": 0.1781, + "step": 8883 + }, + { + "epoch": 0.7038225391166567, + "grad_norm": 1.563107963642451, + "learning_rate": 4.258106497234551e-06, + "loss": 0.1775, + "step": 8884 + }, + { + "epoch": 0.7039017627252921, + "grad_norm": 1.7822286172705617, + "learning_rate": 4.256005818287265e-06, + "loss": 0.2098, + "step": 8885 + }, + { + "epoch": 0.7039809863339275, + "grad_norm": 1.3553466213219136, + "learning_rate": 4.253905517554356e-06, + "loss": 0.1552, + "step": 8886 + }, + { + "epoch": 0.7040602099425629, + "grad_norm": 1.2839616722318572, + "learning_rate": 4.251805595174117e-06, + "loss": 0.1365, + "step": 8887 + }, + { + "epoch": 0.7041394335511982, + "grad_norm": 1.493554645693374, + "learning_rate": 4.249706051284824e-06, + "loss": 0.1992, + "step": 8888 + }, + { + "epoch": 0.7042186571598337, + "grad_norm": 1.6374959145264998, + "learning_rate": 4.24760688602472e-06, + "loss": 0.2374, + "step": 8889 + }, + { + "epoch": 0.704297880768469, + "grad_norm": 1.6289974334495363, + "learning_rate": 4.245508099532021e-06, + "loss": 0.1874, + "step": 8890 + }, + { + "epoch": 0.7043771043771043, + "grad_norm": 1.68010282707641, + "learning_rate": 4.243409691944927e-06, + "loss": 0.2111, + "step": 8891 + }, + { + "epoch": 0.7044563279857398, + "grad_norm": 1.4476513419764068, + "learning_rate": 4.241311663401606e-06, + "loss": 0.1931, + "step": 8892 + }, + { + "epoch": 0.7045355515943751, + "grad_norm": 1.7292278035688797, + "learning_rate": 4.2392140140401996e-06, + "loss": 0.1543, + "step": 8893 + }, + { + "epoch": 0.7046147752030105, + "grad_norm": 1.6431321335300442, + "learning_rate": 4.237116743998835e-06, + "loss": 0.2225, + "step": 8894 + }, + { + "epoch": 0.7046939988116458, + "grad_norm": 1.3570553370735827, + "learning_rate": 4.235019853415603e-06, + "loss": 0.1912, + "step": 8895 + }, + { + "epoch": 0.7047732224202813, + "grad_norm": 1.227240285546194, + "learning_rate": 4.232923342428574e-06, + "loss": 0.1541, + "step": 8896 + }, + { + "epoch": 0.7048524460289166, + "grad_norm": 1.6151021982027407, + "learning_rate": 4.230827211175791e-06, + "loss": 0.2092, + "step": 8897 + }, + { + "epoch": 0.7049316696375519, + "grad_norm": 1.458975248192357, + "learning_rate": 4.22873145979528e-06, + "loss": 0.1252, + "step": 8898 + }, + { + "epoch": 0.7050108932461874, + "grad_norm": 1.5557373638847387, + "learning_rate": 4.226636088425033e-06, + "loss": 0.1744, + "step": 8899 + }, + { + "epoch": 0.7050901168548227, + "grad_norm": 1.6646056763561519, + "learning_rate": 4.2245410972030154e-06, + "loss": 0.1758, + "step": 8900 + }, + { + "epoch": 0.7051693404634581, + "grad_norm": 1.4150966729128942, + "learning_rate": 4.222446486267181e-06, + "loss": 0.167, + "step": 8901 + }, + { + "epoch": 0.7052485640720935, + "grad_norm": 1.6674804939628438, + "learning_rate": 4.220352255755445e-06, + "loss": 0.2148, + "step": 8902 + }, + { + "epoch": 0.7053277876807289, + "grad_norm": 1.3256800193370304, + "learning_rate": 4.218258405805701e-06, + "loss": 0.1189, + "step": 8903 + }, + { + "epoch": 0.7054070112893642, + "grad_norm": 1.673897002571839, + "learning_rate": 4.216164936555823e-06, + "loss": 0.2511, + "step": 8904 + }, + { + "epoch": 0.7054862348979996, + "grad_norm": 2.2486388415797007, + "learning_rate": 4.214071848143655e-06, + "loss": 0.3101, + "step": 8905 + }, + { + "epoch": 0.705565458506635, + "grad_norm": 1.34213107718111, + "learning_rate": 4.211979140707012e-06, + "loss": 0.1628, + "step": 8906 + }, + { + "epoch": 0.7056446821152703, + "grad_norm": 1.6337656733968111, + "learning_rate": 4.209886814383696e-06, + "loss": 0.2478, + "step": 8907 + }, + { + "epoch": 0.7057239057239058, + "grad_norm": 1.5786716513057881, + "learning_rate": 4.207794869311472e-06, + "loss": 0.2083, + "step": 8908 + }, + { + "epoch": 0.7058031293325411, + "grad_norm": 1.7102498400449593, + "learning_rate": 4.205703305628082e-06, + "loss": 0.191, + "step": 8909 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.49348634855011, + "learning_rate": 4.203612123471254e-06, + "loss": 0.214, + "step": 8910 + }, + { + "epoch": 0.7059615765498118, + "grad_norm": 1.1954013435293973, + "learning_rate": 4.201521322978677e-06, + "loss": 0.1191, + "step": 8911 + }, + { + "epoch": 0.7060408001584472, + "grad_norm": 1.6047416950397644, + "learning_rate": 4.19943090428802e-06, + "loss": 0.239, + "step": 8912 + }, + { + "epoch": 0.7061200237670826, + "grad_norm": 1.283928520375124, + "learning_rate": 4.197340867536923e-06, + "loss": 0.097, + "step": 8913 + }, + { + "epoch": 0.7061992473757179, + "grad_norm": 1.2820688306081138, + "learning_rate": 4.195251212863014e-06, + "loss": 0.1543, + "step": 8914 + }, + { + "epoch": 0.7062784709843534, + "grad_norm": 2.3078728324176594, + "learning_rate": 4.193161940403882e-06, + "loss": 0.3387, + "step": 8915 + }, + { + "epoch": 0.7063576945929887, + "grad_norm": 1.087293674546052, + "learning_rate": 4.191073050297091e-06, + "loss": 0.0712, + "step": 8916 + }, + { + "epoch": 0.7064369182016241, + "grad_norm": 1.8274393871135801, + "learning_rate": 4.188984542680192e-06, + "loss": 0.2214, + "step": 8917 + }, + { + "epoch": 0.7065161418102595, + "grad_norm": 1.599643811720071, + "learning_rate": 4.186896417690701e-06, + "loss": 0.1643, + "step": 8918 + }, + { + "epoch": 0.7065953654188948, + "grad_norm": 1.5811607679026036, + "learning_rate": 4.18480867546611e-06, + "loss": 0.1555, + "step": 8919 + }, + { + "epoch": 0.7066745890275302, + "grad_norm": 1.7593280724556661, + "learning_rate": 4.182721316143888e-06, + "loss": 0.2135, + "step": 8920 + }, + { + "epoch": 0.7067538126361655, + "grad_norm": 1.2460739511293573, + "learning_rate": 4.180634339861474e-06, + "loss": 0.1188, + "step": 8921 + }, + { + "epoch": 0.706833036244801, + "grad_norm": 1.5206373564706603, + "learning_rate": 4.178547746756285e-06, + "loss": 0.1931, + "step": 8922 + }, + { + "epoch": 0.7069122598534363, + "grad_norm": 1.5331165661126867, + "learning_rate": 4.17646153696572e-06, + "loss": 0.1353, + "step": 8923 + }, + { + "epoch": 0.7069914834620717, + "grad_norm": 1.4713285224804602, + "learning_rate": 4.174375710627141e-06, + "loss": 0.1645, + "step": 8924 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 1.3565004481785639, + "learning_rate": 4.172290267877887e-06, + "loss": 0.1651, + "step": 8925 + }, + { + "epoch": 0.7071499306793424, + "grad_norm": 1.1857166425406487, + "learning_rate": 4.170205208855281e-06, + "loss": 0.1084, + "step": 8926 + }, + { + "epoch": 0.7072291542879778, + "grad_norm": 1.4229593454891545, + "learning_rate": 4.1681205336966115e-06, + "loss": 0.1765, + "step": 8927 + }, + { + "epoch": 0.7073083778966132, + "grad_norm": 1.3232025580629834, + "learning_rate": 4.16603624253914e-06, + "loss": 0.1206, + "step": 8928 + }, + { + "epoch": 0.7073876015052486, + "grad_norm": 1.9225876309425236, + "learning_rate": 4.163952335520114e-06, + "loss": 0.3107, + "step": 8929 + }, + { + "epoch": 0.7074668251138839, + "grad_norm": 1.0757666515378737, + "learning_rate": 4.161868812776746e-06, + "loss": 0.1145, + "step": 8930 + }, + { + "epoch": 0.7075460487225194, + "grad_norm": 1.5022710811190123, + "learning_rate": 4.15978567444622e-06, + "loss": 0.1465, + "step": 8931 + }, + { + "epoch": 0.7076252723311547, + "grad_norm": 1.272838566728464, + "learning_rate": 4.157702920665712e-06, + "loss": 0.1375, + "step": 8932 + }, + { + "epoch": 0.70770449593979, + "grad_norm": 1.4737454732909687, + "learning_rate": 4.155620551572354e-06, + "loss": 0.2105, + "step": 8933 + }, + { + "epoch": 0.7077837195484254, + "grad_norm": 1.369309603297265, + "learning_rate": 4.153538567303258e-06, + "loss": 0.0912, + "step": 8934 + }, + { + "epoch": 0.7078629431570608, + "grad_norm": 1.8278682791174095, + "learning_rate": 4.151456967995519e-06, + "loss": 0.2138, + "step": 8935 + }, + { + "epoch": 0.7079421667656962, + "grad_norm": 1.2170316453851604, + "learning_rate": 4.149375753786198e-06, + "loss": 0.0982, + "step": 8936 + }, + { + "epoch": 0.7080213903743315, + "grad_norm": 1.6068213562410223, + "learning_rate": 4.147294924812332e-06, + "loss": 0.1574, + "step": 8937 + }, + { + "epoch": 0.708100613982967, + "grad_norm": 1.5793928878700478, + "learning_rate": 4.14521448121093e-06, + "loss": 0.1629, + "step": 8938 + }, + { + "epoch": 0.7081798375916023, + "grad_norm": 1.5107443079064404, + "learning_rate": 4.143134423118986e-06, + "loss": 0.1481, + "step": 8939 + }, + { + "epoch": 0.7082590612002376, + "grad_norm": 1.567755304830041, + "learning_rate": 4.14105475067346e-06, + "loss": 0.2081, + "step": 8940 + }, + { + "epoch": 0.7083382848088731, + "grad_norm": 1.6685796617507627, + "learning_rate": 4.138975464011284e-06, + "loss": 0.2137, + "step": 8941 + }, + { + "epoch": 0.7084175084175084, + "grad_norm": 1.6514773236188476, + "learning_rate": 4.136896563269375e-06, + "loss": 0.1853, + "step": 8942 + }, + { + "epoch": 0.7084967320261438, + "grad_norm": 1.9418999289604644, + "learning_rate": 4.1348180485846145e-06, + "loss": 0.1971, + "step": 8943 + }, + { + "epoch": 0.7085759556347792, + "grad_norm": 1.9723355117215167, + "learning_rate": 4.1327399200938625e-06, + "loss": 0.3027, + "step": 8944 + }, + { + "epoch": 0.7086551792434146, + "grad_norm": 1.2330073929045136, + "learning_rate": 4.1306621779339585e-06, + "loss": 0.121, + "step": 8945 + }, + { + "epoch": 0.7087344028520499, + "grad_norm": 1.8777243427088852, + "learning_rate": 4.128584822241708e-06, + "loss": 0.2223, + "step": 8946 + }, + { + "epoch": 0.7088136264606852, + "grad_norm": 1.35663460768155, + "learning_rate": 4.126507853153891e-06, + "loss": 0.1125, + "step": 8947 + }, + { + "epoch": 0.7088928500693207, + "grad_norm": 1.1195765404518176, + "learning_rate": 4.124431270807277e-06, + "loss": 0.121, + "step": 8948 + }, + { + "epoch": 0.708972073677956, + "grad_norm": 1.6917336592800893, + "learning_rate": 4.12235507533859e-06, + "loss": 0.2218, + "step": 8949 + }, + { + "epoch": 0.7090512972865914, + "grad_norm": 1.7499013182403091, + "learning_rate": 4.120279266884537e-06, + "loss": 0.2176, + "step": 8950 + }, + { + "epoch": 0.7091305208952268, + "grad_norm": 1.5489162800662826, + "learning_rate": 4.118203845581807e-06, + "loss": 0.2054, + "step": 8951 + }, + { + "epoch": 0.7092097445038622, + "grad_norm": 1.844033527755078, + "learning_rate": 4.11612881156705e-06, + "loss": 0.2106, + "step": 8952 + }, + { + "epoch": 0.7092889681124975, + "grad_norm": 1.6095966575988605, + "learning_rate": 4.114054164976902e-06, + "loss": 0.1587, + "step": 8953 + }, + { + "epoch": 0.7093681917211329, + "grad_norm": 1.4201294765393386, + "learning_rate": 4.111979905947961e-06, + "loss": 0.1545, + "step": 8954 + }, + { + "epoch": 0.7094474153297683, + "grad_norm": 1.1165702107681494, + "learning_rate": 4.109906034616816e-06, + "loss": 0.1274, + "step": 8955 + }, + { + "epoch": 0.7095266389384036, + "grad_norm": 1.4700168991468963, + "learning_rate": 4.107832551120017e-06, + "loss": 0.193, + "step": 8956 + }, + { + "epoch": 0.7096058625470391, + "grad_norm": 1.1709800529163041, + "learning_rate": 4.105759455594091e-06, + "loss": 0.1316, + "step": 8957 + }, + { + "epoch": 0.7096850861556744, + "grad_norm": 1.456579609825864, + "learning_rate": 4.103686748175545e-06, + "loss": 0.1847, + "step": 8958 + }, + { + "epoch": 0.7097643097643098, + "grad_norm": 1.325141186202832, + "learning_rate": 4.101614429000857e-06, + "loss": 0.1402, + "step": 8959 + }, + { + "epoch": 0.7098435333729451, + "grad_norm": 1.5067284432450918, + "learning_rate": 4.099542498206473e-06, + "loss": 0.1789, + "step": 8960 + }, + { + "epoch": 0.7099227569815805, + "grad_norm": 1.705231640912638, + "learning_rate": 4.0974709559288275e-06, + "loss": 0.2167, + "step": 8961 + }, + { + "epoch": 0.7100019805902159, + "grad_norm": 1.4578949177567861, + "learning_rate": 4.095399802304319e-06, + "loss": 0.1504, + "step": 8962 + }, + { + "epoch": 0.7100812041988512, + "grad_norm": 2.1730986850409555, + "learning_rate": 4.093329037469319e-06, + "loss": 0.2615, + "step": 8963 + }, + { + "epoch": 0.7101604278074867, + "grad_norm": 1.7145477781340637, + "learning_rate": 4.091258661560184e-06, + "loss": 0.1577, + "step": 8964 + }, + { + "epoch": 0.710239651416122, + "grad_norm": 2.156651765800912, + "learning_rate": 4.0891886747132356e-06, + "loss": 0.2114, + "step": 8965 + }, + { + "epoch": 0.7103188750247573, + "grad_norm": 1.6653329261705643, + "learning_rate": 4.087119077064772e-06, + "loss": 0.118, + "step": 8966 + }, + { + "epoch": 0.7103980986333928, + "grad_norm": 1.3683795423548373, + "learning_rate": 4.085049868751062e-06, + "loss": 0.1931, + "step": 8967 + }, + { + "epoch": 0.7104773222420281, + "grad_norm": 1.454576484937789, + "learning_rate": 4.082981049908362e-06, + "loss": 0.0995, + "step": 8968 + }, + { + "epoch": 0.7105565458506635, + "grad_norm": 1.579544254935876, + "learning_rate": 4.080912620672888e-06, + "loss": 0.2261, + "step": 8969 + }, + { + "epoch": 0.7106357694592988, + "grad_norm": 1.4682726161701445, + "learning_rate": 4.078844581180833e-06, + "loss": 0.1895, + "step": 8970 + }, + { + "epoch": 0.7107149930679343, + "grad_norm": 1.8744033965405815, + "learning_rate": 4.076776931568376e-06, + "loss": 0.2606, + "step": 8971 + }, + { + "epoch": 0.7107942166765696, + "grad_norm": 1.371485826727412, + "learning_rate": 4.074709671971657e-06, + "loss": 0.1256, + "step": 8972 + }, + { + "epoch": 0.7108734402852049, + "grad_norm": 1.4662934552176865, + "learning_rate": 4.0726428025267925e-06, + "loss": 0.1427, + "step": 8973 + }, + { + "epoch": 0.7109526638938404, + "grad_norm": 1.8949072818025252, + "learning_rate": 4.070576323369882e-06, + "loss": 0.2326, + "step": 8974 + }, + { + "epoch": 0.7110318875024757, + "grad_norm": 1.8991883085910681, + "learning_rate": 4.06851023463699e-06, + "loss": 0.2902, + "step": 8975 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 1.6392899595465211, + "learning_rate": 4.066444536464155e-06, + "loss": 0.2274, + "step": 8976 + }, + { + "epoch": 0.7111903347197465, + "grad_norm": 1.7578667092894662, + "learning_rate": 4.0643792289874e-06, + "loss": 0.2305, + "step": 8977 + }, + { + "epoch": 0.7112695583283819, + "grad_norm": 1.9626291415757584, + "learning_rate": 4.062314312342712e-06, + "loss": 0.1657, + "step": 8978 + }, + { + "epoch": 0.7113487819370172, + "grad_norm": 1.7389273444282596, + "learning_rate": 4.060249786666054e-06, + "loss": 0.2038, + "step": 8979 + }, + { + "epoch": 0.7114280055456526, + "grad_norm": 1.3703605387337299, + "learning_rate": 4.0581856520933706e-06, + "loss": 0.2161, + "step": 8980 + }, + { + "epoch": 0.711507229154288, + "grad_norm": 1.6247431069039082, + "learning_rate": 4.056121908760571e-06, + "loss": 0.2027, + "step": 8981 + }, + { + "epoch": 0.7115864527629233, + "grad_norm": 1.544129850790123, + "learning_rate": 4.054058556803544e-06, + "loss": 0.1786, + "step": 8982 + }, + { + "epoch": 0.7116656763715588, + "grad_norm": 1.6311335668482034, + "learning_rate": 4.051995596358147e-06, + "loss": 0.2208, + "step": 8983 + }, + { + "epoch": 0.7117448999801941, + "grad_norm": 3.124595982662079, + "learning_rate": 4.049933027560225e-06, + "loss": 0.3187, + "step": 8984 + }, + { + "epoch": 0.7118241235888295, + "grad_norm": 1.7271318448930384, + "learning_rate": 4.047870850545581e-06, + "loss": 0.1982, + "step": 8985 + }, + { + "epoch": 0.7119033471974648, + "grad_norm": 1.3808739422391276, + "learning_rate": 4.045809065449999e-06, + "loss": 0.1368, + "step": 8986 + }, + { + "epoch": 0.7119825708061002, + "grad_norm": 1.0924354926833104, + "learning_rate": 4.043747672409245e-06, + "loss": 0.1299, + "step": 8987 + }, + { + "epoch": 0.7120617944147356, + "grad_norm": 1.47416609643582, + "learning_rate": 4.041686671559046e-06, + "loss": 0.2158, + "step": 8988 + }, + { + "epoch": 0.7121410180233709, + "grad_norm": 1.7090478794970305, + "learning_rate": 4.039626063035107e-06, + "loss": 0.2056, + "step": 8989 + }, + { + "epoch": 0.7122202416320064, + "grad_norm": 1.3963615134577552, + "learning_rate": 4.0375658469731164e-06, + "loss": 0.1883, + "step": 8990 + }, + { + "epoch": 0.7122994652406417, + "grad_norm": 1.3498996238213241, + "learning_rate": 4.035506023508724e-06, + "loss": 0.1626, + "step": 8991 + }, + { + "epoch": 0.7123786888492771, + "grad_norm": 1.230653252995674, + "learning_rate": 4.033446592777558e-06, + "loss": 0.1322, + "step": 8992 + }, + { + "epoch": 0.7124579124579125, + "grad_norm": 1.1739276112170387, + "learning_rate": 4.031387554915228e-06, + "loss": 0.1593, + "step": 8993 + }, + { + "epoch": 0.7125371360665478, + "grad_norm": 1.4832182470118986, + "learning_rate": 4.029328910057308e-06, + "loss": 0.1964, + "step": 8994 + }, + { + "epoch": 0.7126163596751832, + "grad_norm": 1.5219491163295604, + "learning_rate": 4.027270658339347e-06, + "loss": 0.1321, + "step": 8995 + }, + { + "epoch": 0.7126955832838185, + "grad_norm": 1.6179589454559815, + "learning_rate": 4.025212799896881e-06, + "loss": 0.1946, + "step": 8996 + }, + { + "epoch": 0.712774806892454, + "grad_norm": 1.613501230362664, + "learning_rate": 4.023155334865401e-06, + "loss": 0.1631, + "step": 8997 + }, + { + "epoch": 0.7128540305010893, + "grad_norm": 1.7882736694309616, + "learning_rate": 4.0210982633803784e-06, + "loss": 0.1753, + "step": 8998 + }, + { + "epoch": 0.7129332541097247, + "grad_norm": 1.4327642269184218, + "learning_rate": 4.01904158557727e-06, + "loss": 0.1615, + "step": 8999 + }, + { + "epoch": 0.7130124777183601, + "grad_norm": 1.4956853444450924, + "learning_rate": 4.016985301591496e-06, + "loss": 0.1948, + "step": 9000 + }, + { + "epoch": 0.7130917013269954, + "grad_norm": 1.4077558249004838, + "learning_rate": 4.014929411558447e-06, + "loss": 0.1405, + "step": 9001 + }, + { + "epoch": 0.7131709249356308, + "grad_norm": 1.678598755123887, + "learning_rate": 4.012873915613501e-06, + "loss": 0.1211, + "step": 9002 + }, + { + "epoch": 0.7132501485442662, + "grad_norm": 3.0052966281151092, + "learning_rate": 4.010818813892e-06, + "loss": 0.2329, + "step": 9003 + }, + { + "epoch": 0.7133293721529016, + "grad_norm": 1.6560195408304361, + "learning_rate": 4.008764106529259e-06, + "loss": 0.2034, + "step": 9004 + }, + { + "epoch": 0.7134085957615369, + "grad_norm": 1.612467944984783, + "learning_rate": 4.006709793660577e-06, + "loss": 0.1936, + "step": 9005 + }, + { + "epoch": 0.7134878193701724, + "grad_norm": 1.5170749401348624, + "learning_rate": 4.004655875421217e-06, + "loss": 0.1835, + "step": 9006 + }, + { + "epoch": 0.7135670429788077, + "grad_norm": 1.7396927164660396, + "learning_rate": 4.00260235194642e-06, + "loss": 0.2247, + "step": 9007 + }, + { + "epoch": 0.713646266587443, + "grad_norm": 1.528678636120324, + "learning_rate": 4.0005492233713964e-06, + "loss": 0.2651, + "step": 9008 + }, + { + "epoch": 0.7137254901960784, + "grad_norm": 1.9833407046910196, + "learning_rate": 3.998496489831343e-06, + "loss": 0.2176, + "step": 9009 + }, + { + "epoch": 0.7138047138047138, + "grad_norm": 1.4040275471421508, + "learning_rate": 3.996444151461417e-06, + "loss": 0.1524, + "step": 9010 + }, + { + "epoch": 0.7138839374133492, + "grad_norm": 1.6851042233139983, + "learning_rate": 3.994392208396754e-06, + "loss": 0.2111, + "step": 9011 + }, + { + "epoch": 0.7139631610219845, + "grad_norm": 1.3036237866237972, + "learning_rate": 3.992340660772472e-06, + "loss": 0.1322, + "step": 9012 + }, + { + "epoch": 0.71404238463062, + "grad_norm": 1.5896881335335635, + "learning_rate": 3.990289508723648e-06, + "loss": 0.2005, + "step": 9013 + }, + { + "epoch": 0.7141216082392553, + "grad_norm": 1.1160630027882283, + "learning_rate": 3.988238752385341e-06, + "loss": 0.0934, + "step": 9014 + }, + { + "epoch": 0.7142008318478906, + "grad_norm": 1.9948499790509049, + "learning_rate": 3.986188391892587e-06, + "loss": 0.3515, + "step": 9015 + }, + { + "epoch": 0.7142800554565261, + "grad_norm": 1.7234042832334302, + "learning_rate": 3.984138427380393e-06, + "loss": 0.2016, + "step": 9016 + }, + { + "epoch": 0.7143592790651614, + "grad_norm": 1.640750103653538, + "learning_rate": 3.982088858983733e-06, + "loss": 0.2004, + "step": 9017 + }, + { + "epoch": 0.7144385026737968, + "grad_norm": 1.6074449983964227, + "learning_rate": 3.9800396868375675e-06, + "loss": 0.2131, + "step": 9018 + }, + { + "epoch": 0.7145177262824322, + "grad_norm": 1.186496968633592, + "learning_rate": 3.977990911076823e-06, + "loss": 0.1657, + "step": 9019 + }, + { + "epoch": 0.7145969498910676, + "grad_norm": 1.5971498052558097, + "learning_rate": 3.975942531836397e-06, + "loss": 0.2391, + "step": 9020 + }, + { + "epoch": 0.7146761734997029, + "grad_norm": 2.2254404987212038, + "learning_rate": 3.973894549251175e-06, + "loss": 0.2264, + "step": 9021 + }, + { + "epoch": 0.7147553971083382, + "grad_norm": 1.3796774919227595, + "learning_rate": 3.971846963455999e-06, + "loss": 0.167, + "step": 9022 + }, + { + "epoch": 0.7148346207169737, + "grad_norm": 1.5307251298664517, + "learning_rate": 3.969799774585696e-06, + "loss": 0.1065, + "step": 9023 + }, + { + "epoch": 0.714913844325609, + "grad_norm": 1.6004309959764664, + "learning_rate": 3.967752982775058e-06, + "loss": 0.1918, + "step": 9024 + }, + { + "epoch": 0.7149930679342444, + "grad_norm": 1.865688504205946, + "learning_rate": 3.965706588158865e-06, + "loss": 0.2491, + "step": 9025 + }, + { + "epoch": 0.7150722915428798, + "grad_norm": 1.478656642029139, + "learning_rate": 3.963660590871858e-06, + "loss": 0.1681, + "step": 9026 + }, + { + "epoch": 0.7151515151515152, + "grad_norm": 1.4210059308888805, + "learning_rate": 3.961614991048752e-06, + "loss": 0.1682, + "step": 9027 + }, + { + "epoch": 0.7152307387601505, + "grad_norm": 1.6888684210855989, + "learning_rate": 3.959569788824248e-06, + "loss": 0.1686, + "step": 9028 + }, + { + "epoch": 0.7153099623687859, + "grad_norm": 1.477158490709553, + "learning_rate": 3.957524984333009e-06, + "loss": 0.125, + "step": 9029 + }, + { + "epoch": 0.7153891859774213, + "grad_norm": 1.71168499741163, + "learning_rate": 3.955480577709672e-06, + "loss": 0.1873, + "step": 9030 + }, + { + "epoch": 0.7154684095860566, + "grad_norm": 1.779959349211946, + "learning_rate": 3.953436569088856e-06, + "loss": 0.1739, + "step": 9031 + }, + { + "epoch": 0.7155476331946921, + "grad_norm": 1.3560254000767384, + "learning_rate": 3.951392958605149e-06, + "loss": 0.1474, + "step": 9032 + }, + { + "epoch": 0.7156268568033274, + "grad_norm": 1.7455533481947527, + "learning_rate": 3.949349746393108e-06, + "loss": 0.1647, + "step": 9033 + }, + { + "epoch": 0.7157060804119628, + "grad_norm": 1.4528074698538187, + "learning_rate": 3.947306932587277e-06, + "loss": 0.184, + "step": 9034 + }, + { + "epoch": 0.7157853040205981, + "grad_norm": 1.2692687219878889, + "learning_rate": 3.945264517322159e-06, + "loss": 0.1542, + "step": 9035 + }, + { + "epoch": 0.7158645276292335, + "grad_norm": 1.4551494766354298, + "learning_rate": 3.943222500732241e-06, + "loss": 0.1578, + "step": 9036 + }, + { + "epoch": 0.7159437512378689, + "grad_norm": 1.353822794206381, + "learning_rate": 3.941180882951972e-06, + "loss": 0.2059, + "step": 9037 + }, + { + "epoch": 0.7160229748465042, + "grad_norm": 1.5168834929808608, + "learning_rate": 3.9391396641157945e-06, + "loss": 0.1856, + "step": 9038 + }, + { + "epoch": 0.7161021984551397, + "grad_norm": 1.2626094497243283, + "learning_rate": 3.937098844358106e-06, + "loss": 0.092, + "step": 9039 + }, + { + "epoch": 0.716181422063775, + "grad_norm": 1.50048188467296, + "learning_rate": 3.935058423813282e-06, + "loss": 0.2321, + "step": 9040 + }, + { + "epoch": 0.7162606456724103, + "grad_norm": 1.4221612612287615, + "learning_rate": 3.933018402615683e-06, + "loss": 0.16, + "step": 9041 + }, + { + "epoch": 0.7163398692810458, + "grad_norm": 1.7036620893662642, + "learning_rate": 3.9309787808996284e-06, + "loss": 0.1595, + "step": 9042 + }, + { + "epoch": 0.7164190928896811, + "grad_norm": 1.2712702350305998, + "learning_rate": 3.928939558799415e-06, + "loss": 0.0866, + "step": 9043 + }, + { + "epoch": 0.7164983164983165, + "grad_norm": 1.4241542221006147, + "learning_rate": 3.926900736449324e-06, + "loss": 0.1625, + "step": 9044 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 1.2039491565505709, + "learning_rate": 3.924862313983597e-06, + "loss": 0.1172, + "step": 9045 + }, + { + "epoch": 0.7166567637155873, + "grad_norm": 1.483794922138058, + "learning_rate": 3.922824291536452e-06, + "loss": 0.1898, + "step": 9046 + }, + { + "epoch": 0.7167359873242226, + "grad_norm": 1.1218696089849776, + "learning_rate": 3.920786669242089e-06, + "loss": 0.0843, + "step": 9047 + }, + { + "epoch": 0.7168152109328579, + "grad_norm": 1.8784289017637512, + "learning_rate": 3.918749447234674e-06, + "loss": 0.2198, + "step": 9048 + }, + { + "epoch": 0.7168944345414934, + "grad_norm": 1.5897861681388985, + "learning_rate": 3.9167126256483415e-06, + "loss": 0.1635, + "step": 9049 + }, + { + "epoch": 0.7169736581501287, + "grad_norm": 1.8535840508931065, + "learning_rate": 3.914676204617216e-06, + "loss": 0.1936, + "step": 9050 + }, + { + "epoch": 0.7170528817587641, + "grad_norm": 1.458693675235977, + "learning_rate": 3.912640184275381e-06, + "loss": 0.1803, + "step": 9051 + }, + { + "epoch": 0.7171321053673995, + "grad_norm": 1.482156299897613, + "learning_rate": 3.9106045647569005e-06, + "loss": 0.203, + "step": 9052 + }, + { + "epoch": 0.7172113289760349, + "grad_norm": 1.5779143610523423, + "learning_rate": 3.908569346195804e-06, + "loss": 0.1602, + "step": 9053 + }, + { + "epoch": 0.7172905525846702, + "grad_norm": 1.6529259345291873, + "learning_rate": 3.90653452872611e-06, + "loss": 0.1605, + "step": 9054 + }, + { + "epoch": 0.7173697761933056, + "grad_norm": 2.181993139608414, + "learning_rate": 3.904500112481798e-06, + "loss": 0.1725, + "step": 9055 + }, + { + "epoch": 0.717448999801941, + "grad_norm": 1.8115248726302062, + "learning_rate": 3.902466097596821e-06, + "loss": 0.1752, + "step": 9056 + }, + { + "epoch": 0.7175282234105763, + "grad_norm": 1.6566757803948426, + "learning_rate": 3.900432484205115e-06, + "loss": 0.1612, + "step": 9057 + }, + { + "epoch": 0.7176074470192118, + "grad_norm": 1.6978465046881248, + "learning_rate": 3.89839927244058e-06, + "loss": 0.208, + "step": 9058 + }, + { + "epoch": 0.7176866706278471, + "grad_norm": 1.2188902030872955, + "learning_rate": 3.89636646243709e-06, + "loss": 0.1135, + "step": 9059 + }, + { + "epoch": 0.7177658942364825, + "grad_norm": 1.3496657440401796, + "learning_rate": 3.894334054328505e-06, + "loss": 0.1914, + "step": 9060 + }, + { + "epoch": 0.7178451178451178, + "grad_norm": 1.62484859302476, + "learning_rate": 3.892302048248642e-06, + "loss": 0.2044, + "step": 9061 + }, + { + "epoch": 0.7179243414537532, + "grad_norm": 1.598928420791679, + "learning_rate": 3.890270444331298e-06, + "loss": 0.2004, + "step": 9062 + }, + { + "epoch": 0.7180035650623886, + "grad_norm": 1.1838584888680506, + "learning_rate": 3.888239242710251e-06, + "loss": 0.1285, + "step": 9063 + }, + { + "epoch": 0.7180827886710239, + "grad_norm": 1.2338198525155133, + "learning_rate": 3.886208443519242e-06, + "loss": 0.1618, + "step": 9064 + }, + { + "epoch": 0.7181620122796594, + "grad_norm": 1.5850887432876848, + "learning_rate": 3.884178046891984e-06, + "loss": 0.1891, + "step": 9065 + }, + { + "epoch": 0.7182412358882947, + "grad_norm": 1.92804658804293, + "learning_rate": 3.88214805296218e-06, + "loss": 0.2041, + "step": 9066 + }, + { + "epoch": 0.7183204594969301, + "grad_norm": 1.8915883200648644, + "learning_rate": 3.880118461863488e-06, + "loss": 0.2235, + "step": 9067 + }, + { + "epoch": 0.7183996831055655, + "grad_norm": 1.3440238910894948, + "learning_rate": 3.878089273729549e-06, + "loss": 0.1272, + "step": 9068 + }, + { + "epoch": 0.7184789067142008, + "grad_norm": 1.6136231808937114, + "learning_rate": 3.876060488693971e-06, + "loss": 0.2128, + "step": 9069 + }, + { + "epoch": 0.7185581303228362, + "grad_norm": 1.5417514075177148, + "learning_rate": 3.874032106890347e-06, + "loss": 0.2321, + "step": 9070 + }, + { + "epoch": 0.7186373539314715, + "grad_norm": 1.4619880366560432, + "learning_rate": 3.872004128452231e-06, + "loss": 0.1119, + "step": 9071 + }, + { + "epoch": 0.718716577540107, + "grad_norm": 1.5810958087342992, + "learning_rate": 3.8699765535131565e-06, + "loss": 0.1859, + "step": 9072 + }, + { + "epoch": 0.7187958011487423, + "grad_norm": 1.4974380723545042, + "learning_rate": 3.867949382206632e-06, + "loss": 0.1673, + "step": 9073 + }, + { + "epoch": 0.7188750247573777, + "grad_norm": 1.3916266660052012, + "learning_rate": 3.8659226146661344e-06, + "loss": 0.119, + "step": 9074 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 1.6227143749682817, + "learning_rate": 3.8638962510251175e-06, + "loss": 0.2311, + "step": 9075 + }, + { + "epoch": 0.7190334719746484, + "grad_norm": 1.7051339049259857, + "learning_rate": 3.861870291417008e-06, + "loss": 0.2257, + "step": 9076 + }, + { + "epoch": 0.7191126955832838, + "grad_norm": 1.5799577133004064, + "learning_rate": 3.859844735975205e-06, + "loss": 0.15, + "step": 9077 + }, + { + "epoch": 0.7191919191919192, + "grad_norm": 1.3884567522513096, + "learning_rate": 3.857819584833078e-06, + "loss": 0.1425, + "step": 9078 + }, + { + "epoch": 0.7192711428005546, + "grad_norm": 1.858541351918183, + "learning_rate": 3.855794838123981e-06, + "loss": 0.1674, + "step": 9079 + }, + { + "epoch": 0.7193503664091899, + "grad_norm": 1.4407869231448682, + "learning_rate": 3.85377049598123e-06, + "loss": 0.1161, + "step": 9080 + }, + { + "epoch": 0.7194295900178254, + "grad_norm": 1.2113437592070886, + "learning_rate": 3.851746558538113e-06, + "loss": 0.1173, + "step": 9081 + }, + { + "epoch": 0.7195088136264607, + "grad_norm": 1.6734177952858873, + "learning_rate": 3.849723025927907e-06, + "loss": 0.1468, + "step": 9082 + }, + { + "epoch": 0.719588037235096, + "grad_norm": 1.6853728356199706, + "learning_rate": 3.847699898283846e-06, + "loss": 0.2097, + "step": 9083 + }, + { + "epoch": 0.7196672608437314, + "grad_norm": 2.087847716941042, + "learning_rate": 3.84567717573914e-06, + "loss": 0.2189, + "step": 9084 + }, + { + "epoch": 0.7197464844523668, + "grad_norm": 1.6563505204227469, + "learning_rate": 3.843654858426981e-06, + "loss": 0.1887, + "step": 9085 + }, + { + "epoch": 0.7198257080610022, + "grad_norm": 1.550143621536969, + "learning_rate": 3.84163294648053e-06, + "loss": 0.1989, + "step": 9086 + }, + { + "epoch": 0.7199049316696375, + "grad_norm": 1.9169435307826304, + "learning_rate": 3.839611440032912e-06, + "loss": 0.2223, + "step": 9087 + }, + { + "epoch": 0.719984155278273, + "grad_norm": 1.9047305832146382, + "learning_rate": 3.837590339217243e-06, + "loss": 0.2023, + "step": 9088 + }, + { + "epoch": 0.7200633788869083, + "grad_norm": 2.5056144609398894, + "learning_rate": 3.835569644166599e-06, + "loss": 0.1646, + "step": 9089 + }, + { + "epoch": 0.7201426024955436, + "grad_norm": 1.5446784623354364, + "learning_rate": 3.833549355014028e-06, + "loss": 0.1638, + "step": 9090 + }, + { + "epoch": 0.7202218261041791, + "grad_norm": 1.7494204197502379, + "learning_rate": 3.8315294718925656e-06, + "loss": 0.1592, + "step": 9091 + }, + { + "epoch": 0.7203010497128144, + "grad_norm": 1.725774636577849, + "learning_rate": 3.829509994935206e-06, + "loss": 0.1985, + "step": 9092 + }, + { + "epoch": 0.7203802733214498, + "grad_norm": 1.3667122392620439, + "learning_rate": 3.827490924274922e-06, + "loss": 0.1727, + "step": 9093 + }, + { + "epoch": 0.7204594969300852, + "grad_norm": 1.3341205872074957, + "learning_rate": 3.825472260044658e-06, + "loss": 0.1124, + "step": 9094 + }, + { + "epoch": 0.7205387205387206, + "grad_norm": 1.5994718599360567, + "learning_rate": 3.8234540023773385e-06, + "loss": 0.1785, + "step": 9095 + }, + { + "epoch": 0.7206179441473559, + "grad_norm": 2.1104002211602193, + "learning_rate": 3.821436151405854e-06, + "loss": 0.1475, + "step": 9096 + }, + { + "epoch": 0.7206971677559912, + "grad_norm": 2.044061381966268, + "learning_rate": 3.819418707263065e-06, + "loss": 0.2787, + "step": 9097 + }, + { + "epoch": 0.7207763913646267, + "grad_norm": 1.3567126741090718, + "learning_rate": 3.8174016700818196e-06, + "loss": 0.1546, + "step": 9098 + }, + { + "epoch": 0.720855614973262, + "grad_norm": 1.623895295465704, + "learning_rate": 3.815385039994925e-06, + "loss": 0.1826, + "step": 9099 + }, + { + "epoch": 0.7209348385818974, + "grad_norm": 1.4418063123348093, + "learning_rate": 3.8133688171351645e-06, + "loss": 0.1761, + "step": 9100 + }, + { + "epoch": 0.7210140621905328, + "grad_norm": 1.3342184119627303, + "learning_rate": 3.811353001635302e-06, + "loss": 0.1474, + "step": 9101 + }, + { + "epoch": 0.7210932857991682, + "grad_norm": 1.2133827757075395, + "learning_rate": 3.8093375936280665e-06, + "loss": 0.1307, + "step": 9102 + }, + { + "epoch": 0.7211725094078035, + "grad_norm": 1.8670532793034826, + "learning_rate": 3.807322593246159e-06, + "loss": 0.2793, + "step": 9103 + }, + { + "epoch": 0.7212517330164389, + "grad_norm": 1.664944759625284, + "learning_rate": 3.805308000622265e-06, + "loss": 0.1602, + "step": 9104 + }, + { + "epoch": 0.7213309566250743, + "grad_norm": 1.6328438505490472, + "learning_rate": 3.8032938158890333e-06, + "loss": 0.2175, + "step": 9105 + }, + { + "epoch": 0.7214101802337096, + "grad_norm": 1.3106383845715255, + "learning_rate": 3.8012800391790814e-06, + "loss": 0.1164, + "step": 9106 + }, + { + "epoch": 0.7214894038423451, + "grad_norm": 1.4425097714308814, + "learning_rate": 3.799266670625018e-06, + "loss": 0.134, + "step": 9107 + }, + { + "epoch": 0.7215686274509804, + "grad_norm": 1.3673002516142871, + "learning_rate": 3.797253710359409e-06, + "loss": 0.1737, + "step": 9108 + }, + { + "epoch": 0.7216478510596158, + "grad_norm": 2.264027446084552, + "learning_rate": 3.7952411585147954e-06, + "loss": 0.2603, + "step": 9109 + }, + { + "epoch": 0.7217270746682511, + "grad_norm": 1.7800965293009932, + "learning_rate": 3.793229015223694e-06, + "loss": 0.2259, + "step": 9110 + }, + { + "epoch": 0.7218062982768865, + "grad_norm": 1.2537544880547102, + "learning_rate": 3.7912172806186e-06, + "loss": 0.1499, + "step": 9111 + }, + { + "epoch": 0.7218855218855219, + "grad_norm": 1.0522742003007164, + "learning_rate": 3.7892059548319726e-06, + "loss": 0.1023, + "step": 9112 + }, + { + "epoch": 0.7219647454941572, + "grad_norm": 1.752092008053303, + "learning_rate": 3.7871950379962463e-06, + "loss": 0.1653, + "step": 9113 + }, + { + "epoch": 0.7220439691027927, + "grad_norm": 1.7260211959752805, + "learning_rate": 3.785184530243835e-06, + "loss": 0.2495, + "step": 9114 + }, + { + "epoch": 0.722123192711428, + "grad_norm": 1.8364188992883017, + "learning_rate": 3.7831744317071194e-06, + "loss": 0.236, + "step": 9115 + }, + { + "epoch": 0.7222024163200634, + "grad_norm": 1.5612846767106707, + "learning_rate": 3.7811647425184508e-06, + "loss": 0.2291, + "step": 9116 + }, + { + "epoch": 0.7222816399286988, + "grad_norm": 1.3421249378434938, + "learning_rate": 3.7791554628101635e-06, + "loss": 0.1849, + "step": 9117 + }, + { + "epoch": 0.7223608635373341, + "grad_norm": 1.511705990419425, + "learning_rate": 3.777146592714557e-06, + "loss": 0.1425, + "step": 9118 + }, + { + "epoch": 0.7224400871459695, + "grad_norm": 1.5589922664458769, + "learning_rate": 3.7751381323639e-06, + "loss": 0.1957, + "step": 9119 + }, + { + "epoch": 0.7225193107546048, + "grad_norm": 1.4560120098600908, + "learning_rate": 3.7731300818904494e-06, + "loss": 0.1872, + "step": 9120 + }, + { + "epoch": 0.7225985343632403, + "grad_norm": 1.2815084024791408, + "learning_rate": 3.7711224414264216e-06, + "loss": 0.1342, + "step": 9121 + }, + { + "epoch": 0.7226777579718756, + "grad_norm": 1.4272790731694156, + "learning_rate": 3.7691152111040087e-06, + "loss": 0.232, + "step": 9122 + }, + { + "epoch": 0.7227569815805109, + "grad_norm": 1.4058234830114376, + "learning_rate": 3.767108391055374e-06, + "loss": 0.1705, + "step": 9123 + }, + { + "epoch": 0.7228362051891464, + "grad_norm": 1.3415809643670769, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.1456, + "step": 9124 + }, + { + "epoch": 0.7229154287977817, + "grad_norm": 1.5248380112282414, + "learning_rate": 3.7630959823079914e-06, + "loss": 0.1814, + "step": 9125 + }, + { + "epoch": 0.7229946524064171, + "grad_norm": 1.0422113591476505, + "learning_rate": 3.761090393873432e-06, + "loss": 0.0957, + "step": 9126 + }, + { + "epoch": 0.7230738760150525, + "grad_norm": 1.3098492919112241, + "learning_rate": 3.7590852162410553e-06, + "loss": 0.1041, + "step": 9127 + }, + { + "epoch": 0.7231530996236879, + "grad_norm": 1.56969249461988, + "learning_rate": 3.757080449542887e-06, + "loss": 0.2144, + "step": 9128 + }, + { + "epoch": 0.7232323232323232, + "grad_norm": 2.1130355348476346, + "learning_rate": 3.7550760939109287e-06, + "loss": 0.2178, + "step": 9129 + }, + { + "epoch": 0.7233115468409586, + "grad_norm": 1.4914250784479086, + "learning_rate": 3.7530721494771648e-06, + "loss": 0.1873, + "step": 9130 + }, + { + "epoch": 0.723390770449594, + "grad_norm": 1.5065213659543368, + "learning_rate": 3.751068616373541e-06, + "loss": 0.1754, + "step": 9131 + }, + { + "epoch": 0.7234699940582293, + "grad_norm": 1.4524169435189498, + "learning_rate": 3.749065494731978e-06, + "loss": 0.1318, + "step": 9132 + }, + { + "epoch": 0.7235492176668648, + "grad_norm": 1.5319927189304092, + "learning_rate": 3.747062784684378e-06, + "loss": 0.1856, + "step": 9133 + }, + { + "epoch": 0.7236284412755001, + "grad_norm": 1.5811166745348826, + "learning_rate": 3.7450604863626063e-06, + "loss": 0.1493, + "step": 9134 + }, + { + "epoch": 0.7237076648841355, + "grad_norm": 1.954204786441343, + "learning_rate": 3.7430585998985004e-06, + "loss": 0.2334, + "step": 9135 + }, + { + "epoch": 0.7237868884927708, + "grad_norm": 1.3651315244576763, + "learning_rate": 3.7410571254238835e-06, + "loss": 0.1827, + "step": 9136 + }, + { + "epoch": 0.7238661121014062, + "grad_norm": 1.498571782129823, + "learning_rate": 3.7390560630705387e-06, + "loss": 0.1772, + "step": 9137 + }, + { + "epoch": 0.7239453357100416, + "grad_norm": 1.1379557011995205, + "learning_rate": 3.7370554129702265e-06, + "loss": 0.1121, + "step": 9138 + }, + { + "epoch": 0.7240245593186769, + "grad_norm": 1.6552899317659178, + "learning_rate": 3.735055175254676e-06, + "loss": 0.1971, + "step": 9139 + }, + { + "epoch": 0.7241037829273124, + "grad_norm": 1.3128254971816442, + "learning_rate": 3.733055350055601e-06, + "loss": 0.144, + "step": 9140 + }, + { + "epoch": 0.7241830065359477, + "grad_norm": 1.3069125179203875, + "learning_rate": 3.7310559375046774e-06, + "loss": 0.1707, + "step": 9141 + }, + { + "epoch": 0.7242622301445831, + "grad_norm": 1.1198542872534016, + "learning_rate": 3.7290569377335517e-06, + "loss": 0.1011, + "step": 9142 + }, + { + "epoch": 0.7243414537532185, + "grad_norm": 1.4731175458061112, + "learning_rate": 3.7270583508738565e-06, + "loss": 0.1449, + "step": 9143 + }, + { + "epoch": 0.7244206773618538, + "grad_norm": 1.672754668615413, + "learning_rate": 3.725060177057185e-06, + "loss": 0.2707, + "step": 9144 + }, + { + "epoch": 0.7244999009704892, + "grad_norm": 1.6761777278488457, + "learning_rate": 3.723062416415105e-06, + "loss": 0.1937, + "step": 9145 + }, + { + "epoch": 0.7245791245791245, + "grad_norm": 1.300352270630626, + "learning_rate": 3.721065069079165e-06, + "loss": 0.1403, + "step": 9146 + }, + { + "epoch": 0.72465834818776, + "grad_norm": 1.537838132912087, + "learning_rate": 3.7190681351808778e-06, + "loss": 0.166, + "step": 9147 + }, + { + "epoch": 0.7247375717963953, + "grad_norm": 1.9220091644531176, + "learning_rate": 3.7170716148517294e-06, + "loss": 0.1937, + "step": 9148 + }, + { + "epoch": 0.7248167954050307, + "grad_norm": 1.2515736621489992, + "learning_rate": 3.715075508223187e-06, + "loss": 0.1435, + "step": 9149 + }, + { + "epoch": 0.7248960190136661, + "grad_norm": 1.3327992702954297, + "learning_rate": 3.71307981542668e-06, + "loss": 0.1823, + "step": 9150 + }, + { + "epoch": 0.7249752426223014, + "grad_norm": 1.4698977263415958, + "learning_rate": 3.7110845365936144e-06, + "loss": 0.1367, + "step": 9151 + }, + { + "epoch": 0.7250544662309368, + "grad_norm": 1.528357635526201, + "learning_rate": 3.709089671855378e-06, + "loss": 0.1785, + "step": 9152 + }, + { + "epoch": 0.7251336898395722, + "grad_norm": 1.3134327684528684, + "learning_rate": 3.707095221343313e-06, + "loss": 0.1364, + "step": 9153 + }, + { + "epoch": 0.7252129134482076, + "grad_norm": 1.378788395914772, + "learning_rate": 3.7051011851887455e-06, + "loss": 0.1766, + "step": 9154 + }, + { + "epoch": 0.7252921370568429, + "grad_norm": 1.6670125378340088, + "learning_rate": 3.7031075635229787e-06, + "loss": 0.2311, + "step": 9155 + }, + { + "epoch": 0.7253713606654784, + "grad_norm": 1.488651109597415, + "learning_rate": 3.70111435647728e-06, + "loss": 0.1319, + "step": 9156 + }, + { + "epoch": 0.7254505842741137, + "grad_norm": 1.3467921873259778, + "learning_rate": 3.6991215641828903e-06, + "loss": 0.1528, + "step": 9157 + }, + { + "epoch": 0.725529807882749, + "grad_norm": 2.3256477098154433, + "learning_rate": 3.6971291867710303e-06, + "loss": 0.3004, + "step": 9158 + }, + { + "epoch": 0.7256090314913844, + "grad_norm": 1.1369956053814774, + "learning_rate": 3.6951372243728854e-06, + "loss": 0.1017, + "step": 9159 + }, + { + "epoch": 0.7256882551000198, + "grad_norm": 1.562159969793376, + "learning_rate": 3.693145677119615e-06, + "loss": 0.2144, + "step": 9160 + }, + { + "epoch": 0.7257674787086552, + "grad_norm": 1.9340070299399497, + "learning_rate": 3.691154545142357e-06, + "loss": 0.2413, + "step": 9161 + }, + { + "epoch": 0.7258467023172905, + "grad_norm": 1.9384254252035054, + "learning_rate": 3.6891638285722176e-06, + "loss": 0.1882, + "step": 9162 + }, + { + "epoch": 0.725925925925926, + "grad_norm": 1.8001180384584912, + "learning_rate": 3.687173527540273e-06, + "loss": 0.2041, + "step": 9163 + }, + { + "epoch": 0.7260051495345613, + "grad_norm": 1.9409758761693925, + "learning_rate": 3.6851836421775733e-06, + "loss": 0.2015, + "step": 9164 + }, + { + "epoch": 0.7260843731431966, + "grad_norm": 1.4502710972461779, + "learning_rate": 3.683194172615149e-06, + "loss": 0.1716, + "step": 9165 + }, + { + "epoch": 0.7261635967518321, + "grad_norm": 1.4684737044771066, + "learning_rate": 3.681205118983995e-06, + "loss": 0.1832, + "step": 9166 + }, + { + "epoch": 0.7262428203604674, + "grad_norm": 1.3543165687679024, + "learning_rate": 3.6792164814150756e-06, + "loss": 0.1187, + "step": 9167 + }, + { + "epoch": 0.7263220439691028, + "grad_norm": 1.3247265132719996, + "learning_rate": 3.6772282600393393e-06, + "loss": 0.1292, + "step": 9168 + }, + { + "epoch": 0.7264012675777382, + "grad_norm": 1.8899551304933995, + "learning_rate": 3.675240454987701e-06, + "loss": 0.1392, + "step": 9169 + }, + { + "epoch": 0.7264804911863736, + "grad_norm": 1.8459340470535968, + "learning_rate": 3.6732530663910415e-06, + "loss": 0.1454, + "step": 9170 + }, + { + "epoch": 0.7265597147950089, + "grad_norm": 1.5808212763076916, + "learning_rate": 3.6712660943802292e-06, + "loss": 0.1177, + "step": 9171 + }, + { + "epoch": 0.7266389384036442, + "grad_norm": 1.8172895966787481, + "learning_rate": 3.6692795390860913e-06, + "loss": 0.1691, + "step": 9172 + }, + { + "epoch": 0.7267181620122797, + "grad_norm": 1.6153578932395543, + "learning_rate": 3.667293400639432e-06, + "loss": 0.1876, + "step": 9173 + }, + { + "epoch": 0.726797385620915, + "grad_norm": 1.3174571376265898, + "learning_rate": 3.665307679171034e-06, + "loss": 0.1448, + "step": 9174 + }, + { + "epoch": 0.7268766092295504, + "grad_norm": 1.6078091448188845, + "learning_rate": 3.6633223748116454e-06, + "loss": 0.1641, + "step": 9175 + }, + { + "epoch": 0.7269558328381858, + "grad_norm": 1.781059429319135, + "learning_rate": 3.661337487691985e-06, + "loss": 0.2085, + "step": 9176 + }, + { + "epoch": 0.7270350564468212, + "grad_norm": 1.5872870806572175, + "learning_rate": 3.659353017942754e-06, + "loss": 0.26, + "step": 9177 + }, + { + "epoch": 0.7271142800554565, + "grad_norm": 1.4096828592033954, + "learning_rate": 3.6573689656946177e-06, + "loss": 0.1607, + "step": 9178 + }, + { + "epoch": 0.7271935036640919, + "grad_norm": 1.3417721067761144, + "learning_rate": 3.655385331078217e-06, + "loss": 0.1115, + "step": 9179 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.871430729197127, + "learning_rate": 3.6534021142241595e-06, + "loss": 0.1815, + "step": 9180 + }, + { + "epoch": 0.7273519508813626, + "grad_norm": 2.07880282717452, + "learning_rate": 3.6514193152630382e-06, + "loss": 0.2215, + "step": 9181 + }, + { + "epoch": 0.7274311744899981, + "grad_norm": 1.4841112804861578, + "learning_rate": 3.649436934325409e-06, + "loss": 0.1639, + "step": 9182 + }, + { + "epoch": 0.7275103980986334, + "grad_norm": 1.417370074331635, + "learning_rate": 3.647454971541796e-06, + "loss": 0.1585, + "step": 9183 + }, + { + "epoch": 0.7275896217072688, + "grad_norm": 1.0574246149159716, + "learning_rate": 3.6454734270427107e-06, + "loss": 0.0789, + "step": 9184 + }, + { + "epoch": 0.7276688453159041, + "grad_norm": 1.806201252304038, + "learning_rate": 3.6434923009586244e-06, + "loss": 0.2075, + "step": 9185 + }, + { + "epoch": 0.7277480689245395, + "grad_norm": 1.3665577693885003, + "learning_rate": 3.6415115934199795e-06, + "loss": 0.143, + "step": 9186 + }, + { + "epoch": 0.7278272925331749, + "grad_norm": 1.658120082405557, + "learning_rate": 3.6395313045572055e-06, + "loss": 0.2047, + "step": 9187 + }, + { + "epoch": 0.7279065161418102, + "grad_norm": 1.3744935377552778, + "learning_rate": 3.6375514345006913e-06, + "loss": 0.1451, + "step": 9188 + }, + { + "epoch": 0.7279857397504457, + "grad_norm": 1.425797368850368, + "learning_rate": 3.635571983380797e-06, + "loss": 0.1265, + "step": 9189 + }, + { + "epoch": 0.728064963359081, + "grad_norm": 1.8360229975625335, + "learning_rate": 3.6335929513278667e-06, + "loss": 0.2016, + "step": 9190 + }, + { + "epoch": 0.7281441869677164, + "grad_norm": 1.2554453497590992, + "learning_rate": 3.631614338472208e-06, + "loss": 0.1475, + "step": 9191 + }, + { + "epoch": 0.7282234105763518, + "grad_norm": 1.2209398196114771, + "learning_rate": 3.6296361449440985e-06, + "loss": 0.1611, + "step": 9192 + }, + { + "epoch": 0.7283026341849871, + "grad_norm": 1.380453108321186, + "learning_rate": 3.6276583708738013e-06, + "loss": 0.1574, + "step": 9193 + }, + { + "epoch": 0.7283818577936225, + "grad_norm": 1.8224506098765345, + "learning_rate": 3.6256810163915368e-06, + "loss": 0.1603, + "step": 9194 + }, + { + "epoch": 0.7284610814022578, + "grad_norm": 1.5291065648693936, + "learning_rate": 3.623704081627507e-06, + "loss": 0.2159, + "step": 9195 + }, + { + "epoch": 0.7285403050108933, + "grad_norm": 1.2218064221955427, + "learning_rate": 3.62172756671188e-06, + "loss": 0.1109, + "step": 9196 + }, + { + "epoch": 0.7286195286195286, + "grad_norm": 1.531883415160384, + "learning_rate": 3.619751471774805e-06, + "loss": 0.1907, + "step": 9197 + }, + { + "epoch": 0.728698752228164, + "grad_norm": 1.350046992375423, + "learning_rate": 3.6177757969463956e-06, + "loss": 0.1451, + "step": 9198 + }, + { + "epoch": 0.7287779758367994, + "grad_norm": 1.5308613939871516, + "learning_rate": 3.615800542356738e-06, + "loss": 0.1539, + "step": 9199 + }, + { + "epoch": 0.7288571994454347, + "grad_norm": 1.2910480544851555, + "learning_rate": 3.6138257081358985e-06, + "loss": 0.1513, + "step": 9200 + }, + { + "epoch": 0.7289364230540701, + "grad_norm": 1.5349775397024419, + "learning_rate": 3.6118512944139084e-06, + "loss": 0.2038, + "step": 9201 + }, + { + "epoch": 0.7290156466627055, + "grad_norm": 1.7447375925960995, + "learning_rate": 3.609877301320769e-06, + "loss": 0.2418, + "step": 9202 + }, + { + "epoch": 0.7290948702713409, + "grad_norm": 1.7899060858289482, + "learning_rate": 3.607903728986465e-06, + "loss": 0.2047, + "step": 9203 + }, + { + "epoch": 0.7291740938799762, + "grad_norm": 1.6093271768899993, + "learning_rate": 3.6059305775409435e-06, + "loss": 0.2542, + "step": 9204 + }, + { + "epoch": 0.7292533174886116, + "grad_norm": 2.6406290776511367, + "learning_rate": 3.6039578471141244e-06, + "loss": 0.1979, + "step": 9205 + }, + { + "epoch": 0.729332541097247, + "grad_norm": 1.639071862333085, + "learning_rate": 3.6019855378359092e-06, + "loss": 0.1922, + "step": 9206 + }, + { + "epoch": 0.7294117647058823, + "grad_norm": 1.1598429744597092, + "learning_rate": 3.6000136498361605e-06, + "loss": 0.0941, + "step": 9207 + }, + { + "epoch": 0.7294909883145178, + "grad_norm": 0.7114323589978904, + "learning_rate": 3.5980421832447188e-06, + "loss": 0.0859, + "step": 9208 + }, + { + "epoch": 0.7295702119231531, + "grad_norm": 1.4064092688666743, + "learning_rate": 3.5960711381913904e-06, + "loss": 0.1573, + "step": 9209 + }, + { + "epoch": 0.7296494355317885, + "grad_norm": 1.3669230885100645, + "learning_rate": 3.5941005148059684e-06, + "loss": 0.2027, + "step": 9210 + }, + { + "epoch": 0.7297286591404238, + "grad_norm": 1.4249676741684758, + "learning_rate": 3.5921303132182038e-06, + "loss": 0.1921, + "step": 9211 + }, + { + "epoch": 0.7298078827490592, + "grad_norm": 1.5814114857052903, + "learning_rate": 3.5901605335578214e-06, + "loss": 0.2168, + "step": 9212 + }, + { + "epoch": 0.7298871063576946, + "grad_norm": 1.6356478831002252, + "learning_rate": 3.5881911759545296e-06, + "loss": 0.2195, + "step": 9213 + }, + { + "epoch": 0.7299663299663299, + "grad_norm": 1.3394345448527973, + "learning_rate": 3.5862222405379975e-06, + "loss": 0.1075, + "step": 9214 + }, + { + "epoch": 0.7300455535749654, + "grad_norm": 1.739069549305948, + "learning_rate": 3.584253727437866e-06, + "loss": 0.1567, + "step": 9215 + }, + { + "epoch": 0.7301247771836007, + "grad_norm": 1.5177721771550932, + "learning_rate": 3.5822856367837587e-06, + "loss": 0.1675, + "step": 9216 + }, + { + "epoch": 0.7302040007922361, + "grad_norm": 1.4747053893799875, + "learning_rate": 3.5803179687052636e-06, + "loss": 0.1502, + "step": 9217 + }, + { + "epoch": 0.7302832244008715, + "grad_norm": 1.4593506848127025, + "learning_rate": 3.578350723331937e-06, + "loss": 0.1312, + "step": 9218 + }, + { + "epoch": 0.7303624480095068, + "grad_norm": 1.7632007157436047, + "learning_rate": 3.5763839007933186e-06, + "loss": 0.2735, + "step": 9219 + }, + { + "epoch": 0.7304416716181422, + "grad_norm": 1.502594467754403, + "learning_rate": 3.574417501218913e-06, + "loss": 0.1413, + "step": 9220 + }, + { + "epoch": 0.7305208952267775, + "grad_norm": 1.5803799208773885, + "learning_rate": 3.572451524738193e-06, + "loss": 0.2593, + "step": 9221 + }, + { + "epoch": 0.730600118835413, + "grad_norm": 1.3277431484787081, + "learning_rate": 3.5704859714806162e-06, + "loss": 0.1601, + "step": 9222 + }, + { + "epoch": 0.7306793424440483, + "grad_norm": 1.3712244420537074, + "learning_rate": 3.568520841575601e-06, + "loss": 0.147, + "step": 9223 + }, + { + "epoch": 0.7307585660526837, + "grad_norm": 1.682802466160093, + "learning_rate": 3.5665561351525423e-06, + "loss": 0.1993, + "step": 9224 + }, + { + "epoch": 0.7308377896613191, + "grad_norm": 1.5173816752061855, + "learning_rate": 3.564591852340803e-06, + "loss": 0.1526, + "step": 9225 + }, + { + "epoch": 0.7309170132699544, + "grad_norm": 1.503958862699897, + "learning_rate": 3.562627993269728e-06, + "loss": 0.1438, + "step": 9226 + }, + { + "epoch": 0.7309962368785898, + "grad_norm": 1.929073476943581, + "learning_rate": 3.5606645580686262e-06, + "loss": 0.2218, + "step": 9227 + }, + { + "epoch": 0.7310754604872252, + "grad_norm": 1.434326311432558, + "learning_rate": 3.558701546866775e-06, + "loss": 0.1942, + "step": 9228 + }, + { + "epoch": 0.7311546840958606, + "grad_norm": 1.5581366239017558, + "learning_rate": 3.5567389597934367e-06, + "loss": 0.1801, + "step": 9229 + }, + { + "epoch": 0.7312339077044959, + "grad_norm": 1.5939197873046507, + "learning_rate": 3.5547767969778355e-06, + "loss": 0.1234, + "step": 9230 + }, + { + "epoch": 0.7313131313131314, + "grad_norm": 1.8396690125298694, + "learning_rate": 3.5528150585491695e-06, + "loss": 0.2584, + "step": 9231 + }, + { + "epoch": 0.7313923549217667, + "grad_norm": 1.6265254160412324, + "learning_rate": 3.5508537446366097e-06, + "loss": 0.197, + "step": 9232 + }, + { + "epoch": 0.731471578530402, + "grad_norm": 1.6252653754846322, + "learning_rate": 3.548892855369299e-06, + "loss": 0.1918, + "step": 9233 + }, + { + "epoch": 0.7315508021390374, + "grad_norm": 1.3207003598644642, + "learning_rate": 3.5469323908763507e-06, + "loss": 0.1296, + "step": 9234 + }, + { + "epoch": 0.7316300257476728, + "grad_norm": 1.5902352053182993, + "learning_rate": 3.544972351286857e-06, + "loss": 0.1903, + "step": 9235 + }, + { + "epoch": 0.7317092493563082, + "grad_norm": 1.4170522780616377, + "learning_rate": 3.543012736729875e-06, + "loss": 0.1426, + "step": 9236 + }, + { + "epoch": 0.7317884729649435, + "grad_norm": 1.6738582315907702, + "learning_rate": 3.541053547334431e-06, + "loss": 0.1555, + "step": 9237 + }, + { + "epoch": 0.731867696573579, + "grad_norm": 2.017358471201567, + "learning_rate": 3.5390947832295366e-06, + "loss": 0.1397, + "step": 9238 + }, + { + "epoch": 0.7319469201822143, + "grad_norm": 1.8306103500607795, + "learning_rate": 3.5371364445441624e-06, + "loss": 0.209, + "step": 9239 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 1.2752453648619118, + "learning_rate": 3.535178531407253e-06, + "loss": 0.1937, + "step": 9240 + }, + { + "epoch": 0.7321053673994851, + "grad_norm": 1.6076145454420305, + "learning_rate": 3.5332210439477334e-06, + "loss": 0.1656, + "step": 9241 + }, + { + "epoch": 0.7321845910081204, + "grad_norm": 1.494379230965058, + "learning_rate": 3.5312639822944917e-06, + "loss": 0.1089, + "step": 9242 + }, + { + "epoch": 0.7322638146167558, + "grad_norm": 1.478836369687022, + "learning_rate": 3.529307346576388e-06, + "loss": 0.2161, + "step": 9243 + }, + { + "epoch": 0.7323430382253912, + "grad_norm": 1.6050499741314956, + "learning_rate": 3.527351136922265e-06, + "loss": 0.1685, + "step": 9244 + }, + { + "epoch": 0.7324222618340266, + "grad_norm": 1.722522583946821, + "learning_rate": 3.525395353460924e-06, + "loss": 0.1525, + "step": 9245 + }, + { + "epoch": 0.7325014854426619, + "grad_norm": 1.3641376699368235, + "learning_rate": 3.5234399963211418e-06, + "loss": 0.1404, + "step": 9246 + }, + { + "epoch": 0.7325807090512972, + "grad_norm": 1.4039749195769666, + "learning_rate": 3.521485065631677e-06, + "loss": 0.1449, + "step": 9247 + }, + { + "epoch": 0.7326599326599327, + "grad_norm": 1.6744201148765419, + "learning_rate": 3.5195305615212473e-06, + "loss": 0.1677, + "step": 9248 + }, + { + "epoch": 0.732739156268568, + "grad_norm": 1.584820045873605, + "learning_rate": 3.517576484118549e-06, + "loss": 0.1324, + "step": 9249 + }, + { + "epoch": 0.7328183798772034, + "grad_norm": 1.5973881571667095, + "learning_rate": 3.5156228335522434e-06, + "loss": 0.1633, + "step": 9250 + }, + { + "epoch": 0.7328976034858388, + "grad_norm": 1.628524444441464, + "learning_rate": 3.513669609950977e-06, + "loss": 0.1978, + "step": 9251 + }, + { + "epoch": 0.7329768270944742, + "grad_norm": 1.2756077508679642, + "learning_rate": 3.5117168134433566e-06, + "loss": 0.157, + "step": 9252 + }, + { + "epoch": 0.7330560507031095, + "grad_norm": 1.8602242017760275, + "learning_rate": 3.5097644441579602e-06, + "loss": 0.2217, + "step": 9253 + }, + { + "epoch": 0.7331352743117449, + "grad_norm": 1.8319572116481992, + "learning_rate": 3.507812502223351e-06, + "loss": 0.198, + "step": 9254 + }, + { + "epoch": 0.7332144979203803, + "grad_norm": 1.4028066469281366, + "learning_rate": 3.5058609877680495e-06, + "loss": 0.1739, + "step": 9255 + }, + { + "epoch": 0.7332937215290156, + "grad_norm": 1.3785013343154175, + "learning_rate": 3.5039099009205503e-06, + "loss": 0.1412, + "step": 9256 + }, + { + "epoch": 0.7333729451376511, + "grad_norm": 1.1510380172008035, + "learning_rate": 3.5019592418093306e-06, + "loss": 0.1381, + "step": 9257 + }, + { + "epoch": 0.7334521687462864, + "grad_norm": 1.7562008977207655, + "learning_rate": 3.5000090105628282e-06, + "loss": 0.1684, + "step": 9258 + }, + { + "epoch": 0.7335313923549218, + "grad_norm": 1.8937344156720113, + "learning_rate": 3.4980592073094533e-06, + "loss": 0.2403, + "step": 9259 + }, + { + "epoch": 0.7336106159635571, + "grad_norm": 1.4054696221329706, + "learning_rate": 3.4961098321775978e-06, + "loss": 0.1423, + "step": 9260 + }, + { + "epoch": 0.7336898395721925, + "grad_norm": 1.3761612147329114, + "learning_rate": 3.4941608852956143e-06, + "loss": 0.1328, + "step": 9261 + }, + { + "epoch": 0.7337690631808279, + "grad_norm": 1.701598253677154, + "learning_rate": 3.4922123667918305e-06, + "loss": 0.1821, + "step": 9262 + }, + { + "epoch": 0.7338482867894632, + "grad_norm": 1.8066939691741852, + "learning_rate": 3.4902642767945506e-06, + "loss": 0.2017, + "step": 9263 + }, + { + "epoch": 0.7339275103980987, + "grad_norm": 1.3923068824241427, + "learning_rate": 3.488316615432047e-06, + "loss": 0.151, + "step": 9264 + }, + { + "epoch": 0.734006734006734, + "grad_norm": 1.731344961583298, + "learning_rate": 3.486369382832561e-06, + "loss": 0.188, + "step": 9265 + }, + { + "epoch": 0.7340859576153694, + "grad_norm": 1.9471917214352137, + "learning_rate": 3.484422579124306e-06, + "loss": 0.2709, + "step": 9266 + }, + { + "epoch": 0.7341651812240048, + "grad_norm": 1.4567886137717176, + "learning_rate": 3.4824762044354763e-06, + "loss": 0.1739, + "step": 9267 + }, + { + "epoch": 0.7342444048326401, + "grad_norm": 1.9122218009324379, + "learning_rate": 3.480530258894229e-06, + "loss": 0.2177, + "step": 9268 + }, + { + "epoch": 0.7343236284412755, + "grad_norm": 1.308680544729355, + "learning_rate": 3.478584742628691e-06, + "loss": 0.1649, + "step": 9269 + }, + { + "epoch": 0.7344028520499108, + "grad_norm": 1.5787516637827443, + "learning_rate": 3.4766396557669712e-06, + "loss": 0.2063, + "step": 9270 + }, + { + "epoch": 0.7344820756585463, + "grad_norm": 1.6720076858463355, + "learning_rate": 3.4746949984371425e-06, + "loss": 0.2134, + "step": 9271 + }, + { + "epoch": 0.7345612992671816, + "grad_norm": 1.6553901487085008, + "learning_rate": 3.472750770767247e-06, + "loss": 0.2069, + "step": 9272 + }, + { + "epoch": 0.734640522875817, + "grad_norm": 1.7169479553977398, + "learning_rate": 3.470806972885309e-06, + "loss": 0.1681, + "step": 9273 + }, + { + "epoch": 0.7347197464844524, + "grad_norm": 1.2962242597923626, + "learning_rate": 3.468863604919316e-06, + "loss": 0.121, + "step": 9274 + }, + { + "epoch": 0.7347989700930877, + "grad_norm": 1.6369086372682258, + "learning_rate": 3.4669206669972254e-06, + "loss": 0.1773, + "step": 9275 + }, + { + "epoch": 0.7348781937017231, + "grad_norm": 1.3073963991436224, + "learning_rate": 3.4649781592469765e-06, + "loss": 0.1174, + "step": 9276 + }, + { + "epoch": 0.7349574173103585, + "grad_norm": 1.5673592417489588, + "learning_rate": 3.4630360817964715e-06, + "loss": 0.2609, + "step": 9277 + }, + { + "epoch": 0.7350366409189939, + "grad_norm": 1.8549002411668731, + "learning_rate": 3.4610944347735864e-06, + "loss": 0.2089, + "step": 9278 + }, + { + "epoch": 0.7351158645276292, + "grad_norm": 1.5531267832692022, + "learning_rate": 3.459153218306167e-06, + "loss": 0.1936, + "step": 9279 + }, + { + "epoch": 0.7351950881362646, + "grad_norm": 1.3987359798899248, + "learning_rate": 3.457212432522038e-06, + "loss": 0.1778, + "step": 9280 + }, + { + "epoch": 0.7352743117449, + "grad_norm": 1.6450946295576279, + "learning_rate": 3.455272077548989e-06, + "loss": 0.2221, + "step": 9281 + }, + { + "epoch": 0.7353535353535353, + "grad_norm": 1.6690793689600167, + "learning_rate": 3.453332153514779e-06, + "loss": 0.2068, + "step": 9282 + }, + { + "epoch": 0.7354327589621708, + "grad_norm": 1.5494664313052775, + "learning_rate": 3.4513926605471504e-06, + "loss": 0.1461, + "step": 9283 + }, + { + "epoch": 0.7355119825708061, + "grad_norm": 1.3877877055880727, + "learning_rate": 3.449453598773804e-06, + "loss": 0.1266, + "step": 9284 + }, + { + "epoch": 0.7355912061794415, + "grad_norm": 1.425633582182246, + "learning_rate": 3.4475149683224164e-06, + "loss": 0.2326, + "step": 9285 + }, + { + "epoch": 0.7356704297880768, + "grad_norm": 1.438177584247859, + "learning_rate": 3.445576769320642e-06, + "loss": 0.1388, + "step": 9286 + }, + { + "epoch": 0.7357496533967122, + "grad_norm": 1.4824285657855427, + "learning_rate": 3.4436390018960997e-06, + "loss": 0.202, + "step": 9287 + }, + { + "epoch": 0.7358288770053476, + "grad_norm": 1.430415837162107, + "learning_rate": 3.4417016661763793e-06, + "loss": 0.1713, + "step": 9288 + }, + { + "epoch": 0.7359081006139829, + "grad_norm": 1.5975277218062287, + "learning_rate": 3.439764762289051e-06, + "loss": 0.1513, + "step": 9289 + }, + { + "epoch": 0.7359873242226184, + "grad_norm": 1.3793798901900824, + "learning_rate": 3.4378282903616457e-06, + "loss": 0.1301, + "step": 9290 + }, + { + "epoch": 0.7360665478312537, + "grad_norm": 1.6763429774800287, + "learning_rate": 3.4358922505216707e-06, + "loss": 0.2338, + "step": 9291 + }, + { + "epoch": 0.7361457714398891, + "grad_norm": 1.199526460764395, + "learning_rate": 3.4339566428966086e-06, + "loss": 0.1005, + "step": 9292 + }, + { + "epoch": 0.7362249950485245, + "grad_norm": 1.4249566403907705, + "learning_rate": 3.4320214676139087e-06, + "loss": 0.1774, + "step": 9293 + }, + { + "epoch": 0.7363042186571598, + "grad_norm": 2.2759126802080303, + "learning_rate": 3.4300867248009917e-06, + "loss": 0.2253, + "step": 9294 + }, + { + "epoch": 0.7363834422657952, + "grad_norm": 1.8030010609026996, + "learning_rate": 3.4281524145852485e-06, + "loss": 0.1618, + "step": 9295 + }, + { + "epoch": 0.7364626658744305, + "grad_norm": 1.5076780114117971, + "learning_rate": 3.4262185370940504e-06, + "loss": 0.1124, + "step": 9296 + }, + { + "epoch": 0.736541889483066, + "grad_norm": 1.2693258024399388, + "learning_rate": 3.4242850924547297e-06, + "loss": 0.0947, + "step": 9297 + }, + { + "epoch": 0.7366211130917013, + "grad_norm": 1.6414827986512128, + "learning_rate": 3.422352080794593e-06, + "loss": 0.1355, + "step": 9298 + }, + { + "epoch": 0.7367003367003367, + "grad_norm": 1.777672336959917, + "learning_rate": 3.4204195022409247e-06, + "loss": 0.1664, + "step": 9299 + }, + { + "epoch": 0.7367795603089721, + "grad_norm": 2.083096831392534, + "learning_rate": 3.418487356920974e-06, + "loss": 0.2306, + "step": 9300 + }, + { + "epoch": 0.7368587839176074, + "grad_norm": 1.4858841545435992, + "learning_rate": 3.4165556449619584e-06, + "loss": 0.1507, + "step": 9301 + }, + { + "epoch": 0.7369380075262428, + "grad_norm": 2.4406334926710573, + "learning_rate": 3.4146243664910804e-06, + "loss": 0.1751, + "step": 9302 + }, + { + "epoch": 0.7370172311348782, + "grad_norm": 1.0159323870260486, + "learning_rate": 3.4126935216355005e-06, + "loss": 0.0891, + "step": 9303 + }, + { + "epoch": 0.7370964547435136, + "grad_norm": 1.5574304939224075, + "learning_rate": 3.4107631105223528e-06, + "loss": 0.199, + "step": 9304 + }, + { + "epoch": 0.7371756783521489, + "grad_norm": 1.8613206664797182, + "learning_rate": 3.4088331332787527e-06, + "loss": 0.1968, + "step": 9305 + }, + { + "epoch": 0.7372549019607844, + "grad_norm": 1.6016464423022756, + "learning_rate": 3.406903590031776e-06, + "loss": 0.2066, + "step": 9306 + }, + { + "epoch": 0.7373341255694197, + "grad_norm": 1.2872302419669481, + "learning_rate": 3.4049744809084697e-06, + "loss": 0.1503, + "step": 9307 + }, + { + "epoch": 0.737413349178055, + "grad_norm": 1.2911361225106146, + "learning_rate": 3.4030458060358682e-06, + "loss": 0.1504, + "step": 9308 + }, + { + "epoch": 0.7374925727866904, + "grad_norm": 1.8083152712854311, + "learning_rate": 3.4011175655409546e-06, + "loss": 0.195, + "step": 9309 + }, + { + "epoch": 0.7375717963953258, + "grad_norm": 1.3395017364910222, + "learning_rate": 3.399189759550694e-06, + "loss": 0.1429, + "step": 9310 + }, + { + "epoch": 0.7376510200039612, + "grad_norm": 1.4825468625516056, + "learning_rate": 3.3972623881920296e-06, + "loss": 0.1936, + "step": 9311 + }, + { + "epoch": 0.7377302436125965, + "grad_norm": 1.385687659095172, + "learning_rate": 3.3953354515918667e-06, + "loss": 0.1776, + "step": 9312 + }, + { + "epoch": 0.737809467221232, + "grad_norm": 1.3280931032098882, + "learning_rate": 3.3934089498770816e-06, + "loss": 0.1491, + "step": 9313 + }, + { + "epoch": 0.7378886908298673, + "grad_norm": 1.6269056777615707, + "learning_rate": 3.3914828831745306e-06, + "loss": 0.1568, + "step": 9314 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 1.584624723752255, + "learning_rate": 3.3895572516110353e-06, + "loss": 0.2033, + "step": 9315 + }, + { + "epoch": 0.7380471380471381, + "grad_norm": 1.6475655587896085, + "learning_rate": 3.3876320553133834e-06, + "loss": 0.154, + "step": 9316 + }, + { + "epoch": 0.7381263616557734, + "grad_norm": 1.4345396080179786, + "learning_rate": 3.385707294408347e-06, + "loss": 0.1832, + "step": 9317 + }, + { + "epoch": 0.7382055852644088, + "grad_norm": 1.5228599321570073, + "learning_rate": 3.38378296902266e-06, + "loss": 0.1833, + "step": 9318 + }, + { + "epoch": 0.7382848088730442, + "grad_norm": 1.158066749423035, + "learning_rate": 3.3818590792830285e-06, + "loss": 0.1293, + "step": 9319 + }, + { + "epoch": 0.7383640324816796, + "grad_norm": 1.8639376219796586, + "learning_rate": 3.3799356253161288e-06, + "loss": 0.2306, + "step": 9320 + }, + { + "epoch": 0.7384432560903149, + "grad_norm": 1.4582046048947686, + "learning_rate": 3.3780126072486188e-06, + "loss": 0.1537, + "step": 9321 + }, + { + "epoch": 0.7385224796989502, + "grad_norm": 1.6614339670549445, + "learning_rate": 3.376090025207115e-06, + "loss": 0.2216, + "step": 9322 + }, + { + "epoch": 0.7386017033075857, + "grad_norm": 1.522492898509982, + "learning_rate": 3.3741678793182077e-06, + "loss": 0.1495, + "step": 9323 + }, + { + "epoch": 0.738680926916221, + "grad_norm": 1.5782764997232395, + "learning_rate": 3.372246169708466e-06, + "loss": 0.1377, + "step": 9324 + }, + { + "epoch": 0.7387601505248564, + "grad_norm": 1.2941478117245584, + "learning_rate": 3.3703248965044253e-06, + "loss": 0.1381, + "step": 9325 + }, + { + "epoch": 0.7388393741334918, + "grad_norm": 1.991255140355642, + "learning_rate": 3.368404059832586e-06, + "loss": 0.2822, + "step": 9326 + }, + { + "epoch": 0.7389185977421272, + "grad_norm": 1.3842367449387394, + "learning_rate": 3.366483659819434e-06, + "loss": 0.144, + "step": 9327 + }, + { + "epoch": 0.7389978213507625, + "grad_norm": 1.5745940123767712, + "learning_rate": 3.364563696591414e-06, + "loss": 0.1691, + "step": 9328 + }, + { + "epoch": 0.7390770449593979, + "grad_norm": 1.1482600007162196, + "learning_rate": 3.3626441702749436e-06, + "loss": 0.1174, + "step": 9329 + }, + { + "epoch": 0.7391562685680333, + "grad_norm": 1.4965772429888502, + "learning_rate": 3.360725080996421e-06, + "loss": 0.1475, + "step": 9330 + }, + { + "epoch": 0.7392354921766686, + "grad_norm": 1.8151353352812318, + "learning_rate": 3.3588064288822055e-06, + "loss": 0.2591, + "step": 9331 + }, + { + "epoch": 0.739314715785304, + "grad_norm": 1.9651925188489936, + "learning_rate": 3.356888214058629e-06, + "loss": 0.1459, + "step": 9332 + }, + { + "epoch": 0.7393939393939394, + "grad_norm": 2.1504601771332825, + "learning_rate": 3.354970436652001e-06, + "loss": 0.2426, + "step": 9333 + }, + { + "epoch": 0.7394731630025748, + "grad_norm": 1.4223477195328218, + "learning_rate": 3.3530530967885964e-06, + "loss": 0.1395, + "step": 9334 + }, + { + "epoch": 0.7395523866112101, + "grad_norm": 1.1037716480956214, + "learning_rate": 3.351136194594662e-06, + "loss": 0.0865, + "step": 9335 + }, + { + "epoch": 0.7396316102198455, + "grad_norm": 1.4491905539308725, + "learning_rate": 3.3492197301964145e-06, + "loss": 0.1498, + "step": 9336 + }, + { + "epoch": 0.7397108338284809, + "grad_norm": 1.9894708986344223, + "learning_rate": 3.3473037037200484e-06, + "loss": 0.2727, + "step": 9337 + }, + { + "epoch": 0.7397900574371162, + "grad_norm": 1.6990974711784488, + "learning_rate": 3.345388115291723e-06, + "loss": 0.2105, + "step": 9338 + }, + { + "epoch": 0.7398692810457517, + "grad_norm": 1.443708267902268, + "learning_rate": 3.3434729650375675e-06, + "loss": 0.1643, + "step": 9339 + }, + { + "epoch": 0.739948504654387, + "grad_norm": 1.6362432477609234, + "learning_rate": 3.341558253083692e-06, + "loss": 0.1577, + "step": 9340 + }, + { + "epoch": 0.7400277282630224, + "grad_norm": 1.8078991881716104, + "learning_rate": 3.3396439795561662e-06, + "loss": 0.2295, + "step": 9341 + }, + { + "epoch": 0.7401069518716578, + "grad_norm": 1.7496771299142653, + "learning_rate": 3.3377301445810327e-06, + "loss": 0.1531, + "step": 9342 + }, + { + "epoch": 0.7401861754802931, + "grad_norm": 1.594778358872988, + "learning_rate": 3.3358167482843173e-06, + "loss": 0.1776, + "step": 9343 + }, + { + "epoch": 0.7402653990889285, + "grad_norm": 1.6781356099973521, + "learning_rate": 3.3339037907920024e-06, + "loss": 0.1384, + "step": 9344 + }, + { + "epoch": 0.7403446226975638, + "grad_norm": 1.5182639092873484, + "learning_rate": 3.331991272230044e-06, + "loss": 0.1541, + "step": 9345 + }, + { + "epoch": 0.7404238463061993, + "grad_norm": 1.4540555762844187, + "learning_rate": 3.330079192724379e-06, + "loss": 0.173, + "step": 9346 + }, + { + "epoch": 0.7405030699148346, + "grad_norm": 1.6712152429125322, + "learning_rate": 3.328167552400906e-06, + "loss": 0.2466, + "step": 9347 + }, + { + "epoch": 0.74058229352347, + "grad_norm": 1.6033877694117369, + "learning_rate": 3.326256351385494e-06, + "loss": 0.1412, + "step": 9348 + }, + { + "epoch": 0.7406615171321054, + "grad_norm": 1.1948111751591572, + "learning_rate": 3.324345589803991e-06, + "loss": 0.1195, + "step": 9349 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 1.2426886441315634, + "learning_rate": 3.3224352677822115e-06, + "loss": 0.1144, + "step": 9350 + }, + { + "epoch": 0.7408199643493761, + "grad_norm": 1.3915999862572395, + "learning_rate": 3.3205253854459386e-06, + "loss": 0.1528, + "step": 9351 + }, + { + "epoch": 0.7408991879580115, + "grad_norm": 2.1206972558200867, + "learning_rate": 3.3186159429209263e-06, + "loss": 0.1917, + "step": 9352 + }, + { + "epoch": 0.7409784115666469, + "grad_norm": 1.5048324376194773, + "learning_rate": 3.316706940332908e-06, + "loss": 0.1529, + "step": 9353 + }, + { + "epoch": 0.7410576351752822, + "grad_norm": 1.5914554689880618, + "learning_rate": 3.314798377807581e-06, + "loss": 0.1827, + "step": 9354 + }, + { + "epoch": 0.7411368587839177, + "grad_norm": 1.7302336924830695, + "learning_rate": 3.312890255470609e-06, + "loss": 0.2293, + "step": 9355 + }, + { + "epoch": 0.741216082392553, + "grad_norm": 1.3001394612183932, + "learning_rate": 3.3109825734476407e-06, + "loss": 0.1528, + "step": 9356 + }, + { + "epoch": 0.7412953060011883, + "grad_norm": 1.3730314342935777, + "learning_rate": 3.3090753318642855e-06, + "loss": 0.1331, + "step": 9357 + }, + { + "epoch": 0.7413745296098238, + "grad_norm": 1.4589420005796052, + "learning_rate": 3.307168530846121e-06, + "loss": 0.1331, + "step": 9358 + }, + { + "epoch": 0.7414537532184591, + "grad_norm": 1.5200790879887773, + "learning_rate": 3.3052621705187083e-06, + "loss": 0.2065, + "step": 9359 + }, + { + "epoch": 0.7415329768270945, + "grad_norm": 1.6862318268938457, + "learning_rate": 3.303356251007569e-06, + "loss": 0.1894, + "step": 9360 + }, + { + "epoch": 0.7416122004357298, + "grad_norm": 1.1620113580523268, + "learning_rate": 3.301450772438195e-06, + "loss": 0.1255, + "step": 9361 + }, + { + "epoch": 0.7416914240443652, + "grad_norm": 1.3621520942691776, + "learning_rate": 3.2995457349360595e-06, + "loss": 0.1105, + "step": 9362 + }, + { + "epoch": 0.7417706476530006, + "grad_norm": 1.577920579072157, + "learning_rate": 3.297641138626597e-06, + "loss": 0.279, + "step": 9363 + }, + { + "epoch": 0.7418498712616359, + "grad_norm": 1.6949766842956953, + "learning_rate": 3.295736983635215e-06, + "loss": 0.2005, + "step": 9364 + }, + { + "epoch": 0.7419290948702714, + "grad_norm": 1.4354012168338248, + "learning_rate": 3.293833270087291e-06, + "loss": 0.1744, + "step": 9365 + }, + { + "epoch": 0.7420083184789067, + "grad_norm": 1.3364671176844904, + "learning_rate": 3.291929998108182e-06, + "loss": 0.1422, + "step": 9366 + }, + { + "epoch": 0.7420875420875421, + "grad_norm": 1.5726889955569778, + "learning_rate": 3.2900271678232045e-06, + "loss": 0.1848, + "step": 9367 + }, + { + "epoch": 0.7421667656961775, + "grad_norm": 1.4682398322336907, + "learning_rate": 3.2881247793576488e-06, + "loss": 0.1602, + "step": 9368 + }, + { + "epoch": 0.7422459893048128, + "grad_norm": 1.7601241974422648, + "learning_rate": 3.286222832836784e-06, + "loss": 0.1995, + "step": 9369 + }, + { + "epoch": 0.7423252129134482, + "grad_norm": 1.13206513269023, + "learning_rate": 3.284321328385842e-06, + "loss": 0.1277, + "step": 9370 + }, + { + "epoch": 0.7424044365220835, + "grad_norm": 1.0826381798264555, + "learning_rate": 3.282420266130022e-06, + "loss": 0.1018, + "step": 9371 + }, + { + "epoch": 0.742483660130719, + "grad_norm": 1.46516471297859, + "learning_rate": 3.280519646194509e-06, + "loss": 0.2235, + "step": 9372 + }, + { + "epoch": 0.7425628837393543, + "grad_norm": 1.2397203554498002, + "learning_rate": 3.278619468704445e-06, + "loss": 0.1266, + "step": 9373 + }, + { + "epoch": 0.7426421073479897, + "grad_norm": 1.4545359333169543, + "learning_rate": 3.276719733784943e-06, + "loss": 0.1848, + "step": 9374 + }, + { + "epoch": 0.7427213309566251, + "grad_norm": 1.6500272433078569, + "learning_rate": 3.2748204415611016e-06, + "loss": 0.1366, + "step": 9375 + }, + { + "epoch": 0.7428005545652604, + "grad_norm": 1.6325770696106963, + "learning_rate": 3.2729215921579738e-06, + "loss": 0.2028, + "step": 9376 + }, + { + "epoch": 0.7428797781738958, + "grad_norm": 1.15602012670514, + "learning_rate": 3.271023185700587e-06, + "loss": 0.1228, + "step": 9377 + }, + { + "epoch": 0.7429590017825312, + "grad_norm": 1.843362747977543, + "learning_rate": 3.269125222313949e-06, + "loss": 0.203, + "step": 9378 + }, + { + "epoch": 0.7430382253911666, + "grad_norm": 1.307618746222223, + "learning_rate": 3.2672277021230283e-06, + "loss": 0.1149, + "step": 9379 + }, + { + "epoch": 0.7431174489998019, + "grad_norm": 1.4540246733583013, + "learning_rate": 3.2653306252527673e-06, + "loss": 0.1659, + "step": 9380 + }, + { + "epoch": 0.7431966726084374, + "grad_norm": 1.557183174892844, + "learning_rate": 3.2634339918280765e-06, + "loss": 0.2084, + "step": 9381 + }, + { + "epoch": 0.7432758962170727, + "grad_norm": 1.3097215630282952, + "learning_rate": 3.2615378019738455e-06, + "loss": 0.1541, + "step": 9382 + }, + { + "epoch": 0.743355119825708, + "grad_norm": 1.406429453010852, + "learning_rate": 3.2596420558149277e-06, + "loss": 0.2183, + "step": 9383 + }, + { + "epoch": 0.7434343434343434, + "grad_norm": 1.453859466775615, + "learning_rate": 3.257746753476144e-06, + "loss": 0.1952, + "step": 9384 + }, + { + "epoch": 0.7435135670429788, + "grad_norm": 1.262752854981948, + "learning_rate": 3.255851895082299e-06, + "loss": 0.1466, + "step": 9385 + }, + { + "epoch": 0.7435927906516142, + "grad_norm": 1.396992348994309, + "learning_rate": 3.2539574807581555e-06, + "loss": 0.16, + "step": 9386 + }, + { + "epoch": 0.7436720142602495, + "grad_norm": 2.014407761236723, + "learning_rate": 3.2520635106284516e-06, + "loss": 0.1563, + "step": 9387 + }, + { + "epoch": 0.743751237868885, + "grad_norm": 1.744449288493045, + "learning_rate": 3.250169984817897e-06, + "loss": 0.2748, + "step": 9388 + }, + { + "epoch": 0.7438304614775203, + "grad_norm": 1.725947055342265, + "learning_rate": 3.248276903451171e-06, + "loss": 0.1688, + "step": 9389 + }, + { + "epoch": 0.7439096850861556, + "grad_norm": 1.78963784822978, + "learning_rate": 3.24638426665292e-06, + "loss": 0.2706, + "step": 9390 + }, + { + "epoch": 0.7439889086947911, + "grad_norm": 2.019731931146409, + "learning_rate": 3.2444920745477727e-06, + "loss": 0.1885, + "step": 9391 + }, + { + "epoch": 0.7440681323034264, + "grad_norm": 1.512758703037509, + "learning_rate": 3.2426003272603158e-06, + "loss": 0.2144, + "step": 9392 + }, + { + "epoch": 0.7441473559120618, + "grad_norm": 1.5672812969176664, + "learning_rate": 3.2407090249151105e-06, + "loss": 0.1679, + "step": 9393 + }, + { + "epoch": 0.7442265795206972, + "grad_norm": 1.4761115109775866, + "learning_rate": 3.238818167636695e-06, + "loss": 0.1309, + "step": 9394 + }, + { + "epoch": 0.7443058031293326, + "grad_norm": 1.3793122019726884, + "learning_rate": 3.2369277555495705e-06, + "loss": 0.1615, + "step": 9395 + }, + { + "epoch": 0.7443850267379679, + "grad_norm": 1.8198411149293479, + "learning_rate": 3.235037788778208e-06, + "loss": 0.1588, + "step": 9396 + }, + { + "epoch": 0.7444642503466032, + "grad_norm": 1.5415560755805162, + "learning_rate": 3.2331482674470605e-06, + "loss": 0.2437, + "step": 9397 + }, + { + "epoch": 0.7445434739552387, + "grad_norm": 1.4677539744941233, + "learning_rate": 3.2312591916805382e-06, + "loss": 0.1498, + "step": 9398 + }, + { + "epoch": 0.744622697563874, + "grad_norm": 1.353828997542222, + "learning_rate": 3.2293705616030267e-06, + "loss": 0.1107, + "step": 9399 + }, + { + "epoch": 0.7447019211725094, + "grad_norm": 1.1243917441623805, + "learning_rate": 3.2274823773388885e-06, + "loss": 0.082, + "step": 9400 + }, + { + "epoch": 0.7447811447811448, + "grad_norm": 1.2415656257365184, + "learning_rate": 3.2255946390124482e-06, + "loss": 0.1102, + "step": 9401 + }, + { + "epoch": 0.7448603683897802, + "grad_norm": 1.5009386574586785, + "learning_rate": 3.223707346748002e-06, + "loss": 0.1442, + "step": 9402 + }, + { + "epoch": 0.7449395919984155, + "grad_norm": 1.2630398148054123, + "learning_rate": 3.221820500669823e-06, + "loss": 0.1026, + "step": 9403 + }, + { + "epoch": 0.7450188156070509, + "grad_norm": 1.4765042416967769, + "learning_rate": 3.2199341009021514e-06, + "loss": 0.196, + "step": 9404 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 1.9950592831851934, + "learning_rate": 3.218048147569195e-06, + "loss": 0.2206, + "step": 9405 + }, + { + "epoch": 0.7451772628243216, + "grad_norm": 3.0799890651613917, + "learning_rate": 3.216162640795133e-06, + "loss": 0.2244, + "step": 9406 + }, + { + "epoch": 0.745256486432957, + "grad_norm": 1.782768468281133, + "learning_rate": 3.2142775807041214e-06, + "loss": 0.1787, + "step": 9407 + }, + { + "epoch": 0.7453357100415924, + "grad_norm": 1.7789121044026834, + "learning_rate": 3.2123929674202816e-06, + "loss": 0.214, + "step": 9408 + }, + { + "epoch": 0.7454149336502278, + "grad_norm": 1.6260684541933699, + "learning_rate": 3.2105088010677e-06, + "loss": 0.1881, + "step": 9409 + }, + { + "epoch": 0.7454941572588631, + "grad_norm": 1.4632101493607146, + "learning_rate": 3.2086250817704488e-06, + "loss": 0.1556, + "step": 9410 + }, + { + "epoch": 0.7455733808674985, + "grad_norm": 1.5008100394690242, + "learning_rate": 3.2067418096525593e-06, + "loss": 0.163, + "step": 9411 + }, + { + "epoch": 0.7456526044761339, + "grad_norm": 1.9513070603270986, + "learning_rate": 3.2048589848380297e-06, + "loss": 0.1347, + "step": 9412 + }, + { + "epoch": 0.7457318280847692, + "grad_norm": 1.3226089248980673, + "learning_rate": 3.202976607450844e-06, + "loss": 0.089, + "step": 9413 + }, + { + "epoch": 0.7458110516934047, + "grad_norm": 1.569659335239659, + "learning_rate": 3.201094677614943e-06, + "loss": 0.1689, + "step": 9414 + }, + { + "epoch": 0.74589027530204, + "grad_norm": 1.792883878128053, + "learning_rate": 3.1992131954542404e-06, + "loss": 0.2112, + "step": 9415 + }, + { + "epoch": 0.7459694989106754, + "grad_norm": 1.4410446105303847, + "learning_rate": 3.1973321610926277e-06, + "loss": 0.1596, + "step": 9416 + }, + { + "epoch": 0.7460487225193108, + "grad_norm": 1.6162279520850313, + "learning_rate": 3.1954515746539616e-06, + "loss": 0.1967, + "step": 9417 + }, + { + "epoch": 0.7461279461279461, + "grad_norm": 1.4949360680688506, + "learning_rate": 3.193571436262064e-06, + "loss": 0.1659, + "step": 9418 + }, + { + "epoch": 0.7462071697365815, + "grad_norm": 1.6659961745475955, + "learning_rate": 3.191691746040739e-06, + "loss": 0.1283, + "step": 9419 + }, + { + "epoch": 0.7462863933452168, + "grad_norm": 1.3619291046913964, + "learning_rate": 3.189812504113754e-06, + "loss": 0.1177, + "step": 9420 + }, + { + "epoch": 0.7463656169538523, + "grad_norm": 1.7671101665733155, + "learning_rate": 3.187933710604847e-06, + "loss": 0.1902, + "step": 9421 + }, + { + "epoch": 0.7464448405624876, + "grad_norm": 1.2793405139697156, + "learning_rate": 3.186055365637725e-06, + "loss": 0.1365, + "step": 9422 + }, + { + "epoch": 0.746524064171123, + "grad_norm": 2.045577367070868, + "learning_rate": 3.184177469336073e-06, + "loss": 0.2738, + "step": 9423 + }, + { + "epoch": 0.7466032877797584, + "grad_norm": 1.7388327769947831, + "learning_rate": 3.1823000218235388e-06, + "loss": 0.1509, + "step": 9424 + }, + { + "epoch": 0.7466825113883937, + "grad_norm": 1.5926056012829106, + "learning_rate": 3.180423023223741e-06, + "loss": 0.1725, + "step": 9425 + }, + { + "epoch": 0.7467617349970291, + "grad_norm": 2.3044459028553423, + "learning_rate": 3.1785464736602754e-06, + "loss": 0.2752, + "step": 9426 + }, + { + "epoch": 0.7468409586056645, + "grad_norm": 1.3631241901676512, + "learning_rate": 3.1766703732567027e-06, + "loss": 0.1517, + "step": 9427 + }, + { + "epoch": 0.7469201822142999, + "grad_norm": 1.8131295120335895, + "learning_rate": 3.1747947221365517e-06, + "loss": 0.1823, + "step": 9428 + }, + { + "epoch": 0.7469994058229352, + "grad_norm": 1.4104341139662362, + "learning_rate": 3.17291952042333e-06, + "loss": 0.129, + "step": 9429 + }, + { + "epoch": 0.7470786294315707, + "grad_norm": 1.5564989955430768, + "learning_rate": 3.171044768240508e-06, + "loss": 0.1936, + "step": 9430 + }, + { + "epoch": 0.747157853040206, + "grad_norm": 1.9275451811158177, + "learning_rate": 3.169170465711525e-06, + "loss": 0.2043, + "step": 9431 + }, + { + "epoch": 0.7472370766488413, + "grad_norm": 1.6472144822267165, + "learning_rate": 3.167296612959803e-06, + "loss": 0.1162, + "step": 9432 + }, + { + "epoch": 0.7473163002574768, + "grad_norm": 1.738899501926095, + "learning_rate": 3.1654232101087225e-06, + "loss": 0.2611, + "step": 9433 + }, + { + "epoch": 0.7473955238661121, + "grad_norm": 1.4511805916709952, + "learning_rate": 3.1635502572816333e-06, + "loss": 0.155, + "step": 9434 + }, + { + "epoch": 0.7474747474747475, + "grad_norm": 1.2958664386911982, + "learning_rate": 3.1616777546018696e-06, + "loss": 0.172, + "step": 9435 + }, + { + "epoch": 0.7475539710833828, + "grad_norm": 1.5714686865174037, + "learning_rate": 3.1598057021927207e-06, + "loss": 0.1415, + "step": 9436 + }, + { + "epoch": 0.7476331946920183, + "grad_norm": 1.5972021640799656, + "learning_rate": 3.1579341001774546e-06, + "loss": 0.1656, + "step": 9437 + }, + { + "epoch": 0.7477124183006536, + "grad_norm": 1.2608370954512118, + "learning_rate": 3.1560629486793014e-06, + "loss": 0.1317, + "step": 9438 + }, + { + "epoch": 0.7477916419092889, + "grad_norm": 1.8143260551602671, + "learning_rate": 3.154192247821476e-06, + "loss": 0.1862, + "step": 9439 + }, + { + "epoch": 0.7478708655179244, + "grad_norm": 1.447571095309219, + "learning_rate": 3.1523219977271515e-06, + "loss": 0.1465, + "step": 9440 + }, + { + "epoch": 0.7479500891265597, + "grad_norm": 1.6452166264730648, + "learning_rate": 3.1504521985194715e-06, + "loss": 0.2042, + "step": 9441 + }, + { + "epoch": 0.7480293127351951, + "grad_norm": 1.8594082552500155, + "learning_rate": 3.1485828503215588e-06, + "loss": 0.1828, + "step": 9442 + }, + { + "epoch": 0.7481085363438305, + "grad_norm": 1.4077381822167334, + "learning_rate": 3.1467139532564985e-06, + "loss": 0.1069, + "step": 9443 + }, + { + "epoch": 0.7481877599524658, + "grad_norm": 1.5310639014817258, + "learning_rate": 3.144845507447345e-06, + "loss": 0.145, + "step": 9444 + }, + { + "epoch": 0.7482669835611012, + "grad_norm": 1.6000990576281384, + "learning_rate": 3.1429775130171337e-06, + "loss": 0.1691, + "step": 9445 + }, + { + "epoch": 0.7483462071697365, + "grad_norm": 1.6749874607193753, + "learning_rate": 3.141109970088859e-06, + "loss": 0.1878, + "step": 9446 + }, + { + "epoch": 0.748425430778372, + "grad_norm": 1.4345275074123902, + "learning_rate": 3.1392428787854865e-06, + "loss": 0.1059, + "step": 9447 + }, + { + "epoch": 0.7485046543870073, + "grad_norm": 1.7564017617656271, + "learning_rate": 3.1373762392299632e-06, + "loss": 0.1985, + "step": 9448 + }, + { + "epoch": 0.7485838779956427, + "grad_norm": 1.639461351728904, + "learning_rate": 3.135510051545192e-06, + "loss": 0.1623, + "step": 9449 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 1.3363716301873942, + "learning_rate": 3.133644315854055e-06, + "loss": 0.1843, + "step": 9450 + }, + { + "epoch": 0.7487423252129134, + "grad_norm": 1.2541017702994535, + "learning_rate": 3.131779032279397e-06, + "loss": 0.1526, + "step": 9451 + }, + { + "epoch": 0.7488215488215488, + "grad_norm": 1.3861024149164576, + "learning_rate": 3.1299142009440463e-06, + "loss": 0.1614, + "step": 9452 + }, + { + "epoch": 0.7489007724301842, + "grad_norm": 1.6010788956064284, + "learning_rate": 3.1280498219707876e-06, + "loss": 0.155, + "step": 9453 + }, + { + "epoch": 0.7489799960388196, + "grad_norm": 1.7283847995739627, + "learning_rate": 3.1261858954823798e-06, + "loss": 0.1936, + "step": 9454 + }, + { + "epoch": 0.7490592196474549, + "grad_norm": 1.6908437941785552, + "learning_rate": 3.12432242160156e-06, + "loss": 0.1828, + "step": 9455 + }, + { + "epoch": 0.7491384432560904, + "grad_norm": 1.31474546484202, + "learning_rate": 3.1224594004510246e-06, + "loss": 0.1578, + "step": 9456 + }, + { + "epoch": 0.7492176668647257, + "grad_norm": 1.4845040695694474, + "learning_rate": 3.1205968321534406e-06, + "loss": 0.1868, + "step": 9457 + }, + { + "epoch": 0.749296890473361, + "grad_norm": 1.6149009504559704, + "learning_rate": 3.1187347168314586e-06, + "loss": 0.2453, + "step": 9458 + }, + { + "epoch": 0.7493761140819964, + "grad_norm": 1.3771976956138505, + "learning_rate": 3.1168730546076844e-06, + "loss": 0.1281, + "step": 9459 + }, + { + "epoch": 0.7494553376906318, + "grad_norm": 1.9976614011919243, + "learning_rate": 3.1150118456046963e-06, + "loss": 0.2294, + "step": 9460 + }, + { + "epoch": 0.7495345612992672, + "grad_norm": 1.3303440164698377, + "learning_rate": 3.1131510899450533e-06, + "loss": 0.1462, + "step": 9461 + }, + { + "epoch": 0.7496137849079025, + "grad_norm": 1.7248594974234859, + "learning_rate": 3.1112907877512732e-06, + "loss": 0.1885, + "step": 9462 + }, + { + "epoch": 0.749693008516538, + "grad_norm": 1.8288897650668423, + "learning_rate": 3.1094309391458455e-06, + "loss": 0.298, + "step": 9463 + }, + { + "epoch": 0.7497722321251733, + "grad_norm": 1.8095255636537548, + "learning_rate": 3.107571544251241e-06, + "loss": 0.2326, + "step": 9464 + }, + { + "epoch": 0.7498514557338086, + "grad_norm": 1.779288428091656, + "learning_rate": 3.1057126031898843e-06, + "loss": 0.1839, + "step": 9465 + }, + { + "epoch": 0.7499306793424441, + "grad_norm": 1.4074582777607487, + "learning_rate": 3.1038541160841752e-06, + "loss": 0.1174, + "step": 9466 + }, + { + "epoch": 0.7500099029510794, + "grad_norm": 1.5025661199259404, + "learning_rate": 3.1019960830564945e-06, + "loss": 0.1629, + "step": 9467 + }, + { + "epoch": 0.7500891265597148, + "grad_norm": 1.7274635264554326, + "learning_rate": 3.1001385042291797e-06, + "loss": 0.2614, + "step": 9468 + }, + { + "epoch": 0.7501683501683502, + "grad_norm": 1.9423456864226756, + "learning_rate": 3.0982813797245413e-06, + "loss": 0.2614, + "step": 9469 + }, + { + "epoch": 0.7502475737769856, + "grad_norm": 1.5473805181905458, + "learning_rate": 3.096424709664868e-06, + "loss": 0.1711, + "step": 9470 + }, + { + "epoch": 0.7503267973856209, + "grad_norm": 1.2942218161109502, + "learning_rate": 3.094568494172411e-06, + "loss": 0.1358, + "step": 9471 + }, + { + "epoch": 0.7504060209942562, + "grad_norm": 1.4726105053901786, + "learning_rate": 3.0927127333693872e-06, + "loss": 0.1426, + "step": 9472 + }, + { + "epoch": 0.7504852446028917, + "grad_norm": 1.9081955835206885, + "learning_rate": 3.090857427377998e-06, + "loss": 0.2108, + "step": 9473 + }, + { + "epoch": 0.750564468211527, + "grad_norm": 2.3596802848866076, + "learning_rate": 3.0890025763204025e-06, + "loss": 0.2615, + "step": 9474 + }, + { + "epoch": 0.7506436918201624, + "grad_norm": 1.4060438382280747, + "learning_rate": 3.087148180318734e-06, + "loss": 0.1506, + "step": 9475 + }, + { + "epoch": 0.7507229154287978, + "grad_norm": 1.8389606171204427, + "learning_rate": 3.0852942394950915e-06, + "loss": 0.2007, + "step": 9476 + }, + { + "epoch": 0.7508021390374332, + "grad_norm": 1.7784206539099288, + "learning_rate": 3.083440753971556e-06, + "loss": 0.1598, + "step": 9477 + }, + { + "epoch": 0.7508813626460685, + "grad_norm": 1.3504421080075035, + "learning_rate": 3.0815877238701653e-06, + "loss": 0.1224, + "step": 9478 + }, + { + "epoch": 0.7509605862547039, + "grad_norm": 1.5474801975431938, + "learning_rate": 3.079735149312931e-06, + "loss": 0.2434, + "step": 9479 + }, + { + "epoch": 0.7510398098633393, + "grad_norm": 1.515364306173949, + "learning_rate": 3.077883030421843e-06, + "loss": 0.1341, + "step": 9480 + }, + { + "epoch": 0.7511190334719746, + "grad_norm": 1.299646714737203, + "learning_rate": 3.0760313673188493e-06, + "loss": 0.119, + "step": 9481 + }, + { + "epoch": 0.75119825708061, + "grad_norm": 1.466181747769846, + "learning_rate": 3.0741801601258714e-06, + "loss": 0.1838, + "step": 9482 + }, + { + "epoch": 0.7512774806892454, + "grad_norm": 1.6578385617417195, + "learning_rate": 3.072329408964808e-06, + "loss": 0.1662, + "step": 9483 + }, + { + "epoch": 0.7513567042978808, + "grad_norm": 1.1262558780260976, + "learning_rate": 3.0704791139575195e-06, + "loss": 0.1023, + "step": 9484 + }, + { + "epoch": 0.7514359279065161, + "grad_norm": 2.00740069927691, + "learning_rate": 3.0686292752258352e-06, + "loss": 0.2138, + "step": 9485 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 1.4809050641015151, + "learning_rate": 3.066779892891564e-06, + "loss": 0.1597, + "step": 9486 + }, + { + "epoch": 0.7515943751237869, + "grad_norm": 1.375738002186897, + "learning_rate": 3.064930967076477e-06, + "loss": 0.139, + "step": 9487 + }, + { + "epoch": 0.7516735987324222, + "grad_norm": 1.579032260047389, + "learning_rate": 3.063082497902313e-06, + "loss": 0.1606, + "step": 9488 + }, + { + "epoch": 0.7517528223410577, + "grad_norm": 1.691196858421653, + "learning_rate": 3.0612344854907917e-06, + "loss": 0.1787, + "step": 9489 + }, + { + "epoch": 0.751832045949693, + "grad_norm": 1.3844071548100543, + "learning_rate": 3.0593869299635925e-06, + "loss": 0.1259, + "step": 9490 + }, + { + "epoch": 0.7519112695583284, + "grad_norm": 1.6548482180821298, + "learning_rate": 3.0575398314423677e-06, + "loss": 0.1792, + "step": 9491 + }, + { + "epoch": 0.7519904931669638, + "grad_norm": 1.150447114616083, + "learning_rate": 3.0556931900487365e-06, + "loss": 0.0924, + "step": 9492 + }, + { + "epoch": 0.7520697167755991, + "grad_norm": 1.378737897018675, + "learning_rate": 3.053847005904298e-06, + "loss": 0.1825, + "step": 9493 + }, + { + "epoch": 0.7521489403842345, + "grad_norm": 1.676059084084236, + "learning_rate": 3.052001279130612e-06, + "loss": 0.1834, + "step": 9494 + }, + { + "epoch": 0.7522281639928698, + "grad_norm": 1.6508074892109708, + "learning_rate": 3.0501560098492056e-06, + "loss": 0.18, + "step": 9495 + }, + { + "epoch": 0.7523073876015053, + "grad_norm": 2.7021218603265207, + "learning_rate": 3.0483111981815906e-06, + "loss": 0.2028, + "step": 9496 + }, + { + "epoch": 0.7523866112101406, + "grad_norm": 1.2323339967492575, + "learning_rate": 3.046466844249232e-06, + "loss": 0.1736, + "step": 9497 + }, + { + "epoch": 0.752465834818776, + "grad_norm": 1.8515017942315066, + "learning_rate": 3.0446229481735713e-06, + "loss": 0.2052, + "step": 9498 + }, + { + "epoch": 0.7525450584274114, + "grad_norm": 1.5596046746561878, + "learning_rate": 3.042779510076025e-06, + "loss": 0.1753, + "step": 9499 + }, + { + "epoch": 0.7526242820360467, + "grad_norm": 1.5645444721132473, + "learning_rate": 3.0409365300779725e-06, + "loss": 0.1821, + "step": 9500 + }, + { + "epoch": 0.7527035056446821, + "grad_norm": 1.4026216267594054, + "learning_rate": 3.039094008300761e-06, + "loss": 0.1543, + "step": 9501 + }, + { + "epoch": 0.7527827292533175, + "grad_norm": 1.7575829887681125, + "learning_rate": 3.0372519448657188e-06, + "loss": 0.2061, + "step": 9502 + }, + { + "epoch": 0.7528619528619529, + "grad_norm": 1.3372458080587726, + "learning_rate": 3.0354103398941327e-06, + "loss": 0.1356, + "step": 9503 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 1.8576834048331887, + "learning_rate": 3.0335691935072618e-06, + "loss": 0.198, + "step": 9504 + }, + { + "epoch": 0.7530204000792237, + "grad_norm": 1.5522534610656729, + "learning_rate": 3.0317285058263426e-06, + "loss": 0.1891, + "step": 9505 + }, + { + "epoch": 0.753099623687859, + "grad_norm": 1.4958394215974595, + "learning_rate": 3.029888276972571e-06, + "loss": 0.1674, + "step": 9506 + }, + { + "epoch": 0.7531788472964943, + "grad_norm": 1.7096614413036455, + "learning_rate": 3.0280485070671197e-06, + "loss": 0.2305, + "step": 9507 + }, + { + "epoch": 0.7532580709051298, + "grad_norm": 1.424818762948807, + "learning_rate": 3.0262091962311234e-06, + "loss": 0.0966, + "step": 9508 + }, + { + "epoch": 0.7533372945137651, + "grad_norm": 1.5254639625246011, + "learning_rate": 3.0243703445856985e-06, + "loss": 0.1632, + "step": 9509 + }, + { + "epoch": 0.7534165181224005, + "grad_norm": 1.1802771609778897, + "learning_rate": 3.0225319522519226e-06, + "loss": 0.1021, + "step": 9510 + }, + { + "epoch": 0.7534957417310358, + "grad_norm": 2.073340745982076, + "learning_rate": 3.0206940193508404e-06, + "loss": 0.188, + "step": 9511 + }, + { + "epoch": 0.7535749653396713, + "grad_norm": 1.5967959405805734, + "learning_rate": 3.018856546003479e-06, + "loss": 0.1517, + "step": 9512 + }, + { + "epoch": 0.7536541889483066, + "grad_norm": 1.6089020478761187, + "learning_rate": 3.0170195323308216e-06, + "loss": 0.1674, + "step": 9513 + }, + { + "epoch": 0.7537334125569419, + "grad_norm": 1.5956859380251154, + "learning_rate": 3.0151829784538257e-06, + "loss": 0.2139, + "step": 9514 + }, + { + "epoch": 0.7538126361655774, + "grad_norm": 1.5600369580201474, + "learning_rate": 3.0133468844934245e-06, + "loss": 0.1983, + "step": 9515 + }, + { + "epoch": 0.7538918597742127, + "grad_norm": 1.8680555971439101, + "learning_rate": 3.0115112505705134e-06, + "loss": 0.1684, + "step": 9516 + }, + { + "epoch": 0.7539710833828481, + "grad_norm": 1.432040150509459, + "learning_rate": 3.0096760768059576e-06, + "loss": 0.132, + "step": 9517 + }, + { + "epoch": 0.7540503069914835, + "grad_norm": 1.1888462611985904, + "learning_rate": 3.0078413633205995e-06, + "loss": 0.1427, + "step": 9518 + }, + { + "epoch": 0.7541295306001188, + "grad_norm": 1.3462789053793733, + "learning_rate": 3.0060071102352438e-06, + "loss": 0.1891, + "step": 9519 + }, + { + "epoch": 0.7542087542087542, + "grad_norm": 1.741238356940325, + "learning_rate": 3.0041733176706668e-06, + "loss": 0.211, + "step": 9520 + }, + { + "epoch": 0.7542879778173895, + "grad_norm": 1.3700825011663031, + "learning_rate": 3.002339985747611e-06, + "loss": 0.1374, + "step": 9521 + }, + { + "epoch": 0.754367201426025, + "grad_norm": 1.432665966668152, + "learning_rate": 3.0005071145868004e-06, + "loss": 0.159, + "step": 9522 + }, + { + "epoch": 0.7544464250346603, + "grad_norm": 1.2967558619901436, + "learning_rate": 2.998674704308917e-06, + "loss": 0.152, + "step": 9523 + }, + { + "epoch": 0.7545256486432957, + "grad_norm": 1.1205437366595565, + "learning_rate": 2.9968427550346136e-06, + "loss": 0.1215, + "step": 9524 + }, + { + "epoch": 0.7546048722519311, + "grad_norm": 1.2090976664941415, + "learning_rate": 2.9950112668845198e-06, + "loss": 0.1311, + "step": 9525 + }, + { + "epoch": 0.7546840958605664, + "grad_norm": 1.5006182028397699, + "learning_rate": 2.9931802399792285e-06, + "loss": 0.1715, + "step": 9526 + }, + { + "epoch": 0.7547633194692018, + "grad_norm": 1.3992895327314552, + "learning_rate": 2.9913496744393e-06, + "loss": 0.1628, + "step": 9527 + }, + { + "epoch": 0.7548425430778372, + "grad_norm": 1.6366432547576821, + "learning_rate": 2.9895195703852763e-06, + "loss": 0.1797, + "step": 9528 + }, + { + "epoch": 0.7549217666864726, + "grad_norm": 2.1379468857383874, + "learning_rate": 2.987689927937656e-06, + "loss": 0.184, + "step": 9529 + }, + { + "epoch": 0.7550009902951079, + "grad_norm": 1.5281675528367953, + "learning_rate": 2.98586074721691e-06, + "loss": 0.1569, + "step": 9530 + }, + { + "epoch": 0.7550802139037434, + "grad_norm": 1.6989503178686451, + "learning_rate": 2.9840320283434865e-06, + "loss": 0.1899, + "step": 9531 + }, + { + "epoch": 0.7551594375123787, + "grad_norm": 1.6776677936587132, + "learning_rate": 2.982203771437796e-06, + "loss": 0.2745, + "step": 9532 + }, + { + "epoch": 0.755238661121014, + "grad_norm": 1.5011554946039176, + "learning_rate": 2.9803759766202157e-06, + "loss": 0.2085, + "step": 9533 + }, + { + "epoch": 0.7553178847296494, + "grad_norm": 1.3430163789805196, + "learning_rate": 2.9785486440111044e-06, + "loss": 0.1675, + "step": 9534 + }, + { + "epoch": 0.7553971083382848, + "grad_norm": 1.0929451417473897, + "learning_rate": 2.9767217737307805e-06, + "loss": 0.1189, + "step": 9535 + }, + { + "epoch": 0.7554763319469202, + "grad_norm": 1.4401374108741671, + "learning_rate": 2.974895365899534e-06, + "loss": 0.1539, + "step": 9536 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 1.1513708039036818, + "learning_rate": 2.973069420637621e-06, + "loss": 0.1487, + "step": 9537 + }, + { + "epoch": 0.755634779164191, + "grad_norm": 1.5639579634328595, + "learning_rate": 2.971243938065279e-06, + "loss": 0.1358, + "step": 9538 + }, + { + "epoch": 0.7557140027728263, + "grad_norm": 1.382123959219964, + "learning_rate": 2.9694189183027034e-06, + "loss": 0.1664, + "step": 9539 + }, + { + "epoch": 0.7557932263814616, + "grad_norm": 1.406794563778815, + "learning_rate": 2.9675943614700588e-06, + "loss": 0.1452, + "step": 9540 + }, + { + "epoch": 0.7558724499900971, + "grad_norm": 1.514924470356589, + "learning_rate": 2.965770267687492e-06, + "loss": 0.1462, + "step": 9541 + }, + { + "epoch": 0.7559516735987324, + "grad_norm": 1.8229121116583087, + "learning_rate": 2.963946637075107e-06, + "loss": 0.2201, + "step": 9542 + }, + { + "epoch": 0.7560308972073678, + "grad_norm": 1.738524902932573, + "learning_rate": 2.9621234697529787e-06, + "loss": 0.174, + "step": 9543 + }, + { + "epoch": 0.7561101208160032, + "grad_norm": 1.317188089571558, + "learning_rate": 2.9603007658411575e-06, + "loss": 0.1282, + "step": 9544 + }, + { + "epoch": 0.7561893444246386, + "grad_norm": 1.6382582803074983, + "learning_rate": 2.958478525459657e-06, + "loss": 0.1706, + "step": 9545 + }, + { + "epoch": 0.7562685680332739, + "grad_norm": 1.6255078569023884, + "learning_rate": 2.9566567487284613e-06, + "loss": 0.2551, + "step": 9546 + }, + { + "epoch": 0.7563477916419092, + "grad_norm": 1.5063095558374369, + "learning_rate": 2.9548354357675325e-06, + "loss": 0.1402, + "step": 9547 + }, + { + "epoch": 0.7564270152505447, + "grad_norm": 1.599615198473144, + "learning_rate": 2.9530145866967897e-06, + "loss": 0.0952, + "step": 9548 + }, + { + "epoch": 0.75650623885918, + "grad_norm": 1.4484769409306653, + "learning_rate": 2.951194201636125e-06, + "loss": 0.1212, + "step": 9549 + }, + { + "epoch": 0.7565854624678154, + "grad_norm": 1.4000576094962034, + "learning_rate": 2.9493742807054094e-06, + "loss": 0.1396, + "step": 9550 + }, + { + "epoch": 0.7566646860764508, + "grad_norm": 2.0835471251443067, + "learning_rate": 2.947554824024472e-06, + "loss": 0.2206, + "step": 9551 + }, + { + "epoch": 0.7567439096850862, + "grad_norm": 1.4233756794852277, + "learning_rate": 2.9457358317131125e-06, + "loss": 0.157, + "step": 9552 + }, + { + "epoch": 0.7568231332937215, + "grad_norm": 1.4410107839892405, + "learning_rate": 2.943917303891107e-06, + "loss": 0.1582, + "step": 9553 + }, + { + "epoch": 0.7569023569023569, + "grad_norm": 1.4712287477447172, + "learning_rate": 2.942099240678197e-06, + "loss": 0.1995, + "step": 9554 + }, + { + "epoch": 0.7569815805109923, + "grad_norm": 1.2676143088681964, + "learning_rate": 2.940281642194087e-06, + "loss": 0.1041, + "step": 9555 + }, + { + "epoch": 0.7570608041196276, + "grad_norm": 1.2500532005652834, + "learning_rate": 2.938464508558466e-06, + "loss": 0.1353, + "step": 9556 + }, + { + "epoch": 0.757140027728263, + "grad_norm": 1.6491881488554911, + "learning_rate": 2.936647839890979e-06, + "loss": 0.2026, + "step": 9557 + }, + { + "epoch": 0.7572192513368984, + "grad_norm": 1.4519368879855936, + "learning_rate": 2.9348316363112417e-06, + "loss": 0.1163, + "step": 9558 + }, + { + "epoch": 0.7572984749455338, + "grad_norm": 1.4684016565934788, + "learning_rate": 2.933015897938849e-06, + "loss": 0.172, + "step": 9559 + }, + { + "epoch": 0.7573776985541691, + "grad_norm": 1.8279741950236903, + "learning_rate": 2.9312006248933543e-06, + "loss": 0.1942, + "step": 9560 + }, + { + "epoch": 0.7574569221628045, + "grad_norm": 1.2593997777981085, + "learning_rate": 2.9293858172942867e-06, + "loss": 0.0952, + "step": 9561 + }, + { + "epoch": 0.7575361457714399, + "grad_norm": 1.685169523373592, + "learning_rate": 2.9275714752611383e-06, + "loss": 0.1777, + "step": 9562 + }, + { + "epoch": 0.7576153693800752, + "grad_norm": 1.6293331300171128, + "learning_rate": 2.9257575989133803e-06, + "loss": 0.188, + "step": 9563 + }, + { + "epoch": 0.7576945929887107, + "grad_norm": 1.7277992381634562, + "learning_rate": 2.9239441883704455e-06, + "loss": 0.1797, + "step": 9564 + }, + { + "epoch": 0.757773816597346, + "grad_norm": 2.0718207155312234, + "learning_rate": 2.9221312437517357e-06, + "loss": 0.2353, + "step": 9565 + }, + { + "epoch": 0.7578530402059814, + "grad_norm": 1.8272780663551813, + "learning_rate": 2.9203187651766297e-06, + "loss": 0.1789, + "step": 9566 + }, + { + "epoch": 0.7579322638146168, + "grad_norm": 1.3008179692988004, + "learning_rate": 2.918506752764467e-06, + "loss": 0.1225, + "step": 9567 + }, + { + "epoch": 0.7580114874232521, + "grad_norm": 1.5091268751971438, + "learning_rate": 2.916695206634558e-06, + "loss": 0.1632, + "step": 9568 + }, + { + "epoch": 0.7580907110318875, + "grad_norm": 1.3912688793467687, + "learning_rate": 2.91488412690619e-06, + "loss": 0.1382, + "step": 9569 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 1.5009418911861847, + "learning_rate": 2.913073513698611e-06, + "loss": 0.1864, + "step": 9570 + }, + { + "epoch": 0.7582491582491583, + "grad_norm": 1.9032669807975269, + "learning_rate": 2.9112633671310387e-06, + "loss": 0.1325, + "step": 9571 + }, + { + "epoch": 0.7583283818577936, + "grad_norm": 1.305921119378279, + "learning_rate": 2.9094536873226663e-06, + "loss": 0.1072, + "step": 9572 + }, + { + "epoch": 0.758407605466429, + "grad_norm": 1.7762254367444177, + "learning_rate": 2.9076444743926524e-06, + "loss": 0.2143, + "step": 9573 + }, + { + "epoch": 0.7584868290750644, + "grad_norm": 1.7018412652287815, + "learning_rate": 2.9058357284601204e-06, + "loss": 0.1191, + "step": 9574 + }, + { + "epoch": 0.7585660526836997, + "grad_norm": 1.503477976295376, + "learning_rate": 2.9040274496441732e-06, + "loss": 0.2234, + "step": 9575 + }, + { + "epoch": 0.7586452762923351, + "grad_norm": 1.7476893090817263, + "learning_rate": 2.902219638063876e-06, + "loss": 0.2661, + "step": 9576 + }, + { + "epoch": 0.7587244999009705, + "grad_norm": 1.29339856905736, + "learning_rate": 2.9004122938382617e-06, + "loss": 0.1679, + "step": 9577 + }, + { + "epoch": 0.7588037235096059, + "grad_norm": 1.4837709946084463, + "learning_rate": 2.8986054170863344e-06, + "loss": 0.1563, + "step": 9578 + }, + { + "epoch": 0.7588829471182412, + "grad_norm": 1.5938066144230532, + "learning_rate": 2.8967990079270736e-06, + "loss": 0.1486, + "step": 9579 + }, + { + "epoch": 0.7589621707268767, + "grad_norm": 2.4694283380933273, + "learning_rate": 2.89499306647942e-06, + "loss": 0.2052, + "step": 9580 + }, + { + "epoch": 0.759041394335512, + "grad_norm": 1.437866552677991, + "learning_rate": 2.8931875928622833e-06, + "loss": 0.1401, + "step": 9581 + }, + { + "epoch": 0.7591206179441473, + "grad_norm": 1.4191523455019126, + "learning_rate": 2.89138258719455e-06, + "loss": 0.177, + "step": 9582 + }, + { + "epoch": 0.7591998415527828, + "grad_norm": 2.07873487029662, + "learning_rate": 2.8895780495950687e-06, + "loss": 0.1973, + "step": 9583 + }, + { + "epoch": 0.7592790651614181, + "grad_norm": 1.6452833501135546, + "learning_rate": 2.8877739801826577e-06, + "loss": 0.1552, + "step": 9584 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 1.369192058040035, + "learning_rate": 2.8859703790761095e-06, + "loss": 0.152, + "step": 9585 + }, + { + "epoch": 0.7594375123786888, + "grad_norm": 1.9321763671502792, + "learning_rate": 2.8841672463941827e-06, + "loss": 0.1726, + "step": 9586 + }, + { + "epoch": 0.7595167359873243, + "grad_norm": 1.921832610963072, + "learning_rate": 2.8823645822556e-06, + "loss": 0.1714, + "step": 9587 + }, + { + "epoch": 0.7595959595959596, + "grad_norm": 1.8154885116918609, + "learning_rate": 2.8805623867790655e-06, + "loss": 0.2201, + "step": 9588 + }, + { + "epoch": 0.7596751832045949, + "grad_norm": 1.9145352741394648, + "learning_rate": 2.8787606600832408e-06, + "loss": 0.2156, + "step": 9589 + }, + { + "epoch": 0.7597544068132304, + "grad_norm": 1.271771240554462, + "learning_rate": 2.876959402286759e-06, + "loss": 0.138, + "step": 9590 + }, + { + "epoch": 0.7598336304218657, + "grad_norm": 1.8684982347885792, + "learning_rate": 2.8751586135082275e-06, + "loss": 0.2539, + "step": 9591 + }, + { + "epoch": 0.7599128540305011, + "grad_norm": 1.3755525710439678, + "learning_rate": 2.873358293866221e-06, + "loss": 0.13, + "step": 9592 + }, + { + "epoch": 0.7599920776391365, + "grad_norm": 1.640054636391881, + "learning_rate": 2.8715584434792786e-06, + "loss": 0.1681, + "step": 9593 + }, + { + "epoch": 0.7600713012477719, + "grad_norm": 1.6853438442092112, + "learning_rate": 2.86975906246591e-06, + "loss": 0.1613, + "step": 9594 + }, + { + "epoch": 0.7601505248564072, + "grad_norm": 1.3471935320060047, + "learning_rate": 2.867960150944602e-06, + "loss": 0.1387, + "step": 9595 + }, + { + "epoch": 0.7602297484650425, + "grad_norm": 1.256638804919288, + "learning_rate": 2.8661617090338e-06, + "loss": 0.0964, + "step": 9596 + }, + { + "epoch": 0.760308972073678, + "grad_norm": 1.7778993680672321, + "learning_rate": 2.864363736851922e-06, + "loss": 0.2379, + "step": 9597 + }, + { + "epoch": 0.7603881956823133, + "grad_norm": 1.2997168805276131, + "learning_rate": 2.86256623451736e-06, + "loss": 0.1272, + "step": 9598 + }, + { + "epoch": 0.7604674192909487, + "grad_norm": 1.7084142028835376, + "learning_rate": 2.860769202148468e-06, + "loss": 0.1282, + "step": 9599 + }, + { + "epoch": 0.7605466428995841, + "grad_norm": 1.5557942875260857, + "learning_rate": 2.8589726398635688e-06, + "loss": 0.1508, + "step": 9600 + }, + { + "epoch": 0.7606258665082194, + "grad_norm": 1.3879859302814046, + "learning_rate": 2.8571765477809645e-06, + "loss": 0.167, + "step": 9601 + }, + { + "epoch": 0.7607050901168548, + "grad_norm": 1.7750074451368518, + "learning_rate": 2.8553809260189145e-06, + "loss": 0.1393, + "step": 9602 + }, + { + "epoch": 0.7607843137254902, + "grad_norm": 1.179348197774705, + "learning_rate": 2.8535857746956507e-06, + "loss": 0.094, + "step": 9603 + }, + { + "epoch": 0.7608635373341256, + "grad_norm": 1.4127493906560673, + "learning_rate": 2.8517910939293804e-06, + "loss": 0.139, + "step": 9604 + }, + { + "epoch": 0.7609427609427609, + "grad_norm": 1.2459826528965805, + "learning_rate": 2.849996883838271e-06, + "loss": 0.1002, + "step": 9605 + }, + { + "epoch": 0.7610219845513964, + "grad_norm": 1.6451252235548848, + "learning_rate": 2.8482031445404634e-06, + "loss": 0.1791, + "step": 9606 + }, + { + "epoch": 0.7611012081600317, + "grad_norm": 1.337216408052708, + "learning_rate": 2.8464098761540637e-06, + "loss": 0.1271, + "step": 9607 + }, + { + "epoch": 0.761180431768667, + "grad_norm": 2.023526144744933, + "learning_rate": 2.844617078797155e-06, + "loss": 0.2025, + "step": 9608 + }, + { + "epoch": 0.7612596553773024, + "grad_norm": 1.313264867396571, + "learning_rate": 2.842824752587783e-06, + "loss": 0.1312, + "step": 9609 + }, + { + "epoch": 0.7613388789859378, + "grad_norm": 1.6196252825965052, + "learning_rate": 2.8410328976439595e-06, + "loss": 0.1617, + "step": 9610 + }, + { + "epoch": 0.7614181025945732, + "grad_norm": 1.979528941108533, + "learning_rate": 2.839241514083676e-06, + "loss": 0.2677, + "step": 9611 + }, + { + "epoch": 0.7614973262032085, + "grad_norm": 1.4385663910436548, + "learning_rate": 2.837450602024884e-06, + "loss": 0.18, + "step": 9612 + }, + { + "epoch": 0.761576549811844, + "grad_norm": 1.0780845427989896, + "learning_rate": 2.8356601615855027e-06, + "loss": 0.126, + "step": 9613 + }, + { + "epoch": 0.7616557734204793, + "grad_norm": 1.231273500924223, + "learning_rate": 2.83387019288343e-06, + "loss": 0.1441, + "step": 9614 + }, + { + "epoch": 0.7617349970291146, + "grad_norm": 1.5717574795388507, + "learning_rate": 2.8320806960365234e-06, + "loss": 0.1613, + "step": 9615 + }, + { + "epoch": 0.7618142206377501, + "grad_norm": 0.9814814106489694, + "learning_rate": 2.8302916711626106e-06, + "loss": 0.0814, + "step": 9616 + }, + { + "epoch": 0.7618934442463854, + "grad_norm": 1.5192828646830723, + "learning_rate": 2.8285031183794955e-06, + "loss": 0.2242, + "step": 9617 + }, + { + "epoch": 0.7619726678550208, + "grad_norm": 2.018901698230236, + "learning_rate": 2.8267150378049437e-06, + "loss": 0.2327, + "step": 9618 + }, + { + "epoch": 0.7620518914636562, + "grad_norm": 1.2212080627070687, + "learning_rate": 2.8249274295566863e-06, + "loss": 0.1378, + "step": 9619 + }, + { + "epoch": 0.7621311150722916, + "grad_norm": 1.5463847352459796, + "learning_rate": 2.823140293752441e-06, + "loss": 0.1649, + "step": 9620 + }, + { + "epoch": 0.7622103386809269, + "grad_norm": 1.44265243264647, + "learning_rate": 2.821353630509871e-06, + "loss": 0.1677, + "step": 9621 + }, + { + "epoch": 0.7622895622895622, + "grad_norm": 1.306445105055589, + "learning_rate": 2.819567439946621e-06, + "loss": 0.1226, + "step": 9622 + }, + { + "epoch": 0.7623687858981977, + "grad_norm": 1.160189404636993, + "learning_rate": 2.8177817221803074e-06, + "loss": 0.0915, + "step": 9623 + }, + { + "epoch": 0.762448009506833, + "grad_norm": 1.5325320464922008, + "learning_rate": 2.8159964773285074e-06, + "loss": 0.1163, + "step": 9624 + }, + { + "epoch": 0.7625272331154684, + "grad_norm": 1.22957711943427, + "learning_rate": 2.8142117055087704e-06, + "loss": 0.106, + "step": 9625 + }, + { + "epoch": 0.7626064567241038, + "grad_norm": 2.23182605516539, + "learning_rate": 2.8124274068386203e-06, + "loss": 0.2043, + "step": 9626 + }, + { + "epoch": 0.7626856803327392, + "grad_norm": 1.422011298494945, + "learning_rate": 2.8106435814355404e-06, + "loss": 0.1611, + "step": 9627 + }, + { + "epoch": 0.7627649039413745, + "grad_norm": 1.7184536338110958, + "learning_rate": 2.808860229416984e-06, + "loss": 0.2071, + "step": 9628 + }, + { + "epoch": 0.7628441275500099, + "grad_norm": 1.694648239779594, + "learning_rate": 2.8070773509003846e-06, + "loss": 0.1873, + "step": 9629 + }, + { + "epoch": 0.7629233511586453, + "grad_norm": 1.4848059830422866, + "learning_rate": 2.80529494600313e-06, + "loss": 0.2009, + "step": 9630 + }, + { + "epoch": 0.7630025747672806, + "grad_norm": 1.324079533465585, + "learning_rate": 2.8035130148425847e-06, + "loss": 0.1172, + "step": 9631 + }, + { + "epoch": 0.763081798375916, + "grad_norm": 1.3923843040518629, + "learning_rate": 2.801731557536078e-06, + "loss": 0.1434, + "step": 9632 + }, + { + "epoch": 0.7631610219845514, + "grad_norm": 1.5433305862693556, + "learning_rate": 2.799950574200915e-06, + "loss": 0.1658, + "step": 9633 + }, + { + "epoch": 0.7632402455931868, + "grad_norm": 1.3450891535687375, + "learning_rate": 2.7981700649543618e-06, + "loss": 0.1416, + "step": 9634 + }, + { + "epoch": 0.7633194692018221, + "grad_norm": 1.5095267673182209, + "learning_rate": 2.796390029913655e-06, + "loss": 0.162, + "step": 9635 + }, + { + "epoch": 0.7633986928104575, + "grad_norm": 1.8925942350968739, + "learning_rate": 2.794610469196004e-06, + "loss": 0.1841, + "step": 9636 + }, + { + "epoch": 0.7634779164190929, + "grad_norm": 1.4807924470603768, + "learning_rate": 2.792831382918585e-06, + "loss": 0.2163, + "step": 9637 + }, + { + "epoch": 0.7635571400277282, + "grad_norm": 1.4452815585896193, + "learning_rate": 2.791052771198538e-06, + "loss": 0.1112, + "step": 9638 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 1.5232367235770772, + "learning_rate": 2.7892746341529807e-06, + "loss": 0.192, + "step": 9639 + }, + { + "epoch": 0.763715587244999, + "grad_norm": 1.3599665957013363, + "learning_rate": 2.7874969718989943e-06, + "loss": 0.1521, + "step": 9640 + }, + { + "epoch": 0.7637948108536344, + "grad_norm": 1.4447732718014117, + "learning_rate": 2.785719784553624e-06, + "loss": 0.1947, + "step": 9641 + }, + { + "epoch": 0.7638740344622698, + "grad_norm": 1.5621212162892377, + "learning_rate": 2.7839430722338956e-06, + "loss": 0.13, + "step": 9642 + }, + { + "epoch": 0.7639532580709051, + "grad_norm": 1.2877905146296533, + "learning_rate": 2.7821668350567956e-06, + "loss": 0.1647, + "step": 9643 + }, + { + "epoch": 0.7640324816795405, + "grad_norm": 1.9262421814794235, + "learning_rate": 2.7803910731392757e-06, + "loss": 0.2392, + "step": 9644 + }, + { + "epoch": 0.7641117052881758, + "grad_norm": 1.2307735896125025, + "learning_rate": 2.778615786598269e-06, + "loss": 0.1711, + "step": 9645 + }, + { + "epoch": 0.7641909288968113, + "grad_norm": 1.2177517200257129, + "learning_rate": 2.776840975550664e-06, + "loss": 0.1363, + "step": 9646 + }, + { + "epoch": 0.7642701525054466, + "grad_norm": 1.5107929969953622, + "learning_rate": 2.7750666401133263e-06, + "loss": 0.1369, + "step": 9647 + }, + { + "epoch": 0.764349376114082, + "grad_norm": 1.2828901367921461, + "learning_rate": 2.773292780403083e-06, + "loss": 0.1163, + "step": 9648 + }, + { + "epoch": 0.7644285997227174, + "grad_norm": 1.3996960526650157, + "learning_rate": 2.7715193965367403e-06, + "loss": 0.1517, + "step": 9649 + }, + { + "epoch": 0.7645078233313527, + "grad_norm": 1.5365459237715637, + "learning_rate": 2.769746488631064e-06, + "loss": 0.1869, + "step": 9650 + }, + { + "epoch": 0.7645870469399881, + "grad_norm": 1.5929381783597985, + "learning_rate": 2.767974056802789e-06, + "loss": 0.1539, + "step": 9651 + }, + { + "epoch": 0.7646662705486235, + "grad_norm": 1.5861360416566839, + "learning_rate": 2.766202101168628e-06, + "loss": 0.1722, + "step": 9652 + }, + { + "epoch": 0.7647454941572589, + "grad_norm": 1.196894639185933, + "learning_rate": 2.76443062184525e-06, + "loss": 0.1536, + "step": 9653 + }, + { + "epoch": 0.7648247177658942, + "grad_norm": 1.7041088402699311, + "learning_rate": 2.7626596189492983e-06, + "loss": 0.1845, + "step": 9654 + }, + { + "epoch": 0.7649039413745297, + "grad_norm": 1.577181474857777, + "learning_rate": 2.76088909259739e-06, + "loss": 0.1542, + "step": 9655 + }, + { + "epoch": 0.764983164983165, + "grad_norm": 1.4651262723903933, + "learning_rate": 2.7591190429061023e-06, + "loss": 0.1305, + "step": 9656 + }, + { + "epoch": 0.7650623885918003, + "grad_norm": 1.7243378428260292, + "learning_rate": 2.757349469991981e-06, + "loss": 0.1935, + "step": 9657 + }, + { + "epoch": 0.7651416122004358, + "grad_norm": 1.1589142122615406, + "learning_rate": 2.7555803739715512e-06, + "loss": 0.141, + "step": 9658 + }, + { + "epoch": 0.7652208358090711, + "grad_norm": 1.4854231604298274, + "learning_rate": 2.7538117549612963e-06, + "loss": 0.1379, + "step": 9659 + }, + { + "epoch": 0.7653000594177065, + "grad_norm": 1.7407534740746045, + "learning_rate": 2.752043613077667e-06, + "loss": 0.1274, + "step": 9660 + }, + { + "epoch": 0.7653792830263418, + "grad_norm": 1.5712677410420897, + "learning_rate": 2.7502759484370946e-06, + "loss": 0.1264, + "step": 9661 + }, + { + "epoch": 0.7654585066349773, + "grad_norm": 1.7000941047417002, + "learning_rate": 2.748508761155967e-06, + "loss": 0.1915, + "step": 9662 + }, + { + "epoch": 0.7655377302436126, + "grad_norm": 1.9445900370370082, + "learning_rate": 2.746742051350646e-06, + "loss": 0.1806, + "step": 9663 + }, + { + "epoch": 0.7656169538522479, + "grad_norm": 1.7703423683995427, + "learning_rate": 2.7449758191374574e-06, + "loss": 0.2514, + "step": 9664 + }, + { + "epoch": 0.7656961774608834, + "grad_norm": 1.3939962870276839, + "learning_rate": 2.7432100646327043e-06, + "loss": 0.1645, + "step": 9665 + }, + { + "epoch": 0.7657754010695187, + "grad_norm": 1.7172410780905545, + "learning_rate": 2.7414447879526517e-06, + "loss": 0.1765, + "step": 9666 + }, + { + "epoch": 0.7658546246781541, + "grad_norm": 1.6818708075775044, + "learning_rate": 2.739679989213532e-06, + "loss": 0.2081, + "step": 9667 + }, + { + "epoch": 0.7659338482867895, + "grad_norm": 1.6190776937720763, + "learning_rate": 2.7379156685315523e-06, + "loss": 0.1101, + "step": 9668 + }, + { + "epoch": 0.7660130718954249, + "grad_norm": 1.2116929712336613, + "learning_rate": 2.7361518260228827e-06, + "loss": 0.1321, + "step": 9669 + }, + { + "epoch": 0.7660922955040602, + "grad_norm": 1.2525606397892568, + "learning_rate": 2.734388461803661e-06, + "loss": 0.1486, + "step": 9670 + }, + { + "epoch": 0.7661715191126955, + "grad_norm": 1.1670401297199537, + "learning_rate": 2.7326255759900024e-06, + "loss": 0.1175, + "step": 9671 + }, + { + "epoch": 0.766250742721331, + "grad_norm": 1.2527933234253887, + "learning_rate": 2.7308631686979816e-06, + "loss": 0.1383, + "step": 9672 + }, + { + "epoch": 0.7663299663299663, + "grad_norm": 1.3854187710121246, + "learning_rate": 2.7291012400436414e-06, + "loss": 0.1162, + "step": 9673 + }, + { + "epoch": 0.7664091899386017, + "grad_norm": 1.4693335401184864, + "learning_rate": 2.7273397901430023e-06, + "loss": 0.1125, + "step": 9674 + }, + { + "epoch": 0.7664884135472371, + "grad_norm": 1.4071235110547582, + "learning_rate": 2.7255788191120435e-06, + "loss": 0.1585, + "step": 9675 + }, + { + "epoch": 0.7665676371558724, + "grad_norm": 1.4597427004785417, + "learning_rate": 2.723818327066717e-06, + "loss": 0.1484, + "step": 9676 + }, + { + "epoch": 0.7666468607645078, + "grad_norm": 1.9686593105675911, + "learning_rate": 2.722058314122941e-06, + "loss": 0.1512, + "step": 9677 + }, + { + "epoch": 0.7667260843731432, + "grad_norm": 1.5239753346238663, + "learning_rate": 2.7202987803966073e-06, + "loss": 0.1411, + "step": 9678 + }, + { + "epoch": 0.7668053079817786, + "grad_norm": 1.3758904014543183, + "learning_rate": 2.718539726003573e-06, + "loss": 0.1966, + "step": 9679 + }, + { + "epoch": 0.7668845315904139, + "grad_norm": 1.4502387354498574, + "learning_rate": 2.7167811510596577e-06, + "loss": 0.1458, + "step": 9680 + }, + { + "epoch": 0.7669637551990494, + "grad_norm": 1.7872649606072382, + "learning_rate": 2.715023055680661e-06, + "loss": 0.1841, + "step": 9681 + }, + { + "epoch": 0.7670429788076847, + "grad_norm": 1.4927818796313443, + "learning_rate": 2.7132654399823444e-06, + "loss": 0.1733, + "step": 9682 + }, + { + "epoch": 0.76712220241632, + "grad_norm": 1.503873075843439, + "learning_rate": 2.7115083040804337e-06, + "loss": 0.1893, + "step": 9683 + }, + { + "epoch": 0.7672014260249554, + "grad_norm": 1.769570104840266, + "learning_rate": 2.709751648090634e-06, + "loss": 0.1411, + "step": 9684 + }, + { + "epoch": 0.7672806496335908, + "grad_norm": 1.4620511932663662, + "learning_rate": 2.7079954721286108e-06, + "loss": 0.1342, + "step": 9685 + }, + { + "epoch": 0.7673598732422262, + "grad_norm": 1.4318126117244618, + "learning_rate": 2.7062397763099945e-06, + "loss": 0.187, + "step": 9686 + }, + { + "epoch": 0.7674390968508615, + "grad_norm": 1.338005773122725, + "learning_rate": 2.7044845607503967e-06, + "loss": 0.1443, + "step": 9687 + }, + { + "epoch": 0.767518320459497, + "grad_norm": 1.8021723454150782, + "learning_rate": 2.7027298255653878e-06, + "loss": 0.2071, + "step": 9688 + }, + { + "epoch": 0.7675975440681323, + "grad_norm": 1.6713797971425046, + "learning_rate": 2.700975570870503e-06, + "loss": 0.158, + "step": 9689 + }, + { + "epoch": 0.7676767676767676, + "grad_norm": 1.4369677431522412, + "learning_rate": 2.6992217967812606e-06, + "loss": 0.1794, + "step": 9690 + }, + { + "epoch": 0.7677559912854031, + "grad_norm": 1.9197999100199266, + "learning_rate": 2.697468503413134e-06, + "loss": 0.2019, + "step": 9691 + }, + { + "epoch": 0.7678352148940384, + "grad_norm": 1.9034338866402514, + "learning_rate": 2.6957156908815684e-06, + "loss": 0.1579, + "step": 9692 + }, + { + "epoch": 0.7679144385026738, + "grad_norm": 1.5096202116532995, + "learning_rate": 2.6939633593019754e-06, + "loss": 0.1723, + "step": 9693 + }, + { + "epoch": 0.7679936621113091, + "grad_norm": 1.7017743281171718, + "learning_rate": 2.692211508789744e-06, + "loss": 0.1803, + "step": 9694 + }, + { + "epoch": 0.7680728857199446, + "grad_norm": 1.8780963774433932, + "learning_rate": 2.6904601394602216e-06, + "loss": 0.2148, + "step": 9695 + }, + { + "epoch": 0.7681521093285799, + "grad_norm": 1.9397399745656363, + "learning_rate": 2.688709251428725e-06, + "loss": 0.188, + "step": 9696 + }, + { + "epoch": 0.7682313329372152, + "grad_norm": 1.69676072172985, + "learning_rate": 2.6869588448105475e-06, + "loss": 0.169, + "step": 9697 + }, + { + "epoch": 0.7683105565458507, + "grad_norm": 1.1825123664861208, + "learning_rate": 2.685208919720942e-06, + "loss": 0.1625, + "step": 9698 + }, + { + "epoch": 0.768389780154486, + "grad_norm": 1.436483432632273, + "learning_rate": 2.683459476275133e-06, + "loss": 0.2066, + "step": 9699 + }, + { + "epoch": 0.7684690037631214, + "grad_norm": 1.5564603859537167, + "learning_rate": 2.6817105145883117e-06, + "loss": 0.1939, + "step": 9700 + }, + { + "epoch": 0.7685482273717568, + "grad_norm": 2.280417242409304, + "learning_rate": 2.6799620347756407e-06, + "loss": 0.1575, + "step": 9701 + }, + { + "epoch": 0.7686274509803922, + "grad_norm": 1.931068521887391, + "learning_rate": 2.6782140369522435e-06, + "loss": 0.2388, + "step": 9702 + }, + { + "epoch": 0.7687066745890275, + "grad_norm": 1.4331824951222825, + "learning_rate": 2.676466521233225e-06, + "loss": 0.1809, + "step": 9703 + }, + { + "epoch": 0.7687858981976629, + "grad_norm": 1.5840426542635977, + "learning_rate": 2.674719487733649e-06, + "loss": 0.163, + "step": 9704 + }, + { + "epoch": 0.7688651218062983, + "grad_norm": 1.3173347634449153, + "learning_rate": 2.672972936568543e-06, + "loss": 0.1664, + "step": 9705 + }, + { + "epoch": 0.7689443454149336, + "grad_norm": 1.6319341472274438, + "learning_rate": 2.6712268678529187e-06, + "loss": 0.1847, + "step": 9706 + }, + { + "epoch": 0.769023569023569, + "grad_norm": 1.471449514737636, + "learning_rate": 2.669481281701739e-06, + "loss": 0.144, + "step": 9707 + }, + { + "epoch": 0.7691027926322044, + "grad_norm": 1.3948400133422167, + "learning_rate": 2.6677361782299437e-06, + "loss": 0.1327, + "step": 9708 + }, + { + "epoch": 0.7691820162408398, + "grad_norm": 1.3903835026277642, + "learning_rate": 2.665991557552442e-06, + "loss": 0.1632, + "step": 9709 + }, + { + "epoch": 0.7692612398494751, + "grad_norm": 1.6207797290057946, + "learning_rate": 2.6642474197841086e-06, + "loss": 0.2303, + "step": 9710 + }, + { + "epoch": 0.7693404634581105, + "grad_norm": 1.8834141426008586, + "learning_rate": 2.6625037650397812e-06, + "loss": 0.1247, + "step": 9711 + }, + { + "epoch": 0.7694196870667459, + "grad_norm": 1.3870334523208032, + "learning_rate": 2.6607605934342785e-06, + "loss": 0.1609, + "step": 9712 + }, + { + "epoch": 0.7694989106753812, + "grad_norm": 1.7396748490946308, + "learning_rate": 2.659017905082376e-06, + "loss": 0.221, + "step": 9713 + }, + { + "epoch": 0.7695781342840167, + "grad_norm": 1.240843454545499, + "learning_rate": 2.657275700098819e-06, + "loss": 0.1665, + "step": 9714 + }, + { + "epoch": 0.769657357892652, + "grad_norm": 1.4015503651814274, + "learning_rate": 2.65553397859833e-06, + "loss": 0.1737, + "step": 9715 + }, + { + "epoch": 0.7697365815012874, + "grad_norm": 1.5220039389197275, + "learning_rate": 2.6537927406955888e-06, + "loss": 0.1305, + "step": 9716 + }, + { + "epoch": 0.7698158051099228, + "grad_norm": 1.2478083610677968, + "learning_rate": 2.6520519865052476e-06, + "loss": 0.0952, + "step": 9717 + }, + { + "epoch": 0.7698950287185581, + "grad_norm": 1.9702254997979598, + "learning_rate": 2.6503117161419246e-06, + "loss": 0.1986, + "step": 9718 + }, + { + "epoch": 0.7699742523271935, + "grad_norm": 1.0945501678400622, + "learning_rate": 2.6485719297202127e-06, + "loss": 0.0925, + "step": 9719 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 1.5997961895467323, + "learning_rate": 2.646832627354667e-06, + "loss": 0.1908, + "step": 9720 + }, + { + "epoch": 0.7701326995444643, + "grad_norm": 1.088008676941749, + "learning_rate": 2.645093809159809e-06, + "loss": 0.151, + "step": 9721 + }, + { + "epoch": 0.7702119231530996, + "grad_norm": 1.48723249216091, + "learning_rate": 2.643355475250137e-06, + "loss": 0.2438, + "step": 9722 + }, + { + "epoch": 0.770291146761735, + "grad_norm": 1.4516588297601196, + "learning_rate": 2.6416176257401083e-06, + "loss": 0.1821, + "step": 9723 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 1.6012799335730787, + "learning_rate": 2.639880260744151e-06, + "loss": 0.1535, + "step": 9724 + }, + { + "epoch": 0.7704495939790057, + "grad_norm": 1.891786305396332, + "learning_rate": 2.6381433803766654e-06, + "loss": 0.2046, + "step": 9725 + }, + { + "epoch": 0.7705288175876411, + "grad_norm": 1.5622293649243357, + "learning_rate": 2.6364069847520155e-06, + "loss": 0.1396, + "step": 9726 + }, + { + "epoch": 0.7706080411962765, + "grad_norm": 1.0209495961864432, + "learning_rate": 2.6346710739845317e-06, + "loss": 0.0924, + "step": 9727 + }, + { + "epoch": 0.7706872648049119, + "grad_norm": 1.3425405787146187, + "learning_rate": 2.6329356481885215e-06, + "loss": 0.1175, + "step": 9728 + }, + { + "epoch": 0.7707664884135472, + "grad_norm": 1.34725029498791, + "learning_rate": 2.6312007074782497e-06, + "loss": 0.1341, + "step": 9729 + }, + { + "epoch": 0.7708457120221827, + "grad_norm": 1.9182872177858583, + "learning_rate": 2.6294662519679525e-06, + "loss": 0.1947, + "step": 9730 + }, + { + "epoch": 0.770924935630818, + "grad_norm": 1.595612126280618, + "learning_rate": 2.627732281771841e-06, + "loss": 0.0958, + "step": 9731 + }, + { + "epoch": 0.7710041592394533, + "grad_norm": 1.2510076883521055, + "learning_rate": 2.6259987970040858e-06, + "loss": 0.1062, + "step": 9732 + }, + { + "epoch": 0.7710833828480887, + "grad_norm": 1.414299427576783, + "learning_rate": 2.6242657977788277e-06, + "loss": 0.1438, + "step": 9733 + }, + { + "epoch": 0.7711626064567241, + "grad_norm": 2.071766309134072, + "learning_rate": 2.6225332842101746e-06, + "loss": 0.1573, + "step": 9734 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 1.52970541262464, + "learning_rate": 2.6208012564122097e-06, + "loss": 0.1198, + "step": 9735 + }, + { + "epoch": 0.7713210536739948, + "grad_norm": 2.2384678881550117, + "learning_rate": 2.6190697144989753e-06, + "loss": 0.1716, + "step": 9736 + }, + { + "epoch": 0.7714002772826303, + "grad_norm": 1.4394732609086163, + "learning_rate": 2.617338658584483e-06, + "loss": 0.1751, + "step": 9737 + }, + { + "epoch": 0.7714795008912656, + "grad_norm": 1.3999492509883333, + "learning_rate": 2.6156080887827183e-06, + "loss": 0.1335, + "step": 9738 + }, + { + "epoch": 0.7715587244999009, + "grad_norm": 1.481491709436734, + "learning_rate": 2.613878005207631e-06, + "loss": 0.2121, + "step": 9739 + }, + { + "epoch": 0.7716379481085364, + "grad_norm": 1.2792671007799559, + "learning_rate": 2.612148407973134e-06, + "loss": 0.0851, + "step": 9740 + }, + { + "epoch": 0.7717171717171717, + "grad_norm": 1.6363438372669241, + "learning_rate": 2.6104192971931197e-06, + "loss": 0.1794, + "step": 9741 + }, + { + "epoch": 0.7717963953258071, + "grad_norm": 1.6994675334890936, + "learning_rate": 2.6086906729814378e-06, + "loss": 0.1927, + "step": 9742 + }, + { + "epoch": 0.7718756189344425, + "grad_norm": 1.6619972490158255, + "learning_rate": 2.606962535451907e-06, + "loss": 0.1306, + "step": 9743 + }, + { + "epoch": 0.7719548425430779, + "grad_norm": 1.4037253512727226, + "learning_rate": 2.605234884718324e-06, + "loss": 0.119, + "step": 9744 + }, + { + "epoch": 0.7720340661517132, + "grad_norm": 1.5833786631067743, + "learning_rate": 2.6035077208944416e-06, + "loss": 0.1898, + "step": 9745 + }, + { + "epoch": 0.7721132897603485, + "grad_norm": 1.230300162259317, + "learning_rate": 2.601781044093984e-06, + "loss": 0.0982, + "step": 9746 + }, + { + "epoch": 0.772192513368984, + "grad_norm": 1.650317289533404, + "learning_rate": 2.600054854430649e-06, + "loss": 0.1565, + "step": 9747 + }, + { + "epoch": 0.7722717369776193, + "grad_norm": 1.851354845951338, + "learning_rate": 2.5983291520180965e-06, + "loss": 0.2274, + "step": 9748 + }, + { + "epoch": 0.7723509605862547, + "grad_norm": 1.9176741520932536, + "learning_rate": 2.5966039369699537e-06, + "loss": 0.2083, + "step": 9749 + }, + { + "epoch": 0.7724301841948901, + "grad_norm": 1.6893577304842577, + "learning_rate": 2.5948792093998167e-06, + "loss": 0.2014, + "step": 9750 + }, + { + "epoch": 0.7725094078035255, + "grad_norm": 1.6994842460804007, + "learning_rate": 2.5931549694212545e-06, + "loss": 0.1654, + "step": 9751 + }, + { + "epoch": 0.7725886314121608, + "grad_norm": 1.5169542108974605, + "learning_rate": 2.5914312171477983e-06, + "loss": 0.1598, + "step": 9752 + }, + { + "epoch": 0.7726678550207962, + "grad_norm": 1.3492658627299086, + "learning_rate": 2.589707952692947e-06, + "loss": 0.1058, + "step": 9753 + }, + { + "epoch": 0.7727470786294316, + "grad_norm": 1.4680193746625914, + "learning_rate": 2.5879851761701724e-06, + "loss": 0.1338, + "step": 9754 + }, + { + "epoch": 0.7728263022380669, + "grad_norm": 1.0322066829724559, + "learning_rate": 2.586262887692911e-06, + "loss": 0.0845, + "step": 9755 + }, + { + "epoch": 0.7729055258467024, + "grad_norm": 1.2500200533604264, + "learning_rate": 2.5845410873745614e-06, + "loss": 0.158, + "step": 9756 + }, + { + "epoch": 0.7729847494553377, + "grad_norm": 1.9357968267628918, + "learning_rate": 2.5828197753285043e-06, + "loss": 0.218, + "step": 9757 + }, + { + "epoch": 0.773063973063973, + "grad_norm": 1.4909328794529666, + "learning_rate": 2.581098951668075e-06, + "loss": 0.1838, + "step": 9758 + }, + { + "epoch": 0.7731431966726084, + "grad_norm": 1.8063545906846683, + "learning_rate": 2.5793786165065805e-06, + "loss": 0.1952, + "step": 9759 + }, + { + "epoch": 0.7732224202812438, + "grad_norm": 1.4321678771562907, + "learning_rate": 2.5776587699573007e-06, + "loss": 0.132, + "step": 9760 + }, + { + "epoch": 0.7733016438898792, + "grad_norm": 1.542022634786877, + "learning_rate": 2.5759394121334767e-06, + "loss": 0.1715, + "step": 9761 + }, + { + "epoch": 0.7733808674985145, + "grad_norm": 1.2097978664145081, + "learning_rate": 2.57422054314832e-06, + "loss": 0.1103, + "step": 9762 + }, + { + "epoch": 0.77346009110715, + "grad_norm": 1.4296828557359313, + "learning_rate": 2.572502163115007e-06, + "loss": 0.1637, + "step": 9763 + }, + { + "epoch": 0.7735393147157853, + "grad_norm": 1.887411540859743, + "learning_rate": 2.5707842721466914e-06, + "loss": 0.2534, + "step": 9764 + }, + { + "epoch": 0.7736185383244206, + "grad_norm": 1.5129329834481435, + "learning_rate": 2.5690668703564835e-06, + "loss": 0.1738, + "step": 9765 + }, + { + "epoch": 0.7736977619330561, + "grad_norm": 2.367945613928465, + "learning_rate": 2.5673499578574644e-06, + "loss": 0.1966, + "step": 9766 + }, + { + "epoch": 0.7737769855416914, + "grad_norm": 1.7368737694408045, + "learning_rate": 2.565633534762689e-06, + "loss": 0.1786, + "step": 9767 + }, + { + "epoch": 0.7738562091503268, + "grad_norm": 1.8590987513742387, + "learning_rate": 2.5639176011851753e-06, + "loss": 0.2168, + "step": 9768 + }, + { + "epoch": 0.7739354327589621, + "grad_norm": 1.5926198048873477, + "learning_rate": 2.562202157237903e-06, + "loss": 0.182, + "step": 9769 + }, + { + "epoch": 0.7740146563675976, + "grad_norm": 1.9708474688096471, + "learning_rate": 2.5604872030338336e-06, + "loss": 0.1548, + "step": 9770 + }, + { + "epoch": 0.7740938799762329, + "grad_norm": 1.2991272912314111, + "learning_rate": 2.5587727386858853e-06, + "loss": 0.1491, + "step": 9771 + }, + { + "epoch": 0.7741731035848682, + "grad_norm": 1.5712179169661151, + "learning_rate": 2.5570587643069435e-06, + "loss": 0.172, + "step": 9772 + }, + { + "epoch": 0.7742523271935037, + "grad_norm": 1.879258315975605, + "learning_rate": 2.555345280009872e-06, + "loss": 0.1936, + "step": 9773 + }, + { + "epoch": 0.774331550802139, + "grad_norm": 1.454695856540906, + "learning_rate": 2.5536322859074934e-06, + "loss": 0.1547, + "step": 9774 + }, + { + "epoch": 0.7744107744107744, + "grad_norm": 1.7214091516200096, + "learning_rate": 2.551919782112596e-06, + "loss": 0.2404, + "step": 9775 + }, + { + "epoch": 0.7744899980194098, + "grad_norm": 1.55822482979566, + "learning_rate": 2.550207768737949e-06, + "loss": 0.236, + "step": 9776 + }, + { + "epoch": 0.7745692216280452, + "grad_norm": 1.7239517012712033, + "learning_rate": 2.54849624589627e-06, + "loss": 0.208, + "step": 9777 + }, + { + "epoch": 0.7746484452366805, + "grad_norm": 1.4791335434611836, + "learning_rate": 2.546785213700258e-06, + "loss": 0.1344, + "step": 9778 + }, + { + "epoch": 0.7747276688453159, + "grad_norm": 1.2374391205197226, + "learning_rate": 2.5450746722625785e-06, + "loss": 0.0977, + "step": 9779 + }, + { + "epoch": 0.7748068924539513, + "grad_norm": 1.346750574605836, + "learning_rate": 2.5433646216958617e-06, + "loss": 0.1221, + "step": 9780 + }, + { + "epoch": 0.7748861160625866, + "grad_norm": 1.4056220772094195, + "learning_rate": 2.5416550621127024e-06, + "loss": 0.126, + "step": 9781 + }, + { + "epoch": 0.774965339671222, + "grad_norm": 1.2451002382816594, + "learning_rate": 2.539945993625673e-06, + "loss": 0.0978, + "step": 9782 + }, + { + "epoch": 0.7750445632798574, + "grad_norm": 1.8334259757084481, + "learning_rate": 2.5382374163473046e-06, + "loss": 0.1752, + "step": 9783 + }, + { + "epoch": 0.7751237868884928, + "grad_norm": 1.20400435301815, + "learning_rate": 2.536529330390095e-06, + "loss": 0.1277, + "step": 9784 + }, + { + "epoch": 0.7752030104971281, + "grad_norm": 1.7495749516651284, + "learning_rate": 2.5348217358665207e-06, + "loss": 0.1773, + "step": 9785 + }, + { + "epoch": 0.7752822341057635, + "grad_norm": 1.7867115916421914, + "learning_rate": 2.5331146328890145e-06, + "loss": 0.1887, + "step": 9786 + }, + { + "epoch": 0.7753614577143989, + "grad_norm": 1.437863090871042, + "learning_rate": 2.5314080215699822e-06, + "loss": 0.1681, + "step": 9787 + }, + { + "epoch": 0.7754406813230342, + "grad_norm": 1.7774223714942095, + "learning_rate": 2.5297019020217904e-06, + "loss": 0.1984, + "step": 9788 + }, + { + "epoch": 0.7755199049316697, + "grad_norm": 1.3875515800358615, + "learning_rate": 2.5279962743567877e-06, + "loss": 0.146, + "step": 9789 + }, + { + "epoch": 0.775599128540305, + "grad_norm": 1.6204829785531747, + "learning_rate": 2.526291138687278e-06, + "loss": 0.1234, + "step": 9790 + }, + { + "epoch": 0.7756783521489404, + "grad_norm": 1.9933384651096921, + "learning_rate": 2.5245864951255317e-06, + "loss": 0.261, + "step": 9791 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 1.9217208451848666, + "learning_rate": 2.522882343783799e-06, + "loss": 0.1659, + "step": 9792 + }, + { + "epoch": 0.7758367993662111, + "grad_norm": 1.4731265490539756, + "learning_rate": 2.521178684774286e-06, + "loss": 0.1284, + "step": 9793 + }, + { + "epoch": 0.7759160229748465, + "grad_norm": 1.998911956802136, + "learning_rate": 2.519475518209167e-06, + "loss": 0.2368, + "step": 9794 + }, + { + "epoch": 0.7759952465834818, + "grad_norm": 1.5012895065101473, + "learning_rate": 2.5177728442005956e-06, + "loss": 0.1253, + "step": 9795 + }, + { + "epoch": 0.7760744701921173, + "grad_norm": 1.8593080920882146, + "learning_rate": 2.516070662860679e-06, + "loss": 0.2678, + "step": 9796 + }, + { + "epoch": 0.7761536938007526, + "grad_norm": 1.4973455566323732, + "learning_rate": 2.5143689743014966e-06, + "loss": 0.1667, + "step": 9797 + }, + { + "epoch": 0.776232917409388, + "grad_norm": 1.369211965989727, + "learning_rate": 2.5126677786351005e-06, + "loss": 0.1414, + "step": 9798 + }, + { + "epoch": 0.7763121410180234, + "grad_norm": 1.778001705352899, + "learning_rate": 2.5109670759735063e-06, + "loss": 0.1435, + "step": 9799 + }, + { + "epoch": 0.7763913646266587, + "grad_norm": 1.312856658962155, + "learning_rate": 2.509266866428691e-06, + "loss": 0.1456, + "step": 9800 + }, + { + "epoch": 0.7764705882352941, + "grad_norm": 1.477858546155191, + "learning_rate": 2.507567150112613e-06, + "loss": 0.152, + "step": 9801 + }, + { + "epoch": 0.7765498118439295, + "grad_norm": 1.8026840185953654, + "learning_rate": 2.5058679271371865e-06, + "loss": 0.17, + "step": 9802 + }, + { + "epoch": 0.7766290354525649, + "grad_norm": 1.3915835459925203, + "learning_rate": 2.504169197614298e-06, + "loss": 0.1409, + "step": 9803 + }, + { + "epoch": 0.7767082590612002, + "grad_norm": 1.1212030389828769, + "learning_rate": 2.5024709616557964e-06, + "loss": 0.0943, + "step": 9804 + }, + { + "epoch": 0.7767874826698357, + "grad_norm": 1.4732687007129783, + "learning_rate": 2.500773219373509e-06, + "loss": 0.1908, + "step": 9805 + }, + { + "epoch": 0.776866706278471, + "grad_norm": 1.2377783005686418, + "learning_rate": 2.499075970879222e-06, + "loss": 0.1277, + "step": 9806 + }, + { + "epoch": 0.7769459298871063, + "grad_norm": 1.41121346586343, + "learning_rate": 2.4973792162846878e-06, + "loss": 0.1617, + "step": 9807 + }, + { + "epoch": 0.7770251534957417, + "grad_norm": 1.6896059339122411, + "learning_rate": 2.4956829557016336e-06, + "loss": 0.2196, + "step": 9808 + }, + { + "epoch": 0.7771043771043771, + "grad_norm": 1.3312844030840447, + "learning_rate": 2.493987189241749e-06, + "loss": 0.1119, + "step": 9809 + }, + { + "epoch": 0.7771836007130125, + "grad_norm": 1.5314673577289795, + "learning_rate": 2.4922919170166883e-06, + "loss": 0.202, + "step": 9810 + }, + { + "epoch": 0.7772628243216478, + "grad_norm": 1.5521891195803676, + "learning_rate": 2.4905971391380823e-06, + "loss": 0.1698, + "step": 9811 + }, + { + "epoch": 0.7773420479302833, + "grad_norm": 1.4298165974024264, + "learning_rate": 2.488902855717522e-06, + "loss": 0.1531, + "step": 9812 + }, + { + "epoch": 0.7774212715389186, + "grad_norm": 1.6079443450505917, + "learning_rate": 2.487209066866565e-06, + "loss": 0.1626, + "step": 9813 + }, + { + "epoch": 0.7775004951475539, + "grad_norm": 1.479766326797685, + "learning_rate": 2.485515772696745e-06, + "loss": 0.1802, + "step": 9814 + }, + { + "epoch": 0.7775797187561894, + "grad_norm": 1.9511307720203546, + "learning_rate": 2.483822973319553e-06, + "loss": 0.2114, + "step": 9815 + }, + { + "epoch": 0.7776589423648247, + "grad_norm": 1.6470902883172156, + "learning_rate": 2.482130668846451e-06, + "loss": 0.1782, + "step": 9816 + }, + { + "epoch": 0.7777381659734601, + "grad_norm": 1.4050107087219839, + "learning_rate": 2.480438859388873e-06, + "loss": 0.1364, + "step": 9817 + }, + { + "epoch": 0.7778173895820955, + "grad_norm": 1.5688834229333537, + "learning_rate": 2.4787475450582133e-06, + "loss": 0.1442, + "step": 9818 + }, + { + "epoch": 0.7778966131907309, + "grad_norm": 1.3212826997733387, + "learning_rate": 2.4770567259658386e-06, + "loss": 0.1381, + "step": 9819 + }, + { + "epoch": 0.7779758367993662, + "grad_norm": 1.4963818967637699, + "learning_rate": 2.4753664022230783e-06, + "loss": 0.1521, + "step": 9820 + }, + { + "epoch": 0.7780550604080015, + "grad_norm": 1.2425808022257758, + "learning_rate": 2.473676573941236e-06, + "loss": 0.1223, + "step": 9821 + }, + { + "epoch": 0.778134284016637, + "grad_norm": 1.3869245581232976, + "learning_rate": 2.471987241231577e-06, + "loss": 0.1673, + "step": 9822 + }, + { + "epoch": 0.7782135076252723, + "grad_norm": 1.4553644321840447, + "learning_rate": 2.4702984042053335e-06, + "loss": 0.1657, + "step": 9823 + }, + { + "epoch": 0.7782927312339077, + "grad_norm": 1.4555058988367506, + "learning_rate": 2.468610062973712e-06, + "loss": 0.1056, + "step": 9824 + }, + { + "epoch": 0.7783719548425431, + "grad_norm": 1.418367629692807, + "learning_rate": 2.466922217647879e-06, + "loss": 0.1343, + "step": 9825 + }, + { + "epoch": 0.7784511784511785, + "grad_norm": 1.563017698175236, + "learning_rate": 2.465234868338968e-06, + "loss": 0.155, + "step": 9826 + }, + { + "epoch": 0.7785304020598138, + "grad_norm": 1.564978379751095, + "learning_rate": 2.4635480151580902e-06, + "loss": 0.1738, + "step": 9827 + }, + { + "epoch": 0.7786096256684492, + "grad_norm": 1.5218207589513273, + "learning_rate": 2.461861658216311e-06, + "loss": 0.1435, + "step": 9828 + }, + { + "epoch": 0.7786888492770846, + "grad_norm": 1.6136781479505269, + "learning_rate": 2.4601757976246685e-06, + "loss": 0.1624, + "step": 9829 + }, + { + "epoch": 0.7787680728857199, + "grad_norm": 1.3249496932637106, + "learning_rate": 2.4584904334941728e-06, + "loss": 0.094, + "step": 9830 + }, + { + "epoch": 0.7788472964943554, + "grad_norm": 1.5017492963559536, + "learning_rate": 2.456805565935795e-06, + "loss": 0.1683, + "step": 9831 + }, + { + "epoch": 0.7789265201029907, + "grad_norm": 1.4531046128085603, + "learning_rate": 2.4551211950604713e-06, + "loss": 0.1905, + "step": 9832 + }, + { + "epoch": 0.7790057437116261, + "grad_norm": 1.706283272163017, + "learning_rate": 2.4534373209791162e-06, + "loss": 0.1624, + "step": 9833 + }, + { + "epoch": 0.7790849673202614, + "grad_norm": 1.6380514707700125, + "learning_rate": 2.451753943802603e-06, + "loss": 0.1902, + "step": 9834 + }, + { + "epoch": 0.7791641909288968, + "grad_norm": 1.6499999222540944, + "learning_rate": 2.4500710636417725e-06, + "loss": 0.1739, + "step": 9835 + }, + { + "epoch": 0.7792434145375322, + "grad_norm": 1.4879660445719807, + "learning_rate": 2.4483886806074308e-06, + "loss": 0.1181, + "step": 9836 + }, + { + "epoch": 0.7793226381461675, + "grad_norm": 1.7650918291893654, + "learning_rate": 2.4467067948103616e-06, + "loss": 0.2373, + "step": 9837 + }, + { + "epoch": 0.779401861754803, + "grad_norm": 1.9501317030648648, + "learning_rate": 2.4450254063613056e-06, + "loss": 0.1755, + "step": 9838 + }, + { + "epoch": 0.7794810853634383, + "grad_norm": 1.365525629253928, + "learning_rate": 2.4433445153709722e-06, + "loss": 0.0967, + "step": 9839 + }, + { + "epoch": 0.7795603089720736, + "grad_norm": 2.284696402821749, + "learning_rate": 2.441664121950045e-06, + "loss": 0.1942, + "step": 9840 + }, + { + "epoch": 0.7796395325807091, + "grad_norm": 1.5042012443899009, + "learning_rate": 2.439984226209167e-06, + "loss": 0.1322, + "step": 9841 + }, + { + "epoch": 0.7797187561893444, + "grad_norm": 1.3875481099429432, + "learning_rate": 2.438304828258947e-06, + "loss": 0.1581, + "step": 9842 + }, + { + "epoch": 0.7797979797979798, + "grad_norm": 1.9634923702246954, + "learning_rate": 2.4366259282099737e-06, + "loss": 0.1832, + "step": 9843 + }, + { + "epoch": 0.7798772034066151, + "grad_norm": 1.3744684943728875, + "learning_rate": 2.4349475261727905e-06, + "loss": 0.155, + "step": 9844 + }, + { + "epoch": 0.7799564270152506, + "grad_norm": 1.6895205019382182, + "learning_rate": 2.4332696222579078e-06, + "loss": 0.1691, + "step": 9845 + }, + { + "epoch": 0.7800356506238859, + "grad_norm": 1.5850544854776158, + "learning_rate": 2.4315922165758154e-06, + "loss": 0.1976, + "step": 9846 + }, + { + "epoch": 0.7801148742325212, + "grad_norm": 1.581470728898586, + "learning_rate": 2.4299153092369598e-06, + "loss": 0.1509, + "step": 9847 + }, + { + "epoch": 0.7801940978411567, + "grad_norm": 1.4437674560460607, + "learning_rate": 2.428238900351755e-06, + "loss": 0.139, + "step": 9848 + }, + { + "epoch": 0.780273321449792, + "grad_norm": 1.31502528899535, + "learning_rate": 2.426562990030582e-06, + "loss": 0.1298, + "step": 9849 + }, + { + "epoch": 0.7803525450584274, + "grad_norm": 1.3514536223662552, + "learning_rate": 2.424887578383799e-06, + "loss": 0.1861, + "step": 9850 + }, + { + "epoch": 0.7804317686670628, + "grad_norm": 1.7806678450907218, + "learning_rate": 2.4232126655217202e-06, + "loss": 0.1692, + "step": 9851 + }, + { + "epoch": 0.7805109922756982, + "grad_norm": 1.2194089916371416, + "learning_rate": 2.421538251554627e-06, + "loss": 0.1284, + "step": 9852 + }, + { + "epoch": 0.7805902158843335, + "grad_norm": 1.4233446865831838, + "learning_rate": 2.4198643365927767e-06, + "loss": 0.132, + "step": 9853 + }, + { + "epoch": 0.7806694394929689, + "grad_norm": 1.3894587162835084, + "learning_rate": 2.4181909207463873e-06, + "loss": 0.1219, + "step": 9854 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 1.4438565454138383, + "learning_rate": 2.4165180041256444e-06, + "loss": 0.1997, + "step": 9855 + }, + { + "epoch": 0.7808278867102396, + "grad_norm": 1.4825858616325516, + "learning_rate": 2.4148455868407015e-06, + "loss": 0.1403, + "step": 9856 + }, + { + "epoch": 0.780907110318875, + "grad_norm": 1.9852472755035369, + "learning_rate": 2.413173669001676e-06, + "loss": 0.221, + "step": 9857 + }, + { + "epoch": 0.7809863339275104, + "grad_norm": 1.4129847202664567, + "learning_rate": 2.4115022507186626e-06, + "loss": 0.1268, + "step": 9858 + }, + { + "epoch": 0.7810655575361458, + "grad_norm": 1.6444194614633125, + "learning_rate": 2.409831332101712e-06, + "loss": 0.1286, + "step": 9859 + }, + { + "epoch": 0.7811447811447811, + "grad_norm": 1.6743803655198743, + "learning_rate": 2.4081609132608464e-06, + "loss": 0.1662, + "step": 9860 + }, + { + "epoch": 0.7812240047534165, + "grad_norm": 1.5071777355332132, + "learning_rate": 2.406490994306052e-06, + "loss": 0.1959, + "step": 9861 + }, + { + "epoch": 0.7813032283620519, + "grad_norm": 1.3217166468552706, + "learning_rate": 2.4048215753472914e-06, + "loss": 0.136, + "step": 9862 + }, + { + "epoch": 0.7813824519706872, + "grad_norm": 1.8603288390768549, + "learning_rate": 2.403152656494485e-06, + "loss": 0.1748, + "step": 9863 + }, + { + "epoch": 0.7814616755793227, + "grad_norm": 1.5524933169021706, + "learning_rate": 2.401484237857519e-06, + "loss": 0.1727, + "step": 9864 + }, + { + "epoch": 0.781540899187958, + "grad_norm": 1.4513100745484329, + "learning_rate": 2.3998163195462565e-06, + "loss": 0.1192, + "step": 9865 + }, + { + "epoch": 0.7816201227965934, + "grad_norm": 1.3655271254718757, + "learning_rate": 2.398148901670521e-06, + "loss": 0.1928, + "step": 9866 + }, + { + "epoch": 0.7816993464052288, + "grad_norm": 1.1358946494058615, + "learning_rate": 2.396481984340098e-06, + "loss": 0.0843, + "step": 9867 + }, + { + "epoch": 0.7817785700138641, + "grad_norm": 1.493678617900152, + "learning_rate": 2.3948155676647546e-06, + "loss": 0.0905, + "step": 9868 + }, + { + "epoch": 0.7818577936224995, + "grad_norm": 2.2380240605705377, + "learning_rate": 2.393149651754212e-06, + "loss": 0.1764, + "step": 9869 + }, + { + "epoch": 0.7819370172311348, + "grad_norm": 1.615659718099454, + "learning_rate": 2.391484236718159e-06, + "loss": 0.1237, + "step": 9870 + }, + { + "epoch": 0.7820162408397703, + "grad_norm": 1.4542170550840967, + "learning_rate": 2.389819322666264e-06, + "loss": 0.1386, + "step": 9871 + }, + { + "epoch": 0.7820954644484056, + "grad_norm": 1.6898234553595362, + "learning_rate": 2.3881549097081467e-06, + "loss": 0.155, + "step": 9872 + }, + { + "epoch": 0.782174688057041, + "grad_norm": 2.139501797222874, + "learning_rate": 2.3864909979534044e-06, + "loss": 0.1915, + "step": 9873 + }, + { + "epoch": 0.7822539116656764, + "grad_norm": 1.4481664144895345, + "learning_rate": 2.3848275875115925e-06, + "loss": 0.1551, + "step": 9874 + }, + { + "epoch": 0.7823331352743117, + "grad_norm": 1.9653217027377339, + "learning_rate": 2.3831646784922446e-06, + "loss": 0.1831, + "step": 9875 + }, + { + "epoch": 0.7824123588829471, + "grad_norm": 1.8148982093916635, + "learning_rate": 2.381502271004853e-06, + "loss": 0.2308, + "step": 9876 + }, + { + "epoch": 0.7824915824915825, + "grad_norm": 1.4528137559606764, + "learning_rate": 2.3798403651588765e-06, + "loss": 0.187, + "step": 9877 + }, + { + "epoch": 0.7825708061002179, + "grad_norm": 1.5887962836977498, + "learning_rate": 2.3781789610637483e-06, + "loss": 0.2413, + "step": 9878 + }, + { + "epoch": 0.7826500297088532, + "grad_norm": 1.5512615286566966, + "learning_rate": 2.376518058828863e-06, + "loss": 0.1784, + "step": 9879 + }, + { + "epoch": 0.7827292533174887, + "grad_norm": 1.1740892807392678, + "learning_rate": 2.3748576585635774e-06, + "loss": 0.1327, + "step": 9880 + }, + { + "epoch": 0.782808476926124, + "grad_norm": 1.2857031530328165, + "learning_rate": 2.373197760377228e-06, + "loss": 0.1446, + "step": 9881 + }, + { + "epoch": 0.7828877005347593, + "grad_norm": 1.5223889311563306, + "learning_rate": 2.371538364379109e-06, + "loss": 0.1616, + "step": 9882 + }, + { + "epoch": 0.7829669241433947, + "grad_norm": 1.5839227481392508, + "learning_rate": 2.36987947067848e-06, + "loss": 0.2003, + "step": 9883 + }, + { + "epoch": 0.7830461477520301, + "grad_norm": 1.4081995392008955, + "learning_rate": 2.368221079384577e-06, + "loss": 0.1549, + "step": 9884 + }, + { + "epoch": 0.7831253713606655, + "grad_norm": 1.6971172908327063, + "learning_rate": 2.3665631906065933e-06, + "loss": 0.18, + "step": 9885 + }, + { + "epoch": 0.7832045949693008, + "grad_norm": 1.1954142628303228, + "learning_rate": 2.364905804453692e-06, + "loss": 0.1489, + "step": 9886 + }, + { + "epoch": 0.7832838185779363, + "grad_norm": 1.6572554709624563, + "learning_rate": 2.3632489210350074e-06, + "loss": 0.1338, + "step": 9887 + }, + { + "epoch": 0.7833630421865716, + "grad_norm": 1.1781206400864117, + "learning_rate": 2.361592540459636e-06, + "loss": 0.0844, + "step": 9888 + }, + { + "epoch": 0.7834422657952069, + "grad_norm": 1.5340634649743108, + "learning_rate": 2.3599366628366427e-06, + "loss": 0.1771, + "step": 9889 + }, + { + "epoch": 0.7835214894038424, + "grad_norm": 1.6778345987190693, + "learning_rate": 2.358281288275055e-06, + "loss": 0.2209, + "step": 9890 + }, + { + "epoch": 0.7836007130124777, + "grad_norm": 1.3264485644963284, + "learning_rate": 2.356626416883878e-06, + "loss": 0.1545, + "step": 9891 + }, + { + "epoch": 0.7836799366211131, + "grad_norm": 1.9691156240211038, + "learning_rate": 2.354972048772074e-06, + "loss": 0.2074, + "step": 9892 + }, + { + "epoch": 0.7837591602297485, + "grad_norm": 1.2191977698700902, + "learning_rate": 2.353318184048573e-06, + "loss": 0.0844, + "step": 9893 + }, + { + "epoch": 0.7838383838383839, + "grad_norm": 1.137265551923628, + "learning_rate": 2.351664822822277e-06, + "loss": 0.08, + "step": 9894 + }, + { + "epoch": 0.7839176074470192, + "grad_norm": 1.421115980040482, + "learning_rate": 2.3500119652020526e-06, + "loss": 0.1554, + "step": 9895 + }, + { + "epoch": 0.7839968310556545, + "grad_norm": 1.5200703481219549, + "learning_rate": 2.348359611296728e-06, + "loss": 0.1846, + "step": 9896 + }, + { + "epoch": 0.78407605466429, + "grad_norm": 1.6095400050334874, + "learning_rate": 2.346707761215108e-06, + "loss": 0.1502, + "step": 9897 + }, + { + "epoch": 0.7841552782729253, + "grad_norm": 1.4076656297089143, + "learning_rate": 2.345056415065956e-06, + "loss": 0.1503, + "step": 9898 + }, + { + "epoch": 0.7842345018815607, + "grad_norm": 1.541479098845182, + "learning_rate": 2.343405572958004e-06, + "loss": 0.172, + "step": 9899 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 1.5297463587582756, + "learning_rate": 2.341755234999956e-06, + "loss": 0.245, + "step": 9900 + }, + { + "epoch": 0.7843929490988315, + "grad_norm": 1.5833443207634692, + "learning_rate": 2.3401054013004776e-06, + "loss": 0.1508, + "step": 9901 + }, + { + "epoch": 0.7844721727074668, + "grad_norm": 1.5892295059920318, + "learning_rate": 2.338456071968198e-06, + "loss": 0.1578, + "step": 9902 + }, + { + "epoch": 0.7845513963161022, + "grad_norm": 1.4429492724249036, + "learning_rate": 2.336807247111723e-06, + "loss": 0.1612, + "step": 9903 + }, + { + "epoch": 0.7846306199247376, + "grad_norm": 1.3529003984642856, + "learning_rate": 2.3351589268396193e-06, + "loss": 0.1335, + "step": 9904 + }, + { + "epoch": 0.7847098435333729, + "grad_norm": 1.7004838622497822, + "learning_rate": 2.3335111112604194e-06, + "loss": 0.1933, + "step": 9905 + }, + { + "epoch": 0.7847890671420084, + "grad_norm": 1.5184346920036407, + "learning_rate": 2.33186380048262e-06, + "loss": 0.2354, + "step": 9906 + }, + { + "epoch": 0.7848682907506437, + "grad_norm": 1.7302373847009187, + "learning_rate": 2.330216994614696e-06, + "loss": 0.2152, + "step": 9907 + }, + { + "epoch": 0.7849475143592791, + "grad_norm": 1.7780155831602766, + "learning_rate": 2.3285706937650786e-06, + "loss": 0.1689, + "step": 9908 + }, + { + "epoch": 0.7850267379679144, + "grad_norm": 1.6726453576435996, + "learning_rate": 2.3269248980421653e-06, + "loss": 0.17, + "step": 9909 + }, + { + "epoch": 0.7851059615765498, + "grad_norm": 1.2388215252220365, + "learning_rate": 2.3252796075543295e-06, + "loss": 0.1081, + "step": 9910 + }, + { + "epoch": 0.7851851851851852, + "grad_norm": 1.0664201285624373, + "learning_rate": 2.3236348224099038e-06, + "loss": 0.1176, + "step": 9911 + }, + { + "epoch": 0.7852644087938205, + "grad_norm": 2.0678534576917524, + "learning_rate": 2.3219905427171864e-06, + "loss": 0.2299, + "step": 9912 + }, + { + "epoch": 0.785343632402456, + "grad_norm": 1.64307162014019, + "learning_rate": 2.320346768584449e-06, + "loss": 0.1576, + "step": 9913 + }, + { + "epoch": 0.7854228560110913, + "grad_norm": 1.0788347048400195, + "learning_rate": 2.3187035001199254e-06, + "loss": 0.0896, + "step": 9914 + }, + { + "epoch": 0.7855020796197266, + "grad_norm": 1.2997438253846387, + "learning_rate": 2.317060737431813e-06, + "loss": 0.1666, + "step": 9915 + }, + { + "epoch": 0.7855813032283621, + "grad_norm": 1.3365023128119964, + "learning_rate": 2.3154184806282863e-06, + "loss": 0.1059, + "step": 9916 + }, + { + "epoch": 0.7856605268369974, + "grad_norm": 1.5025039908520286, + "learning_rate": 2.3137767298174774e-06, + "loss": 0.124, + "step": 9917 + }, + { + "epoch": 0.7857397504456328, + "grad_norm": 1.6868164976194144, + "learning_rate": 2.312135485107486e-06, + "loss": 0.1362, + "step": 9918 + }, + { + "epoch": 0.7858189740542681, + "grad_norm": 1.2729634173030109, + "learning_rate": 2.3104947466063785e-06, + "loss": 0.1447, + "step": 9919 + }, + { + "epoch": 0.7858981976629036, + "grad_norm": 0.9913998222470702, + "learning_rate": 2.3088545144221964e-06, + "loss": 0.067, + "step": 9920 + }, + { + "epoch": 0.7859774212715389, + "grad_norm": 1.8229278522045034, + "learning_rate": 2.307214788662936e-06, + "loss": 0.2254, + "step": 9921 + }, + { + "epoch": 0.7860566448801742, + "grad_norm": 1.5690718750912345, + "learning_rate": 2.3055755694365644e-06, + "loss": 0.1407, + "step": 9922 + }, + { + "epoch": 0.7861358684888097, + "grad_norm": 1.5155520335935997, + "learning_rate": 2.303936856851021e-06, + "loss": 0.0957, + "step": 9923 + }, + { + "epoch": 0.786215092097445, + "grad_norm": 1.2659496091344593, + "learning_rate": 2.302298651014204e-06, + "loss": 0.0895, + "step": 9924 + }, + { + "epoch": 0.7862943157060804, + "grad_norm": 1.4116650662247483, + "learning_rate": 2.3006609520339796e-06, + "loss": 0.1089, + "step": 9925 + }, + { + "epoch": 0.7863735393147158, + "grad_norm": 1.946327090543923, + "learning_rate": 2.2990237600181864e-06, + "loss": 0.2623, + "step": 9926 + }, + { + "epoch": 0.7864527629233512, + "grad_norm": 1.4824259779344384, + "learning_rate": 2.2973870750746253e-06, + "loss": 0.1439, + "step": 9927 + }, + { + "epoch": 0.7865319865319865, + "grad_norm": 2.0297942406853244, + "learning_rate": 2.2957508973110586e-06, + "loss": 0.1525, + "step": 9928 + }, + { + "epoch": 0.7866112101406219, + "grad_norm": 1.7268831389340178, + "learning_rate": 2.2941152268352284e-06, + "loss": 0.1578, + "step": 9929 + }, + { + "epoch": 0.7866904337492573, + "grad_norm": 1.7233516262432869, + "learning_rate": 2.292480063754833e-06, + "loss": 0.1908, + "step": 9930 + }, + { + "epoch": 0.7867696573578926, + "grad_norm": 1.6122513701460157, + "learning_rate": 2.2908454081775344e-06, + "loss": 0.1761, + "step": 9931 + }, + { + "epoch": 0.786848880966528, + "grad_norm": 1.2913194872695093, + "learning_rate": 2.2892112602109783e-06, + "loss": 0.0796, + "step": 9932 + }, + { + "epoch": 0.7869281045751634, + "grad_norm": 1.191435906005958, + "learning_rate": 2.2875776199627564e-06, + "loss": 0.1467, + "step": 9933 + }, + { + "epoch": 0.7870073281837988, + "grad_norm": 1.662942997145696, + "learning_rate": 2.2859444875404347e-06, + "loss": 0.1556, + "step": 9934 + }, + { + "epoch": 0.7870865517924341, + "grad_norm": 1.9013726168634986, + "learning_rate": 2.2843118630515536e-06, + "loss": 0.2085, + "step": 9935 + }, + { + "epoch": 0.7871657754010695, + "grad_norm": 1.4286296586426461, + "learning_rate": 2.282679746603611e-06, + "loss": 0.0949, + "step": 9936 + }, + { + "epoch": 0.7872449990097049, + "grad_norm": 1.6938781461089853, + "learning_rate": 2.281048138304072e-06, + "loss": 0.204, + "step": 9937 + }, + { + "epoch": 0.7873242226183402, + "grad_norm": 1.900504851065913, + "learning_rate": 2.279417038260373e-06, + "loss": 0.2873, + "step": 9938 + }, + { + "epoch": 0.7874034462269757, + "grad_norm": 1.404110202195443, + "learning_rate": 2.2777864465799137e-06, + "loss": 0.163, + "step": 9939 + }, + { + "epoch": 0.787482669835611, + "grad_norm": 1.5573164992739201, + "learning_rate": 2.276156363370058e-06, + "loss": 0.1483, + "step": 9940 + }, + { + "epoch": 0.7875618934442464, + "grad_norm": 1.3045762965554017, + "learning_rate": 2.274526788738143e-06, + "loss": 0.147, + "step": 9941 + }, + { + "epoch": 0.7876411170528818, + "grad_norm": 1.7295399244403333, + "learning_rate": 2.272897722791466e-06, + "loss": 0.2139, + "step": 9942 + }, + { + "epoch": 0.7877203406615171, + "grad_norm": 1.6195616020356043, + "learning_rate": 2.271269165637294e-06, + "loss": 0.1934, + "step": 9943 + }, + { + "epoch": 0.7877995642701525, + "grad_norm": 1.5485879008510388, + "learning_rate": 2.2696411173828557e-06, + "loss": 0.1413, + "step": 9944 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 1.9380151936770873, + "learning_rate": 2.268013578135357e-06, + "loss": 0.2173, + "step": 9945 + }, + { + "epoch": 0.7879580114874233, + "grad_norm": 1.4341914940444063, + "learning_rate": 2.266386548001961e-06, + "loss": 0.1822, + "step": 9946 + }, + { + "epoch": 0.7880372350960586, + "grad_norm": 1.5959739300320508, + "learning_rate": 2.264760027089795e-06, + "loss": 0.2274, + "step": 9947 + }, + { + "epoch": 0.788116458704694, + "grad_norm": 1.1242432050579019, + "learning_rate": 2.2631340155059656e-06, + "loss": 0.1102, + "step": 9948 + }, + { + "epoch": 0.7881956823133294, + "grad_norm": 1.9244895012940224, + "learning_rate": 2.261508513357532e-06, + "loss": 0.1813, + "step": 9949 + }, + { + "epoch": 0.7882749059219647, + "grad_norm": 1.4237682201949433, + "learning_rate": 2.2598835207515267e-06, + "loss": 0.1016, + "step": 9950 + }, + { + "epoch": 0.7883541295306001, + "grad_norm": 1.7744325348652044, + "learning_rate": 2.2582590377949497e-06, + "loss": 0.216, + "step": 9951 + }, + { + "epoch": 0.7884333531392355, + "grad_norm": 1.5749910436672572, + "learning_rate": 2.2566350645947656e-06, + "loss": 0.1746, + "step": 9952 + }, + { + "epoch": 0.7885125767478709, + "grad_norm": 1.355511694526603, + "learning_rate": 2.2550116012579004e-06, + "loss": 0.0897, + "step": 9953 + }, + { + "epoch": 0.7885918003565062, + "grad_norm": 1.6295008299820368, + "learning_rate": 2.253388647891258e-06, + "loss": 0.1643, + "step": 9954 + }, + { + "epoch": 0.7886710239651417, + "grad_norm": 1.3467391373535633, + "learning_rate": 2.2517662046016975e-06, + "loss": 0.1004, + "step": 9955 + }, + { + "epoch": 0.788750247573777, + "grad_norm": 4.682270455395327, + "learning_rate": 2.250144271496049e-06, + "loss": 0.1969, + "step": 9956 + }, + { + "epoch": 0.7888294711824123, + "grad_norm": 1.5699310942789322, + "learning_rate": 2.2485228486811128e-06, + "loss": 0.1608, + "step": 9957 + }, + { + "epoch": 0.7889086947910477, + "grad_norm": 1.486879949519601, + "learning_rate": 2.2469019362636478e-06, + "loss": 0.1361, + "step": 9958 + }, + { + "epoch": 0.7889879183996831, + "grad_norm": 1.595931092624869, + "learning_rate": 2.2452815343503862e-06, + "loss": 0.1698, + "step": 9959 + }, + { + "epoch": 0.7890671420083185, + "grad_norm": 1.784241410979693, + "learning_rate": 2.2436616430480197e-06, + "loss": 0.2391, + "step": 9960 + }, + { + "epoch": 0.7891463656169538, + "grad_norm": 1.463888108877651, + "learning_rate": 2.2420422624632153e-06, + "loss": 0.1881, + "step": 9961 + }, + { + "epoch": 0.7892255892255893, + "grad_norm": 1.3974477761097146, + "learning_rate": 2.2404233927025985e-06, + "loss": 0.1318, + "step": 9962 + }, + { + "epoch": 0.7893048128342246, + "grad_norm": 1.5570581920763162, + "learning_rate": 2.238805033872762e-06, + "loss": 0.1454, + "step": 9963 + }, + { + "epoch": 0.7893840364428599, + "grad_norm": 1.642197661575361, + "learning_rate": 2.237187186080273e-06, + "loss": 0.137, + "step": 9964 + }, + { + "epoch": 0.7894632600514954, + "grad_norm": 1.6150614542953492, + "learning_rate": 2.235569849431655e-06, + "loss": 0.1696, + "step": 9965 + }, + { + "epoch": 0.7895424836601307, + "grad_norm": 1.2972054456968034, + "learning_rate": 2.2339530240333993e-06, + "loss": 0.158, + "step": 9966 + }, + { + "epoch": 0.7896217072687661, + "grad_norm": 1.2969935030274593, + "learning_rate": 2.2323367099919724e-06, + "loss": 0.1674, + "step": 9967 + }, + { + "epoch": 0.7897009308774015, + "grad_norm": 1.519984704530541, + "learning_rate": 2.230720907413797e-06, + "loss": 0.1644, + "step": 9968 + }, + { + "epoch": 0.7897801544860369, + "grad_norm": 1.327596584034342, + "learning_rate": 2.2291056164052638e-06, + "loss": 0.1164, + "step": 9969 + }, + { + "epoch": 0.7898593780946722, + "grad_norm": 1.1748700946165422, + "learning_rate": 2.2274908370727376e-06, + "loss": 0.1397, + "step": 9970 + }, + { + "epoch": 0.7899386017033075, + "grad_norm": 1.7387930116965715, + "learning_rate": 2.2258765695225416e-06, + "loss": 0.1185, + "step": 9971 + }, + { + "epoch": 0.790017825311943, + "grad_norm": 1.2234761605340527, + "learning_rate": 2.224262813860962e-06, + "loss": 0.1084, + "step": 9972 + }, + { + "epoch": 0.7900970489205783, + "grad_norm": 1.9514398896101708, + "learning_rate": 2.2226495701942663e-06, + "loss": 0.1962, + "step": 9973 + }, + { + "epoch": 0.7901762725292137, + "grad_norm": 1.490312835581842, + "learning_rate": 2.2210368386286742e-06, + "loss": 0.2002, + "step": 9974 + }, + { + "epoch": 0.7902554961378491, + "grad_norm": 1.4741412854177365, + "learning_rate": 2.219424619270375e-06, + "loss": 0.1099, + "step": 9975 + }, + { + "epoch": 0.7903347197464845, + "grad_norm": 1.551959043376881, + "learning_rate": 2.2178129122255255e-06, + "loss": 0.139, + "step": 9976 + }, + { + "epoch": 0.7904139433551198, + "grad_norm": 1.1980804146224264, + "learning_rate": 2.2162017176002514e-06, + "loss": 0.1337, + "step": 9977 + }, + { + "epoch": 0.7904931669637552, + "grad_norm": 1.4266653732552625, + "learning_rate": 2.2145910355006415e-06, + "loss": 0.1677, + "step": 9978 + }, + { + "epoch": 0.7905723905723906, + "grad_norm": 2.3174832590171084, + "learning_rate": 2.212980866032749e-06, + "loss": 0.1632, + "step": 9979 + }, + { + "epoch": 0.7906516141810259, + "grad_norm": 1.754321042318844, + "learning_rate": 2.2113712093025997e-06, + "loss": 0.2074, + "step": 9980 + }, + { + "epoch": 0.7907308377896614, + "grad_norm": 2.066931491831207, + "learning_rate": 2.20976206541618e-06, + "loss": 0.2058, + "step": 9981 + }, + { + "epoch": 0.7908100613982967, + "grad_norm": 1.4004544365633376, + "learning_rate": 2.208153434479442e-06, + "loss": 0.1318, + "step": 9982 + }, + { + "epoch": 0.7908892850069321, + "grad_norm": 1.3430351447290874, + "learning_rate": 2.20654531659831e-06, + "loss": 0.1399, + "step": 9983 + }, + { + "epoch": 0.7909685086155674, + "grad_norm": 1.7441387425468462, + "learning_rate": 2.2049377118786696e-06, + "loss": 0.1891, + "step": 9984 + }, + { + "epoch": 0.7910477322242028, + "grad_norm": 1.3147742498340642, + "learning_rate": 2.2033306204263704e-06, + "loss": 0.098, + "step": 9985 + }, + { + "epoch": 0.7911269558328382, + "grad_norm": 1.4776189216839015, + "learning_rate": 2.2017240423472384e-06, + "loss": 0.125, + "step": 9986 + }, + { + "epoch": 0.7912061794414735, + "grad_norm": 1.5624806083130125, + "learning_rate": 2.200117977747055e-06, + "loss": 0.1743, + "step": 9987 + }, + { + "epoch": 0.791285403050109, + "grad_norm": 1.6305221424473388, + "learning_rate": 2.198512426731568e-06, + "loss": 0.147, + "step": 9988 + }, + { + "epoch": 0.7913646266587443, + "grad_norm": 1.646593475485873, + "learning_rate": 2.196907389406504e-06, + "loss": 0.1817, + "step": 9989 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 2.0803310071104955, + "learning_rate": 2.195302865877541e-06, + "loss": 0.2863, + "step": 9990 + }, + { + "epoch": 0.7915230738760151, + "grad_norm": 1.7736444271091318, + "learning_rate": 2.193698856250331e-06, + "loss": 0.1744, + "step": 9991 + }, + { + "epoch": 0.7916022974846504, + "grad_norm": 1.621184216725666, + "learning_rate": 2.1920953606304875e-06, + "loss": 0.2197, + "step": 9992 + }, + { + "epoch": 0.7916815210932858, + "grad_norm": 1.6404741812759889, + "learning_rate": 2.1904923791235965e-06, + "loss": 0.1703, + "step": 9993 + }, + { + "epoch": 0.7917607447019211, + "grad_norm": 1.4897659344779688, + "learning_rate": 2.188889911835207e-06, + "loss": 0.1374, + "step": 9994 + }, + { + "epoch": 0.7918399683105566, + "grad_norm": 1.8663482662202842, + "learning_rate": 2.1872879588708286e-06, + "loss": 0.205, + "step": 9995 + }, + { + "epoch": 0.7919191919191919, + "grad_norm": 1.6094847483146242, + "learning_rate": 2.185686520335948e-06, + "loss": 0.1729, + "step": 9996 + }, + { + "epoch": 0.7919984155278272, + "grad_norm": 1.579945800895183, + "learning_rate": 2.184085596336011e-06, + "loss": 0.1748, + "step": 9997 + }, + { + "epoch": 0.7920776391364627, + "grad_norm": 1.2182952293360618, + "learning_rate": 2.1824851869764262e-06, + "loss": 0.1085, + "step": 9998 + }, + { + "epoch": 0.792156862745098, + "grad_norm": 1.9813333334345649, + "learning_rate": 2.1808852923625802e-06, + "loss": 0.2213, + "step": 9999 + }, + { + "epoch": 0.7922360863537334, + "grad_norm": 2.3798008584447876, + "learning_rate": 2.1792859125998134e-06, + "loss": 0.2385, + "step": 10000 + }, + { + "epoch": 0.7923153099623688, + "grad_norm": 1.2946411560688356, + "learning_rate": 2.1776870477934353e-06, + "loss": 0.1164, + "step": 10001 + }, + { + "epoch": 0.7923945335710042, + "grad_norm": 1.5012788819907488, + "learning_rate": 2.1760886980487307e-06, + "loss": 0.1385, + "step": 10002 + }, + { + "epoch": 0.7924737571796395, + "grad_norm": 1.8064772182168056, + "learning_rate": 2.174490863470938e-06, + "loss": 0.2323, + "step": 10003 + }, + { + "epoch": 0.7925529807882749, + "grad_norm": 1.3205231349479927, + "learning_rate": 2.1728935441652687e-06, + "loss": 0.0755, + "step": 10004 + }, + { + "epoch": 0.7926322043969103, + "grad_norm": 1.8299458070746077, + "learning_rate": 2.1712967402368947e-06, + "loss": 0.095, + "step": 10005 + }, + { + "epoch": 0.7927114280055456, + "grad_norm": 2.0463888396517165, + "learning_rate": 2.169700451790964e-06, + "loss": 0.1594, + "step": 10006 + }, + { + "epoch": 0.792790651614181, + "grad_norm": 1.6230727902814366, + "learning_rate": 2.168104678932581e-06, + "loss": 0.1748, + "step": 10007 + }, + { + "epoch": 0.7928698752228164, + "grad_norm": 1.902277582263067, + "learning_rate": 2.166509421766818e-06, + "loss": 0.155, + "step": 10008 + }, + { + "epoch": 0.7929490988314518, + "grad_norm": 1.3007070544993693, + "learning_rate": 2.1649146803987197e-06, + "loss": 0.1294, + "step": 10009 + }, + { + "epoch": 0.7930283224400871, + "grad_norm": 1.6989870907121092, + "learning_rate": 2.1633204549332897e-06, + "loss": 0.1547, + "step": 10010 + }, + { + "epoch": 0.7931075460487225, + "grad_norm": 1.2099287702109043, + "learning_rate": 2.1617267454754996e-06, + "loss": 0.0925, + "step": 10011 + }, + { + "epoch": 0.7931867696573579, + "grad_norm": 1.4496158928678884, + "learning_rate": 2.160133552130289e-06, + "loss": 0.2026, + "step": 10012 + }, + { + "epoch": 0.7932659932659932, + "grad_norm": 1.74231832692503, + "learning_rate": 2.1585408750025584e-06, + "loss": 0.1371, + "step": 10013 + }, + { + "epoch": 0.7933452168746287, + "grad_norm": 1.522157793095154, + "learning_rate": 2.1569487141971824e-06, + "loss": 0.1764, + "step": 10014 + }, + { + "epoch": 0.793424440483264, + "grad_norm": 1.629797164066686, + "learning_rate": 2.155357069818995e-06, + "loss": 0.232, + "step": 10015 + }, + { + "epoch": 0.7935036640918994, + "grad_norm": 1.3883747090570508, + "learning_rate": 2.1537659419727987e-06, + "loss": 0.2118, + "step": 10016 + }, + { + "epoch": 0.7935828877005348, + "grad_norm": 1.8661431270692048, + "learning_rate": 2.152175330763359e-06, + "loss": 0.108, + "step": 10017 + }, + { + "epoch": 0.7936621113091701, + "grad_norm": 1.3944278233127205, + "learning_rate": 2.150585236295415e-06, + "loss": 0.1349, + "step": 10018 + }, + { + "epoch": 0.7937413349178055, + "grad_norm": 1.392398724398443, + "learning_rate": 2.148995658673665e-06, + "loss": 0.1081, + "step": 10019 + }, + { + "epoch": 0.7938205585264408, + "grad_norm": 1.6525013010408893, + "learning_rate": 2.14740659800277e-06, + "loss": 0.1809, + "step": 10020 + }, + { + "epoch": 0.7938997821350763, + "grad_norm": 1.600757574755079, + "learning_rate": 2.1458180543873697e-06, + "loss": 0.1235, + "step": 10021 + }, + { + "epoch": 0.7939790057437116, + "grad_norm": 1.6959189801343713, + "learning_rate": 2.1442300279320593e-06, + "loss": 0.1578, + "step": 10022 + }, + { + "epoch": 0.794058229352347, + "grad_norm": 1.775667930781625, + "learning_rate": 2.142642518741399e-06, + "loss": 0.2742, + "step": 10023 + }, + { + "epoch": 0.7941374529609824, + "grad_norm": 1.5579501042143638, + "learning_rate": 2.141055526919924e-06, + "loss": 0.114, + "step": 10024 + }, + { + "epoch": 0.7942166765696177, + "grad_norm": 1.270325046075699, + "learning_rate": 2.1394690525721275e-06, + "loss": 0.1352, + "step": 10025 + }, + { + "epoch": 0.7942959001782531, + "grad_norm": 1.6561846729585206, + "learning_rate": 2.137883095802469e-06, + "loss": 0.2266, + "step": 10026 + }, + { + "epoch": 0.7943751237868885, + "grad_norm": 1.6738282412633614, + "learning_rate": 2.1362976567153813e-06, + "loss": 0.1764, + "step": 10027 + }, + { + "epoch": 0.7944543473955239, + "grad_norm": 1.6976550261578531, + "learning_rate": 2.134712735415255e-06, + "loss": 0.2019, + "step": 10028 + }, + { + "epoch": 0.7945335710041592, + "grad_norm": 1.4696840344431166, + "learning_rate": 2.13312833200645e-06, + "loss": 0.1435, + "step": 10029 + }, + { + "epoch": 0.7946127946127947, + "grad_norm": 1.4003875061720121, + "learning_rate": 2.131544446593289e-06, + "loss": 0.1247, + "step": 10030 + }, + { + "epoch": 0.79469201822143, + "grad_norm": 1.3843549351146476, + "learning_rate": 2.1299610792800675e-06, + "loss": 0.1037, + "step": 10031 + }, + { + "epoch": 0.7947712418300653, + "grad_norm": 1.2525833791601648, + "learning_rate": 2.1283782301710408e-06, + "loss": 0.1203, + "step": 10032 + }, + { + "epoch": 0.7948504654387007, + "grad_norm": 1.4484258631400002, + "learning_rate": 2.1267958993704297e-06, + "loss": 0.1765, + "step": 10033 + }, + { + "epoch": 0.7949296890473361, + "grad_norm": 1.4152771342621562, + "learning_rate": 2.1252140869824266e-06, + "loss": 0.1485, + "step": 10034 + }, + { + "epoch": 0.7950089126559715, + "grad_norm": 1.8643331778965033, + "learning_rate": 2.1236327931111868e-06, + "loss": 0.1861, + "step": 10035 + }, + { + "epoch": 0.7950881362646068, + "grad_norm": 1.498891506088113, + "learning_rate": 2.122052017860825e-06, + "loss": 0.1426, + "step": 10036 + }, + { + "epoch": 0.7951673598732423, + "grad_norm": 1.3808495616863234, + "learning_rate": 2.120471761335434e-06, + "loss": 0.129, + "step": 10037 + }, + { + "epoch": 0.7952465834818776, + "grad_norm": 1.5324500141901025, + "learning_rate": 2.118892023639064e-06, + "loss": 0.0995, + "step": 10038 + }, + { + "epoch": 0.7953258070905129, + "grad_norm": 1.6761058699444447, + "learning_rate": 2.1173128048757307e-06, + "loss": 0.1826, + "step": 10039 + }, + { + "epoch": 0.7954050306991484, + "grad_norm": 1.585363782878541, + "learning_rate": 2.115734105149422e-06, + "loss": 0.2085, + "step": 10040 + }, + { + "epoch": 0.7954842543077837, + "grad_norm": 1.5461038712788653, + "learning_rate": 2.1141559245640865e-06, + "loss": 0.1, + "step": 10041 + }, + { + "epoch": 0.7955634779164191, + "grad_norm": 1.6139730034851818, + "learning_rate": 2.1125782632236357e-06, + "loss": 0.1482, + "step": 10042 + }, + { + "epoch": 0.7956427015250545, + "grad_norm": 1.672022973226804, + "learning_rate": 2.111001121231957e-06, + "loss": 0.1368, + "step": 10043 + }, + { + "epoch": 0.7957219251336899, + "grad_norm": 1.939269575701926, + "learning_rate": 2.1094244986928956e-06, + "loss": 0.199, + "step": 10044 + }, + { + "epoch": 0.7958011487423252, + "grad_norm": 1.5465284925046832, + "learning_rate": 2.1078483957102637e-06, + "loss": 0.157, + "step": 10045 + }, + { + "epoch": 0.7958803723509605, + "grad_norm": 1.2372117591846676, + "learning_rate": 2.1062728123878383e-06, + "loss": 0.1193, + "step": 10046 + }, + { + "epoch": 0.795959595959596, + "grad_norm": 1.0899371777790527, + "learning_rate": 2.1046977488293675e-06, + "loss": 0.1078, + "step": 10047 + }, + { + "epoch": 0.7960388195682313, + "grad_norm": 1.8196955824363443, + "learning_rate": 2.1031232051385606e-06, + "loss": 0.1874, + "step": 10048 + }, + { + "epoch": 0.7961180431768667, + "grad_norm": 1.6934029344638837, + "learning_rate": 2.1015491814190913e-06, + "loss": 0.1905, + "step": 10049 + }, + { + "epoch": 0.7961972667855021, + "grad_norm": 1.2761708467391744, + "learning_rate": 2.099975677774606e-06, + "loss": 0.1281, + "step": 10050 + }, + { + "epoch": 0.7962764903941375, + "grad_norm": 1.3001720610321532, + "learning_rate": 2.0984026943087087e-06, + "loss": 0.1459, + "step": 10051 + }, + { + "epoch": 0.7963557140027728, + "grad_norm": 1.7436757517148482, + "learning_rate": 2.096830231124972e-06, + "loss": 0.2174, + "step": 10052 + }, + { + "epoch": 0.7964349376114082, + "grad_norm": 2.0254871002333967, + "learning_rate": 2.0952582883269403e-06, + "loss": 0.278, + "step": 10053 + }, + { + "epoch": 0.7965141612200436, + "grad_norm": 2.0624690806201174, + "learning_rate": 2.093686866018114e-06, + "loss": 0.1953, + "step": 10054 + }, + { + "epoch": 0.7965933848286789, + "grad_norm": 1.1574003552965064, + "learning_rate": 2.0921159643019627e-06, + "loss": 0.1018, + "step": 10055 + }, + { + "epoch": 0.7966726084373144, + "grad_norm": 1.2272561335047898, + "learning_rate": 2.0905455832819277e-06, + "loss": 0.1385, + "step": 10056 + }, + { + "epoch": 0.7967518320459497, + "grad_norm": 1.3680676860652303, + "learning_rate": 2.088975723061408e-06, + "loss": 0.1641, + "step": 10057 + }, + { + "epoch": 0.7968310556545851, + "grad_norm": 1.2946262379219424, + "learning_rate": 2.0874063837437687e-06, + "loss": 0.1796, + "step": 10058 + }, + { + "epoch": 0.7969102792632204, + "grad_norm": 1.6170030796631873, + "learning_rate": 2.085837565432349e-06, + "loss": 0.1764, + "step": 10059 + }, + { + "epoch": 0.7969895028718558, + "grad_norm": 1.589702139256348, + "learning_rate": 2.0842692682304442e-06, + "loss": 0.1721, + "step": 10060 + }, + { + "epoch": 0.7970687264804912, + "grad_norm": 1.4861482116536642, + "learning_rate": 2.0827014922413213e-06, + "loss": 0.1929, + "step": 10061 + }, + { + "epoch": 0.7971479500891265, + "grad_norm": 2.116904576828805, + "learning_rate": 2.0811342375682065e-06, + "loss": 0.2659, + "step": 10062 + }, + { + "epoch": 0.797227173697762, + "grad_norm": 1.394589371776751, + "learning_rate": 2.0795675043143016e-06, + "loss": 0.1196, + "step": 10063 + }, + { + "epoch": 0.7973063973063973, + "grad_norm": 1.260542638084776, + "learning_rate": 2.0780012925827653e-06, + "loss": 0.0991, + "step": 10064 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 1.399681603253, + "learning_rate": 2.0764356024767228e-06, + "loss": 0.2049, + "step": 10065 + }, + { + "epoch": 0.7974648445236681, + "grad_norm": 1.7207505402694168, + "learning_rate": 2.0748704340992743e-06, + "loss": 0.1899, + "step": 10066 + }, + { + "epoch": 0.7975440681323034, + "grad_norm": 1.3770166180821815, + "learning_rate": 2.0733057875534734e-06, + "loss": 0.1627, + "step": 10067 + }, + { + "epoch": 0.7976232917409388, + "grad_norm": 1.309604247854031, + "learning_rate": 2.0717416629423425e-06, + "loss": 0.1297, + "step": 10068 + }, + { + "epoch": 0.7977025153495741, + "grad_norm": 1.59963359782775, + "learning_rate": 2.0701780603688783e-06, + "loss": 0.0945, + "step": 10069 + }, + { + "epoch": 0.7977817389582096, + "grad_norm": 1.3378879486392, + "learning_rate": 2.068614979936032e-06, + "loss": 0.1271, + "step": 10070 + }, + { + "epoch": 0.7978609625668449, + "grad_norm": 1.720023720341635, + "learning_rate": 2.0670524217467237e-06, + "loss": 0.1558, + "step": 10071 + }, + { + "epoch": 0.7979401861754803, + "grad_norm": 1.6811447599097589, + "learning_rate": 2.0654903859038457e-06, + "loss": 0.1417, + "step": 10072 + }, + { + "epoch": 0.7980194097841157, + "grad_norm": 1.5077634819512435, + "learning_rate": 2.0639288725102467e-06, + "loss": 0.1623, + "step": 10073 + }, + { + "epoch": 0.798098633392751, + "grad_norm": 1.2716758822265468, + "learning_rate": 2.0623678816687433e-06, + "loss": 0.095, + "step": 10074 + }, + { + "epoch": 0.7981778570013864, + "grad_norm": 1.2788909379660118, + "learning_rate": 2.0608074134821243e-06, + "loss": 0.1128, + "step": 10075 + }, + { + "epoch": 0.7982570806100218, + "grad_norm": 1.6234988712115461, + "learning_rate": 2.0592474680531347e-06, + "loss": 0.1477, + "step": 10076 + }, + { + "epoch": 0.7983363042186572, + "grad_norm": 1.4192793330504276, + "learning_rate": 2.0576880454844926e-06, + "loss": 0.1663, + "step": 10077 + }, + { + "epoch": 0.7984155278272925, + "grad_norm": 1.5796686463744987, + "learning_rate": 2.0561291458788736e-06, + "loss": 0.1446, + "step": 10078 + }, + { + "epoch": 0.7984947514359279, + "grad_norm": 1.0989884124112725, + "learning_rate": 2.0545707693389296e-06, + "loss": 0.092, + "step": 10079 + }, + { + "epoch": 0.7985739750445633, + "grad_norm": 1.5998706133694671, + "learning_rate": 2.0530129159672685e-06, + "loss": 0.1817, + "step": 10080 + }, + { + "epoch": 0.7986531986531986, + "grad_norm": 1.1636296558117674, + "learning_rate": 2.0514555858664663e-06, + "loss": 0.0899, + "step": 10081 + }, + { + "epoch": 0.798732422261834, + "grad_norm": 2.364029207493055, + "learning_rate": 2.0498987791390713e-06, + "loss": 0.164, + "step": 10082 + }, + { + "epoch": 0.7988116458704694, + "grad_norm": 1.5230177713705173, + "learning_rate": 2.0483424958875876e-06, + "loss": 0.1112, + "step": 10083 + }, + { + "epoch": 0.7988908694791048, + "grad_norm": 1.271856275702475, + "learning_rate": 2.0467867362144867e-06, + "loss": 0.1193, + "step": 10084 + }, + { + "epoch": 0.7989700930877401, + "grad_norm": 1.8417327971505366, + "learning_rate": 2.0452315002222134e-06, + "loss": 0.1794, + "step": 10085 + }, + { + "epoch": 0.7990493166963755, + "grad_norm": 1.901927848330103, + "learning_rate": 2.04367678801317e-06, + "loss": 0.1663, + "step": 10086 + }, + { + "epoch": 0.7991285403050109, + "grad_norm": 1.2062639002162754, + "learning_rate": 2.0421225996897243e-06, + "loss": 0.1073, + "step": 10087 + }, + { + "epoch": 0.7992077639136462, + "grad_norm": 1.551817034589452, + "learning_rate": 2.0405689353542204e-06, + "loss": 0.1513, + "step": 10088 + }, + { + "epoch": 0.7992869875222817, + "grad_norm": 1.8257863477116614, + "learning_rate": 2.0390157951089506e-06, + "loss": 0.2563, + "step": 10089 + }, + { + "epoch": 0.799366211130917, + "grad_norm": 1.3803848172969628, + "learning_rate": 2.0374631790561815e-06, + "loss": 0.1351, + "step": 10090 + }, + { + "epoch": 0.7994454347395524, + "grad_norm": 1.4254438243035916, + "learning_rate": 2.0359110872981526e-06, + "loss": 0.1775, + "step": 10091 + }, + { + "epoch": 0.7995246583481878, + "grad_norm": 1.4489089324482682, + "learning_rate": 2.034359519937057e-06, + "loss": 0.2376, + "step": 10092 + }, + { + "epoch": 0.7996038819568231, + "grad_norm": 1.2478757956891127, + "learning_rate": 2.032808477075057e-06, + "loss": 0.103, + "step": 10093 + }, + { + "epoch": 0.7996831055654585, + "grad_norm": 1.4629855566074748, + "learning_rate": 2.0312579588142846e-06, + "loss": 0.1324, + "step": 10094 + }, + { + "epoch": 0.7997623291740938, + "grad_norm": 1.8229933961937135, + "learning_rate": 2.029707965256833e-06, + "loss": 0.3169, + "step": 10095 + }, + { + "epoch": 0.7998415527827293, + "grad_norm": 1.5385007592991855, + "learning_rate": 2.0281584965047585e-06, + "loss": 0.1434, + "step": 10096 + }, + { + "epoch": 0.7999207763913646, + "grad_norm": 1.5602318935580481, + "learning_rate": 2.0266095526600925e-06, + "loss": 0.1478, + "step": 10097 + }, + { + "epoch": 0.8, + "grad_norm": 1.8613677904147585, + "learning_rate": 2.0250611338248215e-06, + "loss": 0.1783, + "step": 10098 + }, + { + "epoch": 0.8000792236086354, + "grad_norm": 1.1816379135958381, + "learning_rate": 2.0235132401008985e-06, + "loss": 0.1016, + "step": 10099 + }, + { + "epoch": 0.8001584472172707, + "grad_norm": 1.508357582334823, + "learning_rate": 2.0219658715902514e-06, + "loss": 0.1201, + "step": 10100 + }, + { + "epoch": 0.8002376708259061, + "grad_norm": 1.700854720807967, + "learning_rate": 2.0204190283947645e-06, + "loss": 0.1715, + "step": 10101 + }, + { + "epoch": 0.8003168944345415, + "grad_norm": 1.3664397670804829, + "learning_rate": 2.0188727106162874e-06, + "loss": 0.0966, + "step": 10102 + }, + { + "epoch": 0.8003961180431769, + "grad_norm": 2.385838496637402, + "learning_rate": 2.017326918356639e-06, + "loss": 0.2359, + "step": 10103 + }, + { + "epoch": 0.8004753416518122, + "grad_norm": 1.4365508647778298, + "learning_rate": 2.0157816517176045e-06, + "loss": 0.1421, + "step": 10104 + }, + { + "epoch": 0.8005545652604477, + "grad_norm": 1.943683144532721, + "learning_rate": 2.0142369108009306e-06, + "loss": 0.2097, + "step": 10105 + }, + { + "epoch": 0.800633788869083, + "grad_norm": 1.3436318033767165, + "learning_rate": 2.012692695708328e-06, + "loss": 0.1057, + "step": 10106 + }, + { + "epoch": 0.8007130124777183, + "grad_norm": 1.5499590469250224, + "learning_rate": 2.011149006541483e-06, + "loss": 0.1768, + "step": 10107 + }, + { + "epoch": 0.8007922360863537, + "grad_norm": 1.6404213910855365, + "learning_rate": 2.0096058434020348e-06, + "loss": 0.1633, + "step": 10108 + }, + { + "epoch": 0.8008714596949891, + "grad_norm": 1.6223471620885814, + "learning_rate": 2.0080632063915927e-06, + "loss": 0.1328, + "step": 10109 + }, + { + "epoch": 0.8009506833036245, + "grad_norm": 1.3071802592812904, + "learning_rate": 2.0065210956117354e-06, + "loss": 0.1393, + "step": 10110 + }, + { + "epoch": 0.8010299069122598, + "grad_norm": 1.5464149552672812, + "learning_rate": 2.0049795111640023e-06, + "loss": 0.1868, + "step": 10111 + }, + { + "epoch": 0.8011091305208953, + "grad_norm": 1.198399349005077, + "learning_rate": 2.0034384531498962e-06, + "loss": 0.1185, + "step": 10112 + }, + { + "epoch": 0.8011883541295306, + "grad_norm": 1.7867543862279636, + "learning_rate": 2.0018979216708935e-06, + "loss": 0.1933, + "step": 10113 + }, + { + "epoch": 0.8012675777381659, + "grad_norm": 1.8636313720727733, + "learning_rate": 2.000357916828428e-06, + "loss": 0.241, + "step": 10114 + }, + { + "epoch": 0.8013468013468014, + "grad_norm": 1.128102643210593, + "learning_rate": 1.9988184387239027e-06, + "loss": 0.0846, + "step": 10115 + }, + { + "epoch": 0.8014260249554367, + "grad_norm": 1.5799733785609547, + "learning_rate": 1.9972794874586808e-06, + "loss": 0.2104, + "step": 10116 + }, + { + "epoch": 0.8015052485640721, + "grad_norm": 1.4905271844760142, + "learning_rate": 1.9957410631341e-06, + "loss": 0.1756, + "step": 10117 + }, + { + "epoch": 0.8015844721727075, + "grad_norm": 1.3595749645203963, + "learning_rate": 1.9942031658514573e-06, + "loss": 0.1586, + "step": 10118 + }, + { + "epoch": 0.8016636957813429, + "grad_norm": 1.2394181239867947, + "learning_rate": 1.992665795712011e-06, + "loss": 0.1094, + "step": 10119 + }, + { + "epoch": 0.8017429193899782, + "grad_norm": 2.351212147907515, + "learning_rate": 1.991128952816996e-06, + "loss": 0.1992, + "step": 10120 + }, + { + "epoch": 0.8018221429986135, + "grad_norm": 1.4138074303762753, + "learning_rate": 1.9895926372676042e-06, + "loss": 0.1358, + "step": 10121 + }, + { + "epoch": 0.801901366607249, + "grad_norm": 2.550023113461321, + "learning_rate": 1.988056849164991e-06, + "loss": 0.2071, + "step": 10122 + }, + { + "epoch": 0.8019805902158843, + "grad_norm": 1.407072783639433, + "learning_rate": 1.986521588610285e-06, + "loss": 0.1028, + "step": 10123 + }, + { + "epoch": 0.8020598138245197, + "grad_norm": 1.5554430663669578, + "learning_rate": 1.9849868557045738e-06, + "loss": 0.1459, + "step": 10124 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 1.3596984063936324, + "learning_rate": 1.9834526505489105e-06, + "loss": 0.1335, + "step": 10125 + }, + { + "epoch": 0.8022182610417905, + "grad_norm": 1.6867999110951097, + "learning_rate": 1.9819189732443187e-06, + "loss": 0.1894, + "step": 10126 + }, + { + "epoch": 0.8022974846504258, + "grad_norm": 1.2631506900831124, + "learning_rate": 1.9803858238917826e-06, + "loss": 0.1283, + "step": 10127 + }, + { + "epoch": 0.8023767082590612, + "grad_norm": 1.590177265379649, + "learning_rate": 1.97885320259225e-06, + "loss": 0.1529, + "step": 10128 + }, + { + "epoch": 0.8024559318676966, + "grad_norm": 1.7504912380558968, + "learning_rate": 1.9773211094466404e-06, + "loss": 0.1385, + "step": 10129 + }, + { + "epoch": 0.8025351554763319, + "grad_norm": 1.383627748702993, + "learning_rate": 1.975789544555834e-06, + "loss": 0.1581, + "step": 10130 + }, + { + "epoch": 0.8026143790849674, + "grad_norm": 0.9140639757121933, + "learning_rate": 1.9742585080206754e-06, + "loss": 0.0771, + "step": 10131 + }, + { + "epoch": 0.8026936026936027, + "grad_norm": 1.2681141780402247, + "learning_rate": 1.9727279999419745e-06, + "loss": 0.0937, + "step": 10132 + }, + { + "epoch": 0.8027728263022381, + "grad_norm": 1.4437582560043931, + "learning_rate": 1.9711980204205115e-06, + "loss": 0.1411, + "step": 10133 + }, + { + "epoch": 0.8028520499108734, + "grad_norm": 2.0257689897551496, + "learning_rate": 1.9696685695570285e-06, + "loss": 0.1582, + "step": 10134 + }, + { + "epoch": 0.8029312735195088, + "grad_norm": 1.380091833340432, + "learning_rate": 1.9681396474522264e-06, + "loss": 0.1294, + "step": 10135 + }, + { + "epoch": 0.8030104971281442, + "grad_norm": 1.429248547889866, + "learning_rate": 1.966611254206785e-06, + "loss": 0.1574, + "step": 10136 + }, + { + "epoch": 0.8030897207367795, + "grad_norm": 1.7395309804758408, + "learning_rate": 1.9650833899213383e-06, + "loss": 0.2106, + "step": 10137 + }, + { + "epoch": 0.803168944345415, + "grad_norm": 1.543049780955099, + "learning_rate": 1.963556054696487e-06, + "loss": 0.1215, + "step": 10138 + }, + { + "epoch": 0.8032481679540503, + "grad_norm": 1.4159356945475823, + "learning_rate": 1.962029248632802e-06, + "loss": 0.1223, + "step": 10139 + }, + { + "epoch": 0.8033273915626857, + "grad_norm": 1.697947199007826, + "learning_rate": 1.9605029718308156e-06, + "loss": 0.147, + "step": 10140 + }, + { + "epoch": 0.8034066151713211, + "grad_norm": 1.501095133673133, + "learning_rate": 1.958977224391021e-06, + "loss": 0.1293, + "step": 10141 + }, + { + "epoch": 0.8034858387799564, + "grad_norm": 1.4484840347357082, + "learning_rate": 1.957452006413889e-06, + "loss": 0.1859, + "step": 10142 + }, + { + "epoch": 0.8035650623885918, + "grad_norm": 1.7972089667484514, + "learning_rate": 1.955927317999844e-06, + "loss": 0.1473, + "step": 10143 + }, + { + "epoch": 0.8036442859972271, + "grad_norm": 1.5542272362875562, + "learning_rate": 1.9544031592492763e-06, + "loss": 0.2153, + "step": 10144 + }, + { + "epoch": 0.8037235096058626, + "grad_norm": 1.5303449893007977, + "learning_rate": 1.9528795302625515e-06, + "loss": 0.155, + "step": 10145 + }, + { + "epoch": 0.8038027332144979, + "grad_norm": 1.963975460964055, + "learning_rate": 1.951356431139988e-06, + "loss": 0.194, + "step": 10146 + }, + { + "epoch": 0.8038819568231333, + "grad_norm": 1.6653093872282159, + "learning_rate": 1.949833861981877e-06, + "loss": 0.1638, + "step": 10147 + }, + { + "epoch": 0.8039611804317687, + "grad_norm": 2.0885010103761137, + "learning_rate": 1.948311822888468e-06, + "loss": 0.283, + "step": 10148 + }, + { + "epoch": 0.804040404040404, + "grad_norm": 1.2158365030210319, + "learning_rate": 1.9467903139599853e-06, + "loss": 0.1182, + "step": 10149 + }, + { + "epoch": 0.8041196276490394, + "grad_norm": 1.29934060553683, + "learning_rate": 1.945269335296611e-06, + "loss": 0.1091, + "step": 10150 + }, + { + "epoch": 0.8041988512576748, + "grad_norm": 1.422297877206242, + "learning_rate": 1.943748886998492e-06, + "loss": 0.1589, + "step": 10151 + }, + { + "epoch": 0.8042780748663102, + "grad_norm": 1.423596780024782, + "learning_rate": 1.942228969165748e-06, + "loss": 0.1929, + "step": 10152 + }, + { + "epoch": 0.8043572984749455, + "grad_norm": 1.515650404112296, + "learning_rate": 1.940709581898453e-06, + "loss": 0.1561, + "step": 10153 + }, + { + "epoch": 0.8044365220835809, + "grad_norm": 1.4124082812732461, + "learning_rate": 1.9391907252966522e-06, + "loss": 0.1675, + "step": 10154 + }, + { + "epoch": 0.8045157456922163, + "grad_norm": 1.8424677881916665, + "learning_rate": 1.9376723994603574e-06, + "loss": 0.2195, + "step": 10155 + }, + { + "epoch": 0.8045949693008516, + "grad_norm": 1.4488302868411622, + "learning_rate": 1.936154604489543e-06, + "loss": 0.141, + "step": 10156 + }, + { + "epoch": 0.804674192909487, + "grad_norm": 1.3601410458661092, + "learning_rate": 1.9346373404841433e-06, + "loss": 0.1442, + "step": 10157 + }, + { + "epoch": 0.8047534165181224, + "grad_norm": 1.5949133377001212, + "learning_rate": 1.93312060754407e-06, + "loss": 0.1283, + "step": 10158 + }, + { + "epoch": 0.8048326401267578, + "grad_norm": 1.5321869344100048, + "learning_rate": 1.9316044057691886e-06, + "loss": 0.1166, + "step": 10159 + }, + { + "epoch": 0.8049118637353931, + "grad_norm": 1.4355298856536187, + "learning_rate": 1.9300887352593355e-06, + "loss": 0.1761, + "step": 10160 + }, + { + "epoch": 0.8049910873440285, + "grad_norm": 1.581620378126073, + "learning_rate": 1.928573596114306e-06, + "loss": 0.1828, + "step": 10161 + }, + { + "epoch": 0.8050703109526639, + "grad_norm": 1.1867670072055803, + "learning_rate": 1.9270589884338706e-06, + "loss": 0.1132, + "step": 10162 + }, + { + "epoch": 0.8051495345612992, + "grad_norm": 1.483492389032306, + "learning_rate": 1.9255449123177563e-06, + "loss": 0.1239, + "step": 10163 + }, + { + "epoch": 0.8052287581699347, + "grad_norm": 1.937197663060176, + "learning_rate": 1.924031367865655e-06, + "loss": 0.2173, + "step": 10164 + }, + { + "epoch": 0.80530798177857, + "grad_norm": 1.3902151136750005, + "learning_rate": 1.922518355177232e-06, + "loss": 0.1544, + "step": 10165 + }, + { + "epoch": 0.8053872053872054, + "grad_norm": 1.5249798481509889, + "learning_rate": 1.921005874352109e-06, + "loss": 0.1901, + "step": 10166 + }, + { + "epoch": 0.8054664289958408, + "grad_norm": 1.5704410352847817, + "learning_rate": 1.9194939254898746e-06, + "loss": 0.2062, + "step": 10167 + }, + { + "epoch": 0.8055456526044761, + "grad_norm": 1.6748794058153496, + "learning_rate": 1.917982508690085e-06, + "loss": 0.1676, + "step": 10168 + }, + { + "epoch": 0.8056248762131115, + "grad_norm": 1.6403306464579648, + "learning_rate": 1.916471624052256e-06, + "loss": 0.1739, + "step": 10169 + }, + { + "epoch": 0.8057040998217468, + "grad_norm": 1.9178260160582072, + "learning_rate": 1.914961271675879e-06, + "loss": 0.1947, + "step": 10170 + }, + { + "epoch": 0.8057833234303823, + "grad_norm": 1.5429835706375996, + "learning_rate": 1.9134514516603987e-06, + "loss": 0.1252, + "step": 10171 + }, + { + "epoch": 0.8058625470390176, + "grad_norm": 1.3060493762375314, + "learning_rate": 1.9119421641052294e-06, + "loss": 0.1245, + "step": 10172 + }, + { + "epoch": 0.805941770647653, + "grad_norm": 1.3233080574823295, + "learning_rate": 1.91043340910975e-06, + "loss": 0.1055, + "step": 10173 + }, + { + "epoch": 0.8060209942562884, + "grad_norm": 1.4000093105511395, + "learning_rate": 1.908925186773308e-06, + "loss": 0.0821, + "step": 10174 + }, + { + "epoch": 0.8061002178649237, + "grad_norm": 1.339805727400341, + "learning_rate": 1.907417497195211e-06, + "loss": 0.0893, + "step": 10175 + }, + { + "epoch": 0.8061794414735591, + "grad_norm": 1.3999300862862118, + "learning_rate": 1.9059103404747303e-06, + "loss": 0.2248, + "step": 10176 + }, + { + "epoch": 0.8062586650821945, + "grad_norm": 1.3871411333071113, + "learning_rate": 1.9044037167111096e-06, + "loss": 0.1595, + "step": 10177 + }, + { + "epoch": 0.8063378886908299, + "grad_norm": 1.3162439612056145, + "learning_rate": 1.9028976260035515e-06, + "loss": 0.1314, + "step": 10178 + }, + { + "epoch": 0.8064171122994652, + "grad_norm": 1.3438839106795213, + "learning_rate": 1.901392068451221e-06, + "loss": 0.1342, + "step": 10179 + }, + { + "epoch": 0.8064963359081007, + "grad_norm": 1.6214944652941157, + "learning_rate": 1.8998870441532569e-06, + "loss": 0.1763, + "step": 10180 + }, + { + "epoch": 0.806575559516736, + "grad_norm": 1.4374563650281171, + "learning_rate": 1.8983825532087551e-06, + "loss": 0.1284, + "step": 10181 + }, + { + "epoch": 0.8066547831253713, + "grad_norm": 1.3766630589202025, + "learning_rate": 1.8968785957167779e-06, + "loss": 0.1376, + "step": 10182 + }, + { + "epoch": 0.8067340067340067, + "grad_norm": 1.4271095294128944, + "learning_rate": 1.8953751717763592e-06, + "loss": 0.1351, + "step": 10183 + }, + { + "epoch": 0.8068132303426421, + "grad_norm": 1.6732051555301723, + "learning_rate": 1.8938722814864863e-06, + "loss": 0.142, + "step": 10184 + }, + { + "epoch": 0.8068924539512775, + "grad_norm": 1.9145342038410325, + "learning_rate": 1.8923699249461214e-06, + "loss": 0.2429, + "step": 10185 + }, + { + "epoch": 0.8069716775599128, + "grad_norm": 1.6207421022625255, + "learning_rate": 1.890868102254182e-06, + "loss": 0.1901, + "step": 10186 + }, + { + "epoch": 0.8070509011685483, + "grad_norm": 3.2524947785552434, + "learning_rate": 1.8893668135095611e-06, + "loss": 0.1623, + "step": 10187 + }, + { + "epoch": 0.8071301247771836, + "grad_norm": 1.9078402332663684, + "learning_rate": 1.8878660588111108e-06, + "loss": 0.2134, + "step": 10188 + }, + { + "epoch": 0.8072093483858189, + "grad_norm": 1.689520950573459, + "learning_rate": 1.8863658382576444e-06, + "loss": 0.117, + "step": 10189 + }, + { + "epoch": 0.8072885719944544, + "grad_norm": 1.3460928841631432, + "learning_rate": 1.8848661519479504e-06, + "loss": 0.1252, + "step": 10190 + }, + { + "epoch": 0.8073677956030897, + "grad_norm": 1.2925654138633549, + "learning_rate": 1.8833669999807723e-06, + "loss": 0.1263, + "step": 10191 + }, + { + "epoch": 0.8074470192117251, + "grad_norm": 1.547395445615637, + "learning_rate": 1.88186838245482e-06, + "loss": 0.1644, + "step": 10192 + }, + { + "epoch": 0.8075262428203605, + "grad_norm": 1.4744145856896094, + "learning_rate": 1.8803702994687755e-06, + "loss": 0.1493, + "step": 10193 + }, + { + "epoch": 0.8076054664289959, + "grad_norm": 2.0520884858072597, + "learning_rate": 1.8788727511212768e-06, + "loss": 0.2142, + "step": 10194 + }, + { + "epoch": 0.8076846900376312, + "grad_norm": 1.4537298009652566, + "learning_rate": 1.8773757375109292e-06, + "loss": 0.1166, + "step": 10195 + }, + { + "epoch": 0.8077639136462665, + "grad_norm": 1.7031439386205676, + "learning_rate": 1.8758792587363084e-06, + "loss": 0.1472, + "step": 10196 + }, + { + "epoch": 0.807843137254902, + "grad_norm": 1.4041291261632518, + "learning_rate": 1.8743833148959479e-06, + "loss": 0.1381, + "step": 10197 + }, + { + "epoch": 0.8079223608635373, + "grad_norm": 1.4175470188767687, + "learning_rate": 1.8728879060883443e-06, + "loss": 0.1386, + "step": 10198 + }, + { + "epoch": 0.8080015844721727, + "grad_norm": 1.5324494717929245, + "learning_rate": 1.8713930324119711e-06, + "loss": 0.1304, + "step": 10199 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 1.6193221912454494, + "learning_rate": 1.869898693965253e-06, + "loss": 0.1355, + "step": 10200 + }, + { + "epoch": 0.8081600316894435, + "grad_norm": 2.1245017621506683, + "learning_rate": 1.868404890846587e-06, + "loss": 0.1498, + "step": 10201 + }, + { + "epoch": 0.8082392552980788, + "grad_norm": 1.7341259367735058, + "learning_rate": 1.8669116231543294e-06, + "loss": 0.0997, + "step": 10202 + }, + { + "epoch": 0.8083184789067142, + "grad_norm": 1.5921317302274582, + "learning_rate": 1.865418890986811e-06, + "loss": 0.1963, + "step": 10203 + }, + { + "epoch": 0.8083977025153496, + "grad_norm": 1.5524424906902254, + "learning_rate": 1.8639266944423163e-06, + "loss": 0.0806, + "step": 10204 + }, + { + "epoch": 0.8084769261239849, + "grad_norm": 1.503889931488173, + "learning_rate": 1.8624350336190977e-06, + "loss": 0.1831, + "step": 10205 + }, + { + "epoch": 0.8085561497326204, + "grad_norm": 2.4269029017068333, + "learning_rate": 1.8609439086153803e-06, + "loss": 0.1867, + "step": 10206 + }, + { + "epoch": 0.8086353733412557, + "grad_norm": 1.628669563698834, + "learning_rate": 1.859453319529343e-06, + "loss": 0.2021, + "step": 10207 + }, + { + "epoch": 0.8087145969498911, + "grad_norm": 1.4857230049323982, + "learning_rate": 1.857963266459133e-06, + "loss": 0.1583, + "step": 10208 + }, + { + "epoch": 0.8087938205585264, + "grad_norm": 1.489637359184151, + "learning_rate": 1.8564737495028673e-06, + "loss": 0.1191, + "step": 10209 + }, + { + "epoch": 0.8088730441671618, + "grad_norm": 1.9414988822093187, + "learning_rate": 1.854984768758621e-06, + "loss": 0.2305, + "step": 10210 + }, + { + "epoch": 0.8089522677757972, + "grad_norm": 1.5178442810974178, + "learning_rate": 1.853496324324434e-06, + "loss": 0.1273, + "step": 10211 + }, + { + "epoch": 0.8090314913844325, + "grad_norm": 1.4632503103421797, + "learning_rate": 1.8520084162983176e-06, + "loss": 0.1632, + "step": 10212 + }, + { + "epoch": 0.809110714993068, + "grad_norm": 1.361152823939929, + "learning_rate": 1.8505210447782418e-06, + "loss": 0.1173, + "step": 10213 + }, + { + "epoch": 0.8091899386017033, + "grad_norm": 1.4494503293152026, + "learning_rate": 1.8490342098621395e-06, + "loss": 0.1612, + "step": 10214 + }, + { + "epoch": 0.8092691622103387, + "grad_norm": 1.4320130612131827, + "learning_rate": 1.8475479116479166e-06, + "loss": 0.1323, + "step": 10215 + }, + { + "epoch": 0.8093483858189741, + "grad_norm": 1.9164216849273914, + "learning_rate": 1.8460621502334375e-06, + "loss": 0.2167, + "step": 10216 + }, + { + "epoch": 0.8094276094276094, + "grad_norm": 2.1218331232522107, + "learning_rate": 1.8445769257165314e-06, + "loss": 0.279, + "step": 10217 + }, + { + "epoch": 0.8095068330362448, + "grad_norm": 1.3690367984547889, + "learning_rate": 1.8430922381949912e-06, + "loss": 0.1227, + "step": 10218 + }, + { + "epoch": 0.8095860566448801, + "grad_norm": 1.2720416047960106, + "learning_rate": 1.84160808776658e-06, + "loss": 0.1168, + "step": 10219 + }, + { + "epoch": 0.8096652802535156, + "grad_norm": 1.2754975272351605, + "learning_rate": 1.8401244745290214e-06, + "loss": 0.1489, + "step": 10220 + }, + { + "epoch": 0.8097445038621509, + "grad_norm": 1.684514865833414, + "learning_rate": 1.838641398580001e-06, + "loss": 0.1547, + "step": 10221 + }, + { + "epoch": 0.8098237274707863, + "grad_norm": 1.2835216458748944, + "learning_rate": 1.8371588600171764e-06, + "loss": 0.135, + "step": 10222 + }, + { + "epoch": 0.8099029510794217, + "grad_norm": 1.1807354882395644, + "learning_rate": 1.8356768589381646e-06, + "loss": 0.1333, + "step": 10223 + }, + { + "epoch": 0.809982174688057, + "grad_norm": 1.4277479278290404, + "learning_rate": 1.8341953954405434e-06, + "loss": 0.1034, + "step": 10224 + }, + { + "epoch": 0.8100613982966924, + "grad_norm": 1.3890390353434057, + "learning_rate": 1.832714469621868e-06, + "loss": 0.1501, + "step": 10225 + }, + { + "epoch": 0.8101406219053278, + "grad_norm": 1.32985190738275, + "learning_rate": 1.8312340815796458e-06, + "loss": 0.1423, + "step": 10226 + }, + { + "epoch": 0.8102198455139632, + "grad_norm": 1.6146013798383707, + "learning_rate": 1.8297542314113515e-06, + "loss": 0.1788, + "step": 10227 + }, + { + "epoch": 0.8102990691225985, + "grad_norm": 1.555922643750267, + "learning_rate": 1.82827491921443e-06, + "loss": 0.1985, + "step": 10228 + }, + { + "epoch": 0.810378292731234, + "grad_norm": 1.4560983811400832, + "learning_rate": 1.8267961450862859e-06, + "loss": 0.1647, + "step": 10229 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 1.3916763100502931, + "learning_rate": 1.8253179091242868e-06, + "loss": 0.1692, + "step": 10230 + }, + { + "epoch": 0.8105367399485046, + "grad_norm": 1.3225720232642015, + "learning_rate": 1.8238402114257714e-06, + "loss": 0.1291, + "step": 10231 + }, + { + "epoch": 0.81061596355714, + "grad_norm": 1.6022248695588641, + "learning_rate": 1.8223630520880365e-06, + "loss": 0.15, + "step": 10232 + }, + { + "epoch": 0.8106951871657754, + "grad_norm": 1.499400266212322, + "learning_rate": 1.8208864312083462e-06, + "loss": 0.1798, + "step": 10233 + }, + { + "epoch": 0.8107744107744108, + "grad_norm": 1.0841858671403477, + "learning_rate": 1.8194103488839265e-06, + "loss": 0.0903, + "step": 10234 + }, + { + "epoch": 0.8108536343830461, + "grad_norm": 1.9480837370727113, + "learning_rate": 1.817934805211976e-06, + "loss": 0.2158, + "step": 10235 + }, + { + "epoch": 0.8109328579916815, + "grad_norm": 1.0799161844457754, + "learning_rate": 1.8164598002896484e-06, + "loss": 0.0699, + "step": 10236 + }, + { + "epoch": 0.8110120816003169, + "grad_norm": 1.650267265375426, + "learning_rate": 1.8149853342140644e-06, + "loss": 0.1791, + "step": 10237 + }, + { + "epoch": 0.8110913052089522, + "grad_norm": 1.5440341658163712, + "learning_rate": 1.8135114070823145e-06, + "loss": 0.122, + "step": 10238 + }, + { + "epoch": 0.8111705288175877, + "grad_norm": 1.3154770436476249, + "learning_rate": 1.8120380189914476e-06, + "loss": 0.1393, + "step": 10239 + }, + { + "epoch": 0.811249752426223, + "grad_norm": 1.3547774727000639, + "learning_rate": 1.8105651700384764e-06, + "loss": 0.13, + "step": 10240 + }, + { + "epoch": 0.8113289760348584, + "grad_norm": 1.5862613403045687, + "learning_rate": 1.8090928603203871e-06, + "loss": 0.1583, + "step": 10241 + }, + { + "epoch": 0.8114081996434938, + "grad_norm": 1.0949917882524616, + "learning_rate": 1.8076210899341196e-06, + "loss": 0.0723, + "step": 10242 + }, + { + "epoch": 0.8114874232521291, + "grad_norm": 1.3598693622546882, + "learning_rate": 1.8061498589765824e-06, + "loss": 0.1375, + "step": 10243 + }, + { + "epoch": 0.8115666468607645, + "grad_norm": 1.3442668370230686, + "learning_rate": 1.804679167544655e-06, + "loss": 0.1103, + "step": 10244 + }, + { + "epoch": 0.8116458704693998, + "grad_norm": 1.8530545587670106, + "learning_rate": 1.8032090157351701e-06, + "loss": 0.2337, + "step": 10245 + }, + { + "epoch": 0.8117250940780353, + "grad_norm": 1.6407431942640982, + "learning_rate": 1.8017394036449276e-06, + "loss": 0.1247, + "step": 10246 + }, + { + "epoch": 0.8118043176866706, + "grad_norm": 1.7582120303689612, + "learning_rate": 1.8002703313706993e-06, + "loss": 0.1684, + "step": 10247 + }, + { + "epoch": 0.811883541295306, + "grad_norm": 1.4737780048789422, + "learning_rate": 1.7988017990092167e-06, + "loss": 0.1663, + "step": 10248 + }, + { + "epoch": 0.8119627649039414, + "grad_norm": 1.4752616047110867, + "learning_rate": 1.797333806657171e-06, + "loss": 0.1853, + "step": 10249 + }, + { + "epoch": 0.8120419885125767, + "grad_norm": 1.5995545597588632, + "learning_rate": 1.7958663544112277e-06, + "loss": 0.1781, + "step": 10250 + }, + { + "epoch": 0.8121212121212121, + "grad_norm": 1.5764097328095472, + "learning_rate": 1.794399442368009e-06, + "loss": 0.2008, + "step": 10251 + }, + { + "epoch": 0.8122004357298475, + "grad_norm": 1.7650027714958156, + "learning_rate": 1.7929330706241023e-06, + "loss": 0.1848, + "step": 10252 + }, + { + "epoch": 0.8122796593384829, + "grad_norm": 1.640980019165187, + "learning_rate": 1.7914672392760645e-06, + "loss": 0.1987, + "step": 10253 + }, + { + "epoch": 0.8123588829471182, + "grad_norm": 1.3983029711889268, + "learning_rate": 1.7900019484204135e-06, + "loss": 0.1617, + "step": 10254 + }, + { + "epoch": 0.8124381065557537, + "grad_norm": 1.6621175032104232, + "learning_rate": 1.788537198153627e-06, + "loss": 0.2053, + "step": 10255 + }, + { + "epoch": 0.812517330164389, + "grad_norm": 1.3284279640787715, + "learning_rate": 1.787072988572157e-06, + "loss": 0.1618, + "step": 10256 + }, + { + "epoch": 0.8125965537730243, + "grad_norm": 1.319564027415114, + "learning_rate": 1.7856093197724133e-06, + "loss": 0.2032, + "step": 10257 + }, + { + "epoch": 0.8126757773816597, + "grad_norm": 1.2499998373100147, + "learning_rate": 1.7841461918507708e-06, + "loss": 0.1222, + "step": 10258 + }, + { + "epoch": 0.8127550009902951, + "grad_norm": 2.3355292251028925, + "learning_rate": 1.7826836049035655e-06, + "loss": 0.2188, + "step": 10259 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 1.5548397470585844, + "learning_rate": 1.7812215590271099e-06, + "loss": 0.1365, + "step": 10260 + }, + { + "epoch": 0.8129134482075658, + "grad_norm": 1.3376105877286772, + "learning_rate": 1.7797600543176675e-06, + "loss": 0.1327, + "step": 10261 + }, + { + "epoch": 0.8129926718162013, + "grad_norm": 1.6440723227014375, + "learning_rate": 1.7782990908714703e-06, + "loss": 0.1846, + "step": 10262 + }, + { + "epoch": 0.8130718954248366, + "grad_norm": 1.3063096188681953, + "learning_rate": 1.7768386687847194e-06, + "loss": 0.1859, + "step": 10263 + }, + { + "epoch": 0.8131511190334719, + "grad_norm": 1.4561815971044818, + "learning_rate": 1.7753787881535757e-06, + "loss": 0.1237, + "step": 10264 + }, + { + "epoch": 0.8132303426421074, + "grad_norm": 1.487580613271827, + "learning_rate": 1.7739194490741607e-06, + "loss": 0.1847, + "step": 10265 + }, + { + "epoch": 0.8133095662507427, + "grad_norm": 1.3401571802784014, + "learning_rate": 1.7724606516425724e-06, + "loss": 0.0887, + "step": 10266 + }, + { + "epoch": 0.8133887898593781, + "grad_norm": 1.7160559866542362, + "learning_rate": 1.7710023959548617e-06, + "loss": 0.1992, + "step": 10267 + }, + { + "epoch": 0.8134680134680135, + "grad_norm": 1.9485808893810683, + "learning_rate": 1.7695446821070438e-06, + "loss": 0.1365, + "step": 10268 + }, + { + "epoch": 0.8135472370766489, + "grad_norm": 1.2232951640120895, + "learning_rate": 1.76808751019511e-06, + "loss": 0.1136, + "step": 10269 + }, + { + "epoch": 0.8136264606852842, + "grad_norm": 1.6849270389519992, + "learning_rate": 1.7666308803150045e-06, + "loss": 0.164, + "step": 10270 + }, + { + "epoch": 0.8137056842939195, + "grad_norm": 1.6571948338488582, + "learning_rate": 1.7651747925626383e-06, + "loss": 0.2078, + "step": 10271 + }, + { + "epoch": 0.813784907902555, + "grad_norm": 1.2751020666041997, + "learning_rate": 1.763719247033886e-06, + "loss": 0.133, + "step": 10272 + }, + { + "epoch": 0.8138641315111903, + "grad_norm": 1.7490706940946588, + "learning_rate": 1.762264243824594e-06, + "loss": 0.1533, + "step": 10273 + }, + { + "epoch": 0.8139433551198257, + "grad_norm": 2.1033476044461703, + "learning_rate": 1.7608097830305637e-06, + "loss": 0.1296, + "step": 10274 + }, + { + "epoch": 0.8140225787284611, + "grad_norm": 1.757098556024279, + "learning_rate": 1.7593558647475627e-06, + "loss": 0.2016, + "step": 10275 + }, + { + "epoch": 0.8141018023370965, + "grad_norm": 1.4005303019412985, + "learning_rate": 1.7579024890713282e-06, + "loss": 0.1355, + "step": 10276 + }, + { + "epoch": 0.8141810259457318, + "grad_norm": 1.2944314907712298, + "learning_rate": 1.7564496560975574e-06, + "loss": 0.1137, + "step": 10277 + }, + { + "epoch": 0.8142602495543672, + "grad_norm": 1.2065437881129497, + "learning_rate": 1.7549973659219077e-06, + "loss": 0.1206, + "step": 10278 + }, + { + "epoch": 0.8143394731630026, + "grad_norm": 1.563223827830552, + "learning_rate": 1.7535456186400123e-06, + "loss": 0.1349, + "step": 10279 + }, + { + "epoch": 0.8144186967716379, + "grad_norm": 1.293468899058931, + "learning_rate": 1.7520944143474584e-06, + "loss": 0.1199, + "step": 10280 + }, + { + "epoch": 0.8144979203802734, + "grad_norm": 1.9545872410534244, + "learning_rate": 1.750643753139798e-06, + "loss": 0.2305, + "step": 10281 + }, + { + "epoch": 0.8145771439889087, + "grad_norm": 1.45167296938253, + "learning_rate": 1.749193635112556e-06, + "loss": 0.1846, + "step": 10282 + }, + { + "epoch": 0.8146563675975441, + "grad_norm": 1.4773059217776972, + "learning_rate": 1.7477440603612127e-06, + "loss": 0.12, + "step": 10283 + }, + { + "epoch": 0.8147355912061794, + "grad_norm": 1.6397864836033171, + "learning_rate": 1.746295028981213e-06, + "loss": 0.1707, + "step": 10284 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 1.5442778327129143, + "learning_rate": 1.7448465410679737e-06, + "loss": 0.0818, + "step": 10285 + }, + { + "epoch": 0.8148940384234502, + "grad_norm": 1.6107266736935286, + "learning_rate": 1.7433985967168686e-06, + "loss": 0.1548, + "step": 10286 + }, + { + "epoch": 0.8149732620320855, + "grad_norm": 1.556060702484606, + "learning_rate": 1.7419511960232384e-06, + "loss": 0.143, + "step": 10287 + }, + { + "epoch": 0.815052485640721, + "grad_norm": 1.3043812718686045, + "learning_rate": 1.7405043390823827e-06, + "loss": 0.1298, + "step": 10288 + }, + { + "epoch": 0.8151317092493563, + "grad_norm": 1.5310681664110861, + "learning_rate": 1.7390580259895783e-06, + "loss": 0.1223, + "step": 10289 + }, + { + "epoch": 0.8152109328579917, + "grad_norm": 1.813732762518309, + "learning_rate": 1.7376122568400533e-06, + "loss": 0.1887, + "step": 10290 + }, + { + "epoch": 0.8152901564666271, + "grad_norm": 1.8248206105619527, + "learning_rate": 1.7361670317290014e-06, + "loss": 0.2044, + "step": 10291 + }, + { + "epoch": 0.8153693800752624, + "grad_norm": 1.455164810741571, + "learning_rate": 1.7347223507515908e-06, + "loss": 0.1562, + "step": 10292 + }, + { + "epoch": 0.8154486036838978, + "grad_norm": 1.2684545191068992, + "learning_rate": 1.7332782140029436e-06, + "loss": 0.1588, + "step": 10293 + }, + { + "epoch": 0.8155278272925331, + "grad_norm": 1.3060185125176813, + "learning_rate": 1.7318346215781468e-06, + "loss": 0.1134, + "step": 10294 + }, + { + "epoch": 0.8156070509011686, + "grad_norm": 1.2471042013693805, + "learning_rate": 1.7303915735722586e-06, + "loss": 0.1313, + "step": 10295 + }, + { + "epoch": 0.8156862745098039, + "grad_norm": 1.4156723512949727, + "learning_rate": 1.7289490700802947e-06, + "loss": 0.1812, + "step": 10296 + }, + { + "epoch": 0.8157654981184393, + "grad_norm": 1.4905878848810883, + "learning_rate": 1.727507111197233e-06, + "loss": 0.2018, + "step": 10297 + }, + { + "epoch": 0.8158447217270747, + "grad_norm": 1.469917636532856, + "learning_rate": 1.7260656970180268e-06, + "loss": 0.1375, + "step": 10298 + }, + { + "epoch": 0.81592394533571, + "grad_norm": 1.7008029424453908, + "learning_rate": 1.7246248276375832e-06, + "loss": 0.1848, + "step": 10299 + }, + { + "epoch": 0.8160031689443454, + "grad_norm": 1.4404807252316005, + "learning_rate": 1.7231845031507732e-06, + "loss": 0.153, + "step": 10300 + }, + { + "epoch": 0.8160823925529808, + "grad_norm": 1.5259418737319537, + "learning_rate": 1.72174472365244e-06, + "loss": 0.1729, + "step": 10301 + }, + { + "epoch": 0.8161616161616162, + "grad_norm": 2.2294044179077632, + "learning_rate": 1.720305489237385e-06, + "loss": 0.1845, + "step": 10302 + }, + { + "epoch": 0.8162408397702515, + "grad_norm": 1.51770991657262, + "learning_rate": 1.718866800000375e-06, + "loss": 0.1429, + "step": 10303 + }, + { + "epoch": 0.816320063378887, + "grad_norm": 1.5252263534915778, + "learning_rate": 1.7174286560361364e-06, + "loss": 0.246, + "step": 10304 + }, + { + "epoch": 0.8163992869875223, + "grad_norm": 1.2684446449199165, + "learning_rate": 1.7159910574393702e-06, + "loss": 0.1178, + "step": 10305 + }, + { + "epoch": 0.8164785105961576, + "grad_norm": 1.4574327547842165, + "learning_rate": 1.7145540043047327e-06, + "loss": 0.1593, + "step": 10306 + }, + { + "epoch": 0.816557734204793, + "grad_norm": 1.2894628056887252, + "learning_rate": 1.713117496726845e-06, + "loss": 0.1245, + "step": 10307 + }, + { + "epoch": 0.8166369578134284, + "grad_norm": 1.9009598357048905, + "learning_rate": 1.711681534800298e-06, + "loss": 0.1963, + "step": 10308 + }, + { + "epoch": 0.8167161814220638, + "grad_norm": 1.6017773650496285, + "learning_rate": 1.7102461186196418e-06, + "loss": 0.193, + "step": 10309 + }, + { + "epoch": 0.8167954050306991, + "grad_norm": 1.7704882477449502, + "learning_rate": 1.7088112482793872e-06, + "loss": 0.1956, + "step": 10310 + }, + { + "epoch": 0.8168746286393346, + "grad_norm": 1.827845277119032, + "learning_rate": 1.7073769238740213e-06, + "loss": 0.178, + "step": 10311 + }, + { + "epoch": 0.8169538522479699, + "grad_norm": 1.338857371424728, + "learning_rate": 1.7059431454979825e-06, + "loss": 0.1071, + "step": 10312 + }, + { + "epoch": 0.8170330758566052, + "grad_norm": 1.5104697165527032, + "learning_rate": 1.7045099132456766e-06, + "loss": 0.1268, + "step": 10313 + }, + { + "epoch": 0.8171122994652407, + "grad_norm": 2.3705614942161084, + "learning_rate": 1.7030772272114803e-06, + "loss": 0.2403, + "step": 10314 + }, + { + "epoch": 0.817191523073876, + "grad_norm": 1.551941842256551, + "learning_rate": 1.7016450874897273e-06, + "loss": 0.1075, + "step": 10315 + }, + { + "epoch": 0.8172707466825114, + "grad_norm": 1.6601032928315478, + "learning_rate": 1.7002134941747116e-06, + "loss": 0.1404, + "step": 10316 + }, + { + "epoch": 0.8173499702911468, + "grad_norm": 1.446943438006338, + "learning_rate": 1.698782447360705e-06, + "loss": 0.1211, + "step": 10317 + }, + { + "epoch": 0.8174291938997821, + "grad_norm": 1.1791993478590417, + "learning_rate": 1.697351947141932e-06, + "loss": 0.077, + "step": 10318 + }, + { + "epoch": 0.8175084175084175, + "grad_norm": 1.8786981615817122, + "learning_rate": 1.6959219936125827e-06, + "loss": 0.1996, + "step": 10319 + }, + { + "epoch": 0.8175876411170528, + "grad_norm": 1.4084962785176613, + "learning_rate": 1.6944925868668106e-06, + "loss": 0.1438, + "step": 10320 + }, + { + "epoch": 0.8176668647256883, + "grad_norm": 1.5637447247709173, + "learning_rate": 1.6930637269987415e-06, + "loss": 0.1424, + "step": 10321 + }, + { + "epoch": 0.8177460883343236, + "grad_norm": 1.6968352641284936, + "learning_rate": 1.691635414102455e-06, + "loss": 0.1664, + "step": 10322 + }, + { + "epoch": 0.817825311942959, + "grad_norm": 1.6194239769975236, + "learning_rate": 1.6902076482719987e-06, + "loss": 0.1361, + "step": 10323 + }, + { + "epoch": 0.8179045355515944, + "grad_norm": 1.706610780483751, + "learning_rate": 1.6887804296013854e-06, + "loss": 0.2111, + "step": 10324 + }, + { + "epoch": 0.8179837591602297, + "grad_norm": 1.589586373119487, + "learning_rate": 1.6873537581845866e-06, + "loss": 0.182, + "step": 10325 + }, + { + "epoch": 0.8180629827688651, + "grad_norm": 2.3782431269676243, + "learning_rate": 1.6859276341155483e-06, + "loss": 0.2385, + "step": 10326 + }, + { + "epoch": 0.8181422063775005, + "grad_norm": 1.6419110174945446, + "learning_rate": 1.68450205748817e-06, + "loss": 0.1624, + "step": 10327 + }, + { + "epoch": 0.8182214299861359, + "grad_norm": 1.8000388462286745, + "learning_rate": 1.6830770283963194e-06, + "loss": 0.1585, + "step": 10328 + }, + { + "epoch": 0.8183006535947712, + "grad_norm": 1.5882387791068995, + "learning_rate": 1.6816525469338252e-06, + "loss": 0.1745, + "step": 10329 + }, + { + "epoch": 0.8183798772034067, + "grad_norm": 1.5503896355813822, + "learning_rate": 1.6802286131944889e-06, + "loss": 0.1778, + "step": 10330 + }, + { + "epoch": 0.818459100812042, + "grad_norm": 1.8213112782804408, + "learning_rate": 1.6788052272720656e-06, + "loss": 0.1644, + "step": 10331 + }, + { + "epoch": 0.8185383244206773, + "grad_norm": 1.4065908752001892, + "learning_rate": 1.677382389260277e-06, + "loss": 0.1372, + "step": 10332 + }, + { + "epoch": 0.8186175480293127, + "grad_norm": 1.5053742806512196, + "learning_rate": 1.6759600992528147e-06, + "loss": 0.136, + "step": 10333 + }, + { + "epoch": 0.8186967716379481, + "grad_norm": 1.5910039711765025, + "learning_rate": 1.674538357343326e-06, + "loss": 0.1711, + "step": 10334 + }, + { + "epoch": 0.8187759952465835, + "grad_norm": 1.6852041934350679, + "learning_rate": 1.6731171636254263e-06, + "loss": 0.1427, + "step": 10335 + }, + { + "epoch": 0.8188552188552188, + "grad_norm": 1.2593735125698782, + "learning_rate": 1.6716965181926959e-06, + "loss": 0.1498, + "step": 10336 + }, + { + "epoch": 0.8189344424638543, + "grad_norm": 1.6431241496499942, + "learning_rate": 1.670276421138677e-06, + "loss": 0.1702, + "step": 10337 + }, + { + "epoch": 0.8190136660724896, + "grad_norm": 1.59497049054573, + "learning_rate": 1.6688568725568732e-06, + "loss": 0.1609, + "step": 10338 + }, + { + "epoch": 0.8190928896811249, + "grad_norm": 1.2004729142077848, + "learning_rate": 1.6674378725407603e-06, + "loss": 0.1301, + "step": 10339 + }, + { + "epoch": 0.8191721132897604, + "grad_norm": 1.4718200700932034, + "learning_rate": 1.6660194211837687e-06, + "loss": 0.1326, + "step": 10340 + }, + { + "epoch": 0.8192513368983957, + "grad_norm": 1.5987359360750721, + "learning_rate": 1.6646015185792963e-06, + "loss": 0.1958, + "step": 10341 + }, + { + "epoch": 0.8193305605070311, + "grad_norm": 1.6960538878199312, + "learning_rate": 1.6631841648207092e-06, + "loss": 0.1693, + "step": 10342 + }, + { + "epoch": 0.8194097841156665, + "grad_norm": 1.7701877100046661, + "learning_rate": 1.6617673600013295e-06, + "loss": 0.1686, + "step": 10343 + }, + { + "epoch": 0.8194890077243019, + "grad_norm": 1.740906619506329, + "learning_rate": 1.6603511042144494e-06, + "loss": 0.1485, + "step": 10344 + }, + { + "epoch": 0.8195682313329372, + "grad_norm": 1.5507463508608135, + "learning_rate": 1.6589353975533174e-06, + "loss": 0.1508, + "step": 10345 + }, + { + "epoch": 0.8196474549415725, + "grad_norm": 1.204686433958875, + "learning_rate": 1.6575202401111578e-06, + "loss": 0.1497, + "step": 10346 + }, + { + "epoch": 0.819726678550208, + "grad_norm": 1.4986351884970766, + "learning_rate": 1.6561056319811497e-06, + "loss": 0.1733, + "step": 10347 + }, + { + "epoch": 0.8198059021588433, + "grad_norm": 1.5125905018831163, + "learning_rate": 1.654691573256434e-06, + "loss": 0.1467, + "step": 10348 + }, + { + "epoch": 0.8198851257674787, + "grad_norm": 1.7310964853386275, + "learning_rate": 1.653278064030126e-06, + "loss": 0.2099, + "step": 10349 + }, + { + "epoch": 0.8199643493761141, + "grad_norm": 2.1121796978152174, + "learning_rate": 1.651865104395296e-06, + "loss": 0.2821, + "step": 10350 + }, + { + "epoch": 0.8200435729847495, + "grad_norm": 1.78586811416678, + "learning_rate": 1.6504526944449772e-06, + "loss": 0.1883, + "step": 10351 + }, + { + "epoch": 0.8201227965933848, + "grad_norm": 1.0888208275566291, + "learning_rate": 1.6490408342721764e-06, + "loss": 0.1118, + "step": 10352 + }, + { + "epoch": 0.8202020202020202, + "grad_norm": 1.5086020860669018, + "learning_rate": 1.6476295239698537e-06, + "loss": 0.1468, + "step": 10353 + }, + { + "epoch": 0.8202812438106556, + "grad_norm": 1.7931152307010985, + "learning_rate": 1.6462187636309345e-06, + "loss": 0.2269, + "step": 10354 + }, + { + "epoch": 0.8203604674192909, + "grad_norm": 1.7335744727508822, + "learning_rate": 1.6448085533483172e-06, + "loss": 0.2141, + "step": 10355 + }, + { + "epoch": 0.8204396910279264, + "grad_norm": 1.2915757357350675, + "learning_rate": 1.6433988932148547e-06, + "loss": 0.1282, + "step": 10356 + }, + { + "epoch": 0.8205189146365617, + "grad_norm": 1.5612157986408384, + "learning_rate": 1.6419897833233644e-06, + "loss": 0.2239, + "step": 10357 + }, + { + "epoch": 0.8205981382451971, + "grad_norm": 1.6084479981473068, + "learning_rate": 1.6405812237666296e-06, + "loss": 0.1541, + "step": 10358 + }, + { + "epoch": 0.8206773618538324, + "grad_norm": 1.359443615910525, + "learning_rate": 1.6391732146373994e-06, + "loss": 0.1475, + "step": 10359 + }, + { + "epoch": 0.8207565854624678, + "grad_norm": 1.2763808489309336, + "learning_rate": 1.6377657560283844e-06, + "loss": 0.0933, + "step": 10360 + }, + { + "epoch": 0.8208358090711032, + "grad_norm": 1.414698658074898, + "learning_rate": 1.6363588480322545e-06, + "loss": 0.1383, + "step": 10361 + }, + { + "epoch": 0.8209150326797385, + "grad_norm": 1.4651512981191723, + "learning_rate": 1.6349524907416536e-06, + "loss": 0.1686, + "step": 10362 + }, + { + "epoch": 0.820994256288374, + "grad_norm": 1.4025084777632981, + "learning_rate": 1.6335466842491821e-06, + "loss": 0.1921, + "step": 10363 + }, + { + "epoch": 0.8210734798970093, + "grad_norm": 1.4501960291485576, + "learning_rate": 1.6321414286474014e-06, + "loss": 0.1474, + "step": 10364 + }, + { + "epoch": 0.8211527035056447, + "grad_norm": 1.4328133623683392, + "learning_rate": 1.6307367240288463e-06, + "loss": 0.1489, + "step": 10365 + }, + { + "epoch": 0.8212319271142801, + "grad_norm": 1.632903840416775, + "learning_rate": 1.6293325704860087e-06, + "loss": 0.2278, + "step": 10366 + }, + { + "epoch": 0.8213111507229154, + "grad_norm": 1.7143170271539867, + "learning_rate": 1.6279289681113407e-06, + "loss": 0.1557, + "step": 10367 + }, + { + "epoch": 0.8213903743315508, + "grad_norm": 1.3358997278288598, + "learning_rate": 1.626525916997269e-06, + "loss": 0.1158, + "step": 10368 + }, + { + "epoch": 0.8214695979401861, + "grad_norm": 1.242847283899903, + "learning_rate": 1.6251234172361763e-06, + "loss": 0.1139, + "step": 10369 + }, + { + "epoch": 0.8215488215488216, + "grad_norm": 1.4258896855857521, + "learning_rate": 1.623721468920405e-06, + "loss": 0.1187, + "step": 10370 + }, + { + "epoch": 0.8216280451574569, + "grad_norm": 1.5389124303075783, + "learning_rate": 1.6223200721422739e-06, + "loss": 0.1477, + "step": 10371 + }, + { + "epoch": 0.8217072687660923, + "grad_norm": 2.3975215197648096, + "learning_rate": 1.6209192269940555e-06, + "loss": 0.1429, + "step": 10372 + }, + { + "epoch": 0.8217864923747277, + "grad_norm": 1.2650057683133027, + "learning_rate": 1.6195189335679884e-06, + "loss": 0.1505, + "step": 10373 + }, + { + "epoch": 0.821865715983363, + "grad_norm": 1.8335978879588404, + "learning_rate": 1.6181191919562734e-06, + "loss": 0.2252, + "step": 10374 + }, + { + "epoch": 0.8219449395919984, + "grad_norm": 1.4650376424076854, + "learning_rate": 1.6167200022510799e-06, + "loss": 0.1748, + "step": 10375 + }, + { + "epoch": 0.8220241632006338, + "grad_norm": 1.955988790984777, + "learning_rate": 1.6153213645445376e-06, + "loss": 0.2561, + "step": 10376 + }, + { + "epoch": 0.8221033868092692, + "grad_norm": 1.5508357430436572, + "learning_rate": 1.613923278928735e-06, + "loss": 0.1829, + "step": 10377 + }, + { + "epoch": 0.8221826104179045, + "grad_norm": 1.8059499301520012, + "learning_rate": 1.6125257454957365e-06, + "loss": 0.2098, + "step": 10378 + }, + { + "epoch": 0.82226183402654, + "grad_norm": 2.1598524364571494, + "learning_rate": 1.6111287643375607e-06, + "loss": 0.167, + "step": 10379 + }, + { + "epoch": 0.8223410576351753, + "grad_norm": 2.035988592801404, + "learning_rate": 1.6097323355461869e-06, + "loss": 0.1384, + "step": 10380 + }, + { + "epoch": 0.8224202812438106, + "grad_norm": 1.6738637565697734, + "learning_rate": 1.6083364592135708e-06, + "loss": 0.1385, + "step": 10381 + }, + { + "epoch": 0.822499504852446, + "grad_norm": 1.244503578165528, + "learning_rate": 1.6069411354316212e-06, + "loss": 0.1112, + "step": 10382 + }, + { + "epoch": 0.8225787284610814, + "grad_norm": 1.656622176169255, + "learning_rate": 1.6055463642922098e-06, + "loss": 0.1239, + "step": 10383 + }, + { + "epoch": 0.8226579520697168, + "grad_norm": 1.3428949977064997, + "learning_rate": 1.6041521458871812e-06, + "loss": 0.1622, + "step": 10384 + }, + { + "epoch": 0.8227371756783521, + "grad_norm": 1.5641554576348324, + "learning_rate": 1.6027584803083351e-06, + "loss": 0.1575, + "step": 10385 + }, + { + "epoch": 0.8228163992869876, + "grad_norm": 1.411993080404408, + "learning_rate": 1.6013653676474371e-06, + "loss": 0.1806, + "step": 10386 + }, + { + "epoch": 0.8228956228956229, + "grad_norm": 1.5339240738577442, + "learning_rate": 1.5999728079962197e-06, + "loss": 0.1315, + "step": 10387 + }, + { + "epoch": 0.8229748465042582, + "grad_norm": 1.4727229216951876, + "learning_rate": 1.5985808014463745e-06, + "loss": 0.1464, + "step": 10388 + }, + { + "epoch": 0.8230540701128937, + "grad_norm": 2.2163658001990325, + "learning_rate": 1.5971893480895583e-06, + "loss": 0.1399, + "step": 10389 + }, + { + "epoch": 0.823133293721529, + "grad_norm": 1.4393577776760622, + "learning_rate": 1.5957984480173893e-06, + "loss": 0.15, + "step": 10390 + }, + { + "epoch": 0.8232125173301644, + "grad_norm": 1.5996092927925485, + "learning_rate": 1.5944081013214575e-06, + "loss": 0.1286, + "step": 10391 + }, + { + "epoch": 0.8232917409387998, + "grad_norm": 1.8738218925004884, + "learning_rate": 1.593018308093306e-06, + "loss": 0.2643, + "step": 10392 + }, + { + "epoch": 0.8233709645474351, + "grad_norm": 1.950226506939809, + "learning_rate": 1.5916290684244452e-06, + "loss": 0.1764, + "step": 10393 + }, + { + "epoch": 0.8234501881560705, + "grad_norm": 1.9310168181998901, + "learning_rate": 1.5902403824063539e-06, + "loss": 0.1719, + "step": 10394 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.842916780135873, + "learning_rate": 1.5888522501304682e-06, + "loss": 0.1874, + "step": 10395 + }, + { + "epoch": 0.8236086353733413, + "grad_norm": 1.5154251082834898, + "learning_rate": 1.587464671688187e-06, + "loss": 0.1503, + "step": 10396 + }, + { + "epoch": 0.8236878589819766, + "grad_norm": 1.58940819301186, + "learning_rate": 1.5860776471708816e-06, + "loss": 0.1947, + "step": 10397 + }, + { + "epoch": 0.823767082590612, + "grad_norm": 1.4734192225503464, + "learning_rate": 1.5846911766698781e-06, + "loss": 0.1702, + "step": 10398 + }, + { + "epoch": 0.8238463061992474, + "grad_norm": 1.8562995864213159, + "learning_rate": 1.5833052602764664e-06, + "loss": 0.2415, + "step": 10399 + }, + { + "epoch": 0.8239255298078827, + "grad_norm": 1.3954524152324503, + "learning_rate": 1.5819198980819096e-06, + "loss": 0.1429, + "step": 10400 + }, + { + "epoch": 0.8240047534165181, + "grad_norm": 1.4347241991578803, + "learning_rate": 1.5805350901774197e-06, + "loss": 0.1455, + "step": 10401 + }, + { + "epoch": 0.8240839770251535, + "grad_norm": 1.210332876336636, + "learning_rate": 1.5791508366541797e-06, + "loss": 0.109, + "step": 10402 + }, + { + "epoch": 0.8241632006337889, + "grad_norm": 1.7049231532527356, + "learning_rate": 1.577767137603341e-06, + "loss": 0.1265, + "step": 10403 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 1.0506811936293623, + "learning_rate": 1.5763839931160108e-06, + "loss": 0.0913, + "step": 10404 + }, + { + "epoch": 0.8243216478510597, + "grad_norm": 1.5996917032968943, + "learning_rate": 1.5750014032832617e-06, + "loss": 0.18, + "step": 10405 + }, + { + "epoch": 0.824400871459695, + "grad_norm": 1.4212408655501363, + "learning_rate": 1.5736193681961332e-06, + "loss": 0.1006, + "step": 10406 + }, + { + "epoch": 0.8244800950683303, + "grad_norm": 1.2106947710392142, + "learning_rate": 1.5722378879456234e-06, + "loss": 0.1311, + "step": 10407 + }, + { + "epoch": 0.8245593186769657, + "grad_norm": 1.2166450878347332, + "learning_rate": 1.5708569626226954e-06, + "loss": 0.1128, + "step": 10408 + }, + { + "epoch": 0.8246385422856011, + "grad_norm": 1.219630824386909, + "learning_rate": 1.5694765923182798e-06, + "loss": 0.1022, + "step": 10409 + }, + { + "epoch": 0.8247177658942365, + "grad_norm": 1.3392468145625938, + "learning_rate": 1.5680967771232659e-06, + "loss": 0.1456, + "step": 10410 + }, + { + "epoch": 0.8247969895028718, + "grad_norm": 1.5876683201497666, + "learning_rate": 1.5667175171285054e-06, + "loss": 0.1384, + "step": 10411 + }, + { + "epoch": 0.8248762131115073, + "grad_norm": 1.1821167185607704, + "learning_rate": 1.5653388124248203e-06, + "loss": 0.146, + "step": 10412 + }, + { + "epoch": 0.8249554367201426, + "grad_norm": 1.4446984510941085, + "learning_rate": 1.5639606631029892e-06, + "loss": 0.1493, + "step": 10413 + }, + { + "epoch": 0.8250346603287779, + "grad_norm": 1.8737353338642322, + "learning_rate": 1.5625830692537569e-06, + "loss": 0.249, + "step": 10414 + }, + { + "epoch": 0.8251138839374134, + "grad_norm": 1.45948961535352, + "learning_rate": 1.561206030967828e-06, + "loss": 0.1351, + "step": 10415 + }, + { + "epoch": 0.8251931075460487, + "grad_norm": 1.290693019786443, + "learning_rate": 1.5598295483358804e-06, + "loss": 0.1198, + "step": 10416 + }, + { + "epoch": 0.8252723311546841, + "grad_norm": 1.3390180259225275, + "learning_rate": 1.5584536214485457e-06, + "loss": 0.1261, + "step": 10417 + }, + { + "epoch": 0.8253515547633195, + "grad_norm": 2.063602378896092, + "learning_rate": 1.5570782503964188e-06, + "loss": 0.1914, + "step": 10418 + }, + { + "epoch": 0.8254307783719549, + "grad_norm": 1.4035741308956817, + "learning_rate": 1.5557034352700672e-06, + "loss": 0.12, + "step": 10419 + }, + { + "epoch": 0.8255100019805902, + "grad_norm": 1.5891147409405608, + "learning_rate": 1.5543291761600133e-06, + "loss": 0.1979, + "step": 10420 + }, + { + "epoch": 0.8255892255892255, + "grad_norm": 1.3329053501075112, + "learning_rate": 1.552955473156742e-06, + "loss": 0.1633, + "step": 10421 + }, + { + "epoch": 0.825668449197861, + "grad_norm": 1.5358401531839272, + "learning_rate": 1.5515823263507112e-06, + "loss": 0.1576, + "step": 10422 + }, + { + "epoch": 0.8257476728064963, + "grad_norm": 1.0787482908661634, + "learning_rate": 1.5502097358323321e-06, + "loss": 0.09, + "step": 10423 + }, + { + "epoch": 0.8258268964151317, + "grad_norm": 1.2604716499021176, + "learning_rate": 1.548837701691983e-06, + "loss": 0.1116, + "step": 10424 + }, + { + "epoch": 0.8259061200237671, + "grad_norm": 1.4572633089092795, + "learning_rate": 1.547466224020009e-06, + "loss": 0.1247, + "step": 10425 + }, + { + "epoch": 0.8259853436324025, + "grad_norm": 1.9520664048337526, + "learning_rate": 1.5460953029067128e-06, + "loss": 0.1351, + "step": 10426 + }, + { + "epoch": 0.8260645672410378, + "grad_norm": 1.4268932591572854, + "learning_rate": 1.5447249384423624e-06, + "loss": 0.123, + "step": 10427 + }, + { + "epoch": 0.8261437908496732, + "grad_norm": 1.6448199221259747, + "learning_rate": 1.543355130717189e-06, + "loss": 0.1561, + "step": 10428 + }, + { + "epoch": 0.8262230144583086, + "grad_norm": 1.3967811918732347, + "learning_rate": 1.5419858798213928e-06, + "loss": 0.1254, + "step": 10429 + }, + { + "epoch": 0.8263022380669439, + "grad_norm": 1.2648540259126664, + "learning_rate": 1.540617185845128e-06, + "loss": 0.1489, + "step": 10430 + }, + { + "epoch": 0.8263814616755794, + "grad_norm": 1.4296467965865831, + "learning_rate": 1.5392490488785151e-06, + "loss": 0.1243, + "step": 10431 + }, + { + "epoch": 0.8264606852842147, + "grad_norm": 1.8390185485198758, + "learning_rate": 1.537881469011645e-06, + "loss": 0.2429, + "step": 10432 + }, + { + "epoch": 0.8265399088928501, + "grad_norm": 1.8535120444680417, + "learning_rate": 1.5365144463345627e-06, + "loss": 0.1497, + "step": 10433 + }, + { + "epoch": 0.8266191325014854, + "grad_norm": 1.728725861797571, + "learning_rate": 1.5351479809372772e-06, + "loss": 0.2105, + "step": 10434 + }, + { + "epoch": 0.8266983561101208, + "grad_norm": 1.3596185137767802, + "learning_rate": 1.5337820729097697e-06, + "loss": 0.1135, + "step": 10435 + }, + { + "epoch": 0.8267775797187562, + "grad_norm": 1.6339166712688329, + "learning_rate": 1.5324167223419762e-06, + "loss": 0.1845, + "step": 10436 + }, + { + "epoch": 0.8268568033273915, + "grad_norm": 1.7855396420387195, + "learning_rate": 1.5310519293237958e-06, + "loss": 0.2137, + "step": 10437 + }, + { + "epoch": 0.826936026936027, + "grad_norm": 1.4004420618199764, + "learning_rate": 1.5296876939450978e-06, + "loss": 0.1064, + "step": 10438 + }, + { + "epoch": 0.8270152505446623, + "grad_norm": 1.5502497092379819, + "learning_rate": 1.528324016295709e-06, + "loss": 0.1408, + "step": 10439 + }, + { + "epoch": 0.8270944741532977, + "grad_norm": 1.7836419753485788, + "learning_rate": 1.5269608964654181e-06, + "loss": 0.1171, + "step": 10440 + }, + { + "epoch": 0.8271736977619331, + "grad_norm": 1.6575619634308936, + "learning_rate": 1.525598334543985e-06, + "loss": 0.1442, + "step": 10441 + }, + { + "epoch": 0.8272529213705684, + "grad_norm": 1.7407613428327606, + "learning_rate": 1.524236330621125e-06, + "loss": 0.1845, + "step": 10442 + }, + { + "epoch": 0.8273321449792038, + "grad_norm": 1.4347332428198172, + "learning_rate": 1.5228748847865205e-06, + "loss": 0.1426, + "step": 10443 + }, + { + "epoch": 0.8274113685878391, + "grad_norm": 1.3698020828193718, + "learning_rate": 1.5215139971298131e-06, + "loss": 0.1448, + "step": 10444 + }, + { + "epoch": 0.8274905921964746, + "grad_norm": 1.6739333265735419, + "learning_rate": 1.5201536677406147e-06, + "loss": 0.1556, + "step": 10445 + }, + { + "epoch": 0.8275698158051099, + "grad_norm": 1.4746853690416448, + "learning_rate": 1.518793896708496e-06, + "loss": 0.1185, + "step": 10446 + }, + { + "epoch": 0.8276490394137453, + "grad_norm": 1.8048450054750969, + "learning_rate": 1.517434684122987e-06, + "loss": 0.1231, + "step": 10447 + }, + { + "epoch": 0.8277282630223807, + "grad_norm": 1.8907254679962828, + "learning_rate": 1.5160760300735911e-06, + "loss": 0.2279, + "step": 10448 + }, + { + "epoch": 0.827807486631016, + "grad_norm": 1.8222072880649218, + "learning_rate": 1.5147179346497665e-06, + "loss": 0.2103, + "step": 10449 + }, + { + "epoch": 0.8278867102396514, + "grad_norm": 2.122667636450811, + "learning_rate": 1.513360397940935e-06, + "loss": 0.295, + "step": 10450 + }, + { + "epoch": 0.8279659338482868, + "grad_norm": 1.7200946348052384, + "learning_rate": 1.5120034200364885e-06, + "loss": 0.2088, + "step": 10451 + }, + { + "epoch": 0.8280451574569222, + "grad_norm": 1.3738326824158502, + "learning_rate": 1.5106470010257758e-06, + "loss": 0.1341, + "step": 10452 + }, + { + "epoch": 0.8281243810655575, + "grad_norm": 1.5027063601514015, + "learning_rate": 1.509291140998107e-06, + "loss": 0.1461, + "step": 10453 + }, + { + "epoch": 0.828203604674193, + "grad_norm": 1.6721475262122434, + "learning_rate": 1.5079358400427635e-06, + "loss": 0.1538, + "step": 10454 + }, + { + "epoch": 0.8282828282828283, + "grad_norm": 1.6774558342530719, + "learning_rate": 1.5065810982489849e-06, + "loss": 0.1601, + "step": 10455 + }, + { + "epoch": 0.8283620518914636, + "grad_norm": 1.6818453145166503, + "learning_rate": 1.5052269157059707e-06, + "loss": 0.1544, + "step": 10456 + }, + { + "epoch": 0.828441275500099, + "grad_norm": 1.3627276408551925, + "learning_rate": 1.503873292502892e-06, + "loss": 0.1322, + "step": 10457 + }, + { + "epoch": 0.8285204991087344, + "grad_norm": 1.411075376802412, + "learning_rate": 1.5025202287288764e-06, + "loss": 0.114, + "step": 10458 + }, + { + "epoch": 0.8285997227173698, + "grad_norm": 1.458350494482667, + "learning_rate": 1.501167724473016e-06, + "loss": 0.1927, + "step": 10459 + }, + { + "epoch": 0.8286789463260051, + "grad_norm": 1.4190803687970546, + "learning_rate": 1.499815779824365e-06, + "loss": 0.1194, + "step": 10460 + }, + { + "epoch": 0.8287581699346406, + "grad_norm": 1.325797862383344, + "learning_rate": 1.4984643948719469e-06, + "loss": 0.1331, + "step": 10461 + }, + { + "epoch": 0.8288373935432759, + "grad_norm": 1.38424853695347, + "learning_rate": 1.4971135697047422e-06, + "loss": 0.1583, + "step": 10462 + }, + { + "epoch": 0.8289166171519112, + "grad_norm": 1.4697523749720716, + "learning_rate": 1.4957633044116925e-06, + "loss": 0.1271, + "step": 10463 + }, + { + "epoch": 0.8289958407605467, + "grad_norm": 1.3988389706591748, + "learning_rate": 1.4944135990817121e-06, + "loss": 0.1512, + "step": 10464 + }, + { + "epoch": 0.829075064369182, + "grad_norm": 1.111534357178801, + "learning_rate": 1.4930644538036709e-06, + "loss": 0.0868, + "step": 10465 + }, + { + "epoch": 0.8291542879778174, + "grad_norm": 1.6657537168219245, + "learning_rate": 1.4917158686663992e-06, + "loss": 0.1753, + "step": 10466 + }, + { + "epoch": 0.8292335115864528, + "grad_norm": 1.2148211390739012, + "learning_rate": 1.490367843758701e-06, + "loss": 0.133, + "step": 10467 + }, + { + "epoch": 0.8293127351950882, + "grad_norm": 1.5329173406558545, + "learning_rate": 1.4890203791693337e-06, + "loss": 0.1753, + "step": 10468 + }, + { + "epoch": 0.8293919588037235, + "grad_norm": 1.382283485781237, + "learning_rate": 1.4876734749870213e-06, + "loss": 0.1399, + "step": 10469 + }, + { + "epoch": 0.8294711824123588, + "grad_norm": 1.4432399496278205, + "learning_rate": 1.4863271313004535e-06, + "loss": 0.1854, + "step": 10470 + }, + { + "epoch": 0.8295504060209943, + "grad_norm": 1.434362292061377, + "learning_rate": 1.4849813481982788e-06, + "loss": 0.1524, + "step": 10471 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 1.293517868604829, + "learning_rate": 1.483636125769108e-06, + "loss": 0.1226, + "step": 10472 + }, + { + "epoch": 0.829708853238265, + "grad_norm": 1.645473271617257, + "learning_rate": 1.482291464101523e-06, + "loss": 0.1745, + "step": 10473 + }, + { + "epoch": 0.8297880768469004, + "grad_norm": 1.6640206062544738, + "learning_rate": 1.480947363284061e-06, + "loss": 0.1559, + "step": 10474 + }, + { + "epoch": 0.8298673004555357, + "grad_norm": 1.827833822706193, + "learning_rate": 1.4796038234052235e-06, + "loss": 0.152, + "step": 10475 + }, + { + "epoch": 0.8299465240641711, + "grad_norm": 1.5847326784336713, + "learning_rate": 1.4782608445534741e-06, + "loss": 0.1833, + "step": 10476 + }, + { + "epoch": 0.8300257476728065, + "grad_norm": 1.3526824269563864, + "learning_rate": 1.4769184268172465e-06, + "loss": 0.1343, + "step": 10477 + }, + { + "epoch": 0.8301049712814419, + "grad_norm": 1.6452907709358355, + "learning_rate": 1.4755765702849311e-06, + "loss": 0.1767, + "step": 10478 + }, + { + "epoch": 0.8301841948900772, + "grad_norm": 1.7567117556498941, + "learning_rate": 1.4742352750448806e-06, + "loss": 0.1671, + "step": 10479 + }, + { + "epoch": 0.8302634184987127, + "grad_norm": 1.7550545175522747, + "learning_rate": 1.4728945411854135e-06, + "loss": 0.1986, + "step": 10480 + }, + { + "epoch": 0.830342642107348, + "grad_norm": 1.6306571338844298, + "learning_rate": 1.4715543687948096e-06, + "loss": 0.168, + "step": 10481 + }, + { + "epoch": 0.8304218657159833, + "grad_norm": 1.255040224432049, + "learning_rate": 1.470214757961317e-06, + "loss": 0.1334, + "step": 10482 + }, + { + "epoch": 0.8305010893246187, + "grad_norm": 1.2978999779415066, + "learning_rate": 1.4688757087731386e-06, + "loss": 0.1459, + "step": 10483 + }, + { + "epoch": 0.8305803129332541, + "grad_norm": 1.7039019014916967, + "learning_rate": 1.4675372213184458e-06, + "loss": 0.166, + "step": 10484 + }, + { + "epoch": 0.8306595365418895, + "grad_norm": 2.6034323665882386, + "learning_rate": 1.4661992956853699e-06, + "loss": 0.1951, + "step": 10485 + }, + { + "epoch": 0.8307387601505248, + "grad_norm": 1.847362845942156, + "learning_rate": 1.4648619319620105e-06, + "loss": 0.2026, + "step": 10486 + }, + { + "epoch": 0.8308179837591603, + "grad_norm": 1.264175584467807, + "learning_rate": 1.463525130236424e-06, + "loss": 0.1149, + "step": 10487 + }, + { + "epoch": 0.8308972073677956, + "grad_norm": 1.3686448353848946, + "learning_rate": 1.4621888905966308e-06, + "loss": 0.1109, + "step": 10488 + }, + { + "epoch": 0.8309764309764309, + "grad_norm": 1.6418515933543478, + "learning_rate": 1.4608532131306198e-06, + "loss": 0.2549, + "step": 10489 + }, + { + "epoch": 0.8310556545850664, + "grad_norm": 2.0124970527044983, + "learning_rate": 1.459518097926337e-06, + "loss": 0.1939, + "step": 10490 + }, + { + "epoch": 0.8311348781937017, + "grad_norm": 1.0405927005080426, + "learning_rate": 1.4581835450716907e-06, + "loss": 0.0819, + "step": 10491 + }, + { + "epoch": 0.8312141018023371, + "grad_norm": 1.3312072031435174, + "learning_rate": 1.4568495546545603e-06, + "loss": 0.1374, + "step": 10492 + }, + { + "epoch": 0.8312933254109725, + "grad_norm": 1.9728663054895477, + "learning_rate": 1.4555161267627793e-06, + "loss": 0.2894, + "step": 10493 + }, + { + "epoch": 0.8313725490196079, + "grad_norm": 1.4866272666577227, + "learning_rate": 1.4541832614841455e-06, + "loss": 0.1363, + "step": 10494 + }, + { + "epoch": 0.8314517726282432, + "grad_norm": 1.529754605298281, + "learning_rate": 1.4528509589064276e-06, + "loss": 0.0868, + "step": 10495 + }, + { + "epoch": 0.8315309962368785, + "grad_norm": 1.650501268851585, + "learning_rate": 1.4515192191173466e-06, + "loss": 0.1642, + "step": 10496 + }, + { + "epoch": 0.831610219845514, + "grad_norm": 1.3909628219421502, + "learning_rate": 1.45018804220459e-06, + "loss": 0.1372, + "step": 10497 + }, + { + "epoch": 0.8316894434541493, + "grad_norm": 1.2581369409862537, + "learning_rate": 1.4488574282558143e-06, + "loss": 0.1133, + "step": 10498 + }, + { + "epoch": 0.8317686670627847, + "grad_norm": 1.433149621609104, + "learning_rate": 1.4475273773586319e-06, + "loss": 0.1487, + "step": 10499 + }, + { + "epoch": 0.8318478906714201, + "grad_norm": 1.30609432038533, + "learning_rate": 1.446197889600619e-06, + "loss": 0.1665, + "step": 10500 + }, + { + "epoch": 0.8319271142800555, + "grad_norm": 1.6653218909747391, + "learning_rate": 1.444868965069315e-06, + "loss": 0.1864, + "step": 10501 + }, + { + "epoch": 0.8320063378886908, + "grad_norm": 1.4163967956303973, + "learning_rate": 1.443540603852227e-06, + "loss": 0.1362, + "step": 10502 + }, + { + "epoch": 0.8320855614973262, + "grad_norm": 1.2849366947885508, + "learning_rate": 1.4422128060368201e-06, + "loss": 0.1072, + "step": 10503 + }, + { + "epoch": 0.8321647851059616, + "grad_norm": 1.9601634183640058, + "learning_rate": 1.4408855717105197e-06, + "loss": 0.216, + "step": 10504 + }, + { + "epoch": 0.8322440087145969, + "grad_norm": 1.4598953868221773, + "learning_rate": 1.4395589009607225e-06, + "loss": 0.1575, + "step": 10505 + }, + { + "epoch": 0.8323232323232324, + "grad_norm": 1.7371467309628827, + "learning_rate": 1.4382327938747808e-06, + "loss": 0.1609, + "step": 10506 + }, + { + "epoch": 0.8324024559318677, + "grad_norm": 1.5589338859986424, + "learning_rate": 1.4369072505400117e-06, + "loss": 0.1262, + "step": 10507 + }, + { + "epoch": 0.8324816795405031, + "grad_norm": 1.5804549002657287, + "learning_rate": 1.4355822710436995e-06, + "loss": 0.135, + "step": 10508 + }, + { + "epoch": 0.8325609031491384, + "grad_norm": 1.3066817922881286, + "learning_rate": 1.4342578554730858e-06, + "loss": 0.1051, + "step": 10509 + }, + { + "epoch": 0.8326401267577738, + "grad_norm": 1.2173387730932184, + "learning_rate": 1.4329340039153738e-06, + "loss": 0.124, + "step": 10510 + }, + { + "epoch": 0.8327193503664092, + "grad_norm": 1.4560198374978264, + "learning_rate": 1.4316107164577376e-06, + "loss": 0.1505, + "step": 10511 + }, + { + "epoch": 0.8327985739750445, + "grad_norm": 1.264701314761224, + "learning_rate": 1.430287993187307e-06, + "loss": 0.1358, + "step": 10512 + }, + { + "epoch": 0.83287779758368, + "grad_norm": 1.5112765519019675, + "learning_rate": 1.4289658341911782e-06, + "loss": 0.1336, + "step": 10513 + }, + { + "epoch": 0.8329570211923153, + "grad_norm": 1.7954708873097363, + "learning_rate": 1.4276442395564049e-06, + "loss": 0.1939, + "step": 10514 + }, + { + "epoch": 0.8330362448009507, + "grad_norm": 1.3780983896068477, + "learning_rate": 1.426323209370014e-06, + "loss": 0.1216, + "step": 10515 + }, + { + "epoch": 0.8331154684095861, + "grad_norm": 2.0236118774975687, + "learning_rate": 1.425002743718985e-06, + "loss": 0.1644, + "step": 10516 + }, + { + "epoch": 0.8331946920182214, + "grad_norm": 1.7012607371020356, + "learning_rate": 1.4236828426902626e-06, + "loss": 0.1808, + "step": 10517 + }, + { + "epoch": 0.8332739156268568, + "grad_norm": 1.1425512918678684, + "learning_rate": 1.4223635063707619e-06, + "loss": 0.1277, + "step": 10518 + }, + { + "epoch": 0.8333531392354921, + "grad_norm": 1.2659950838878717, + "learning_rate": 1.421044734847351e-06, + "loss": 0.1488, + "step": 10519 + }, + { + "epoch": 0.8334323628441276, + "grad_norm": 1.8681973614222098, + "learning_rate": 1.4197265282068618e-06, + "loss": 0.2096, + "step": 10520 + }, + { + "epoch": 0.8335115864527629, + "grad_norm": 2.0668473181305025, + "learning_rate": 1.4184088865360978e-06, + "loss": 0.2198, + "step": 10521 + }, + { + "epoch": 0.8335908100613983, + "grad_norm": 1.8467580530748484, + "learning_rate": 1.4170918099218166e-06, + "loss": 0.1625, + "step": 10522 + }, + { + "epoch": 0.8336700336700337, + "grad_norm": 1.7755054552795317, + "learning_rate": 1.41577529845074e-06, + "loss": 0.2042, + "step": 10523 + }, + { + "epoch": 0.833749257278669, + "grad_norm": 1.5061588117386606, + "learning_rate": 1.4144593522095563e-06, + "loss": 0.1464, + "step": 10524 + }, + { + "epoch": 0.8338284808873044, + "grad_norm": 1.590326443681092, + "learning_rate": 1.4131439712849148e-06, + "loss": 0.1475, + "step": 10525 + }, + { + "epoch": 0.8339077044959398, + "grad_norm": 1.7514985934182123, + "learning_rate": 1.4118291557634223e-06, + "loss": 0.2652, + "step": 10526 + }, + { + "epoch": 0.8339869281045752, + "grad_norm": 2.411398825069949, + "learning_rate": 1.410514905731658e-06, + "loss": 0.2085, + "step": 10527 + }, + { + "epoch": 0.8340661517132105, + "grad_norm": 1.2223871485219073, + "learning_rate": 1.4092012212761574e-06, + "loss": 0.133, + "step": 10528 + }, + { + "epoch": 0.834145375321846, + "grad_norm": 1.0762834006052167, + "learning_rate": 1.4078881024834213e-06, + "loss": 0.0802, + "step": 10529 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 1.3533504551646354, + "learning_rate": 1.406575549439907e-06, + "loss": 0.1095, + "step": 10530 + }, + { + "epoch": 0.8343038225391166, + "grad_norm": 1.5615696122225395, + "learning_rate": 1.4052635622320477e-06, + "loss": 0.1515, + "step": 10531 + }, + { + "epoch": 0.834383046147752, + "grad_norm": 1.651797849652785, + "learning_rate": 1.4039521409462265e-06, + "loss": 0.2142, + "step": 10532 + }, + { + "epoch": 0.8344622697563874, + "grad_norm": 1.5203158419970122, + "learning_rate": 1.4026412856687931e-06, + "loss": 0.1758, + "step": 10533 + }, + { + "epoch": 0.8345414933650228, + "grad_norm": 1.4259578363282222, + "learning_rate": 1.4013309964860667e-06, + "loss": 0.1282, + "step": 10534 + }, + { + "epoch": 0.8346207169736581, + "grad_norm": 1.3212456431686352, + "learning_rate": 1.4000212734843187e-06, + "loss": 0.119, + "step": 10535 + }, + { + "epoch": 0.8346999405822936, + "grad_norm": 1.9600433079024036, + "learning_rate": 1.3987121167497874e-06, + "loss": 0.1707, + "step": 10536 + }, + { + "epoch": 0.8347791641909289, + "grad_norm": 1.2621085532116656, + "learning_rate": 1.3974035263686792e-06, + "loss": 0.1421, + "step": 10537 + }, + { + "epoch": 0.8348583877995642, + "grad_norm": 1.3129518464181507, + "learning_rate": 1.396095502427155e-06, + "loss": 0.0768, + "step": 10538 + }, + { + "epoch": 0.8349376114081997, + "grad_norm": 1.558708814936815, + "learning_rate": 1.3947880450113404e-06, + "loss": 0.1544, + "step": 10539 + }, + { + "epoch": 0.835016835016835, + "grad_norm": 1.462600127578735, + "learning_rate": 1.39348115420733e-06, + "loss": 0.1276, + "step": 10540 + }, + { + "epoch": 0.8350960586254704, + "grad_norm": 1.598430391294099, + "learning_rate": 1.392174830101174e-06, + "loss": 0.1136, + "step": 10541 + }, + { + "epoch": 0.8351752822341058, + "grad_norm": 1.2325142503958595, + "learning_rate": 1.3908690727788842e-06, + "loss": 0.0815, + "step": 10542 + }, + { + "epoch": 0.8352545058427412, + "grad_norm": 1.4081913359628437, + "learning_rate": 1.3895638823264447e-06, + "loss": 0.1615, + "step": 10543 + }, + { + "epoch": 0.8353337294513765, + "grad_norm": 1.978628812507835, + "learning_rate": 1.3882592588297917e-06, + "loss": 0.1672, + "step": 10544 + }, + { + "epoch": 0.8354129530600118, + "grad_norm": 1.522327406025572, + "learning_rate": 1.38695520237483e-06, + "loss": 0.1482, + "step": 10545 + }, + { + "epoch": 0.8354921766686473, + "grad_norm": 1.4364688419079543, + "learning_rate": 1.3856517130474235e-06, + "loss": 0.1688, + "step": 10546 + }, + { + "epoch": 0.8355714002772826, + "grad_norm": 1.5535409529760795, + "learning_rate": 1.384348790933403e-06, + "loss": 0.2004, + "step": 10547 + }, + { + "epoch": 0.835650623885918, + "grad_norm": 1.3479364472814601, + "learning_rate": 1.3830464361185592e-06, + "loss": 0.1021, + "step": 10548 + }, + { + "epoch": 0.8357298474945534, + "grad_norm": 1.474193020992869, + "learning_rate": 1.3817446486886433e-06, + "loss": 0.119, + "step": 10549 + }, + { + "epoch": 0.8358090711031888, + "grad_norm": 1.288308881807844, + "learning_rate": 1.3804434287293756e-06, + "loss": 0.1054, + "step": 10550 + }, + { + "epoch": 0.8358882947118241, + "grad_norm": 1.8507964553654104, + "learning_rate": 1.3791427763264342e-06, + "loss": 0.1497, + "step": 10551 + }, + { + "epoch": 0.8359675183204595, + "grad_norm": 1.9102776357586515, + "learning_rate": 1.3778426915654575e-06, + "loss": 0.2033, + "step": 10552 + }, + { + "epoch": 0.8360467419290949, + "grad_norm": 1.8215271961125292, + "learning_rate": 1.3765431745320546e-06, + "loss": 0.1428, + "step": 10553 + }, + { + "epoch": 0.8361259655377302, + "grad_norm": 1.6318011769839191, + "learning_rate": 1.3752442253117903e-06, + "loss": 0.2014, + "step": 10554 + }, + { + "epoch": 0.8362051891463657, + "grad_norm": 1.3051396480445032, + "learning_rate": 1.373945843990192e-06, + "loss": 0.1414, + "step": 10555 + }, + { + "epoch": 0.836284412755001, + "grad_norm": 1.5482829512802534, + "learning_rate": 1.3726480306527578e-06, + "loss": 0.1416, + "step": 10556 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 1.5493978244151865, + "learning_rate": 1.3713507853849373e-06, + "loss": 0.1638, + "step": 10557 + }, + { + "epoch": 0.8364428599722717, + "grad_norm": 1.2104152350452513, + "learning_rate": 1.3700541082721464e-06, + "loss": 0.1032, + "step": 10558 + }, + { + "epoch": 0.8365220835809071, + "grad_norm": 1.6917040484946821, + "learning_rate": 1.3687579993997703e-06, + "loss": 0.1815, + "step": 10559 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 1.4653591812220614, + "learning_rate": 1.3674624588531481e-06, + "loss": 0.1365, + "step": 10560 + }, + { + "epoch": 0.8366805307981778, + "grad_norm": 1.5928506207818265, + "learning_rate": 1.3661674867175844e-06, + "loss": 0.1413, + "step": 10561 + }, + { + "epoch": 0.8367597544068133, + "grad_norm": 1.3836968799451774, + "learning_rate": 1.3648730830783507e-06, + "loss": 0.1442, + "step": 10562 + }, + { + "epoch": 0.8368389780154486, + "grad_norm": 1.336818670425734, + "learning_rate": 1.3635792480206744e-06, + "loss": 0.1299, + "step": 10563 + }, + { + "epoch": 0.8369182016240839, + "grad_norm": 1.6592427137990626, + "learning_rate": 1.3622859816297473e-06, + "loss": 0.1707, + "step": 10564 + }, + { + "epoch": 0.8369974252327194, + "grad_norm": 1.317400189646492, + "learning_rate": 1.3609932839907281e-06, + "loss": 0.1407, + "step": 10565 + }, + { + "epoch": 0.8370766488413547, + "grad_norm": 1.5954592676631707, + "learning_rate": 1.3597011551887329e-06, + "loss": 0.1634, + "step": 10566 + }, + { + "epoch": 0.8371558724499901, + "grad_norm": 1.4345800345204196, + "learning_rate": 1.3584095953088405e-06, + "loss": 0.139, + "step": 10567 + }, + { + "epoch": 0.8372350960586254, + "grad_norm": 1.0091877112973309, + "learning_rate": 1.3571186044360973e-06, + "loss": 0.0891, + "step": 10568 + }, + { + "epoch": 0.8373143196672609, + "grad_norm": 1.449045984177576, + "learning_rate": 1.3558281826555065e-06, + "loss": 0.1878, + "step": 10569 + }, + { + "epoch": 0.8373935432758962, + "grad_norm": 2.6708432801095783, + "learning_rate": 1.3545383300520375e-06, + "loss": 0.2549, + "step": 10570 + }, + { + "epoch": 0.8374727668845315, + "grad_norm": 1.6230606633414757, + "learning_rate": 1.3532490467106186e-06, + "loss": 0.1997, + "step": 10571 + }, + { + "epoch": 0.837551990493167, + "grad_norm": 1.7002180598197696, + "learning_rate": 1.3519603327161456e-06, + "loss": 0.2332, + "step": 10572 + }, + { + "epoch": 0.8376312141018023, + "grad_norm": 1.5496895141462625, + "learning_rate": 1.3506721881534734e-06, + "loss": 0.1076, + "step": 10573 + }, + { + "epoch": 0.8377104377104377, + "grad_norm": 1.6682904891176977, + "learning_rate": 1.3493846131074173e-06, + "loss": 0.0915, + "step": 10574 + }, + { + "epoch": 0.8377896613190731, + "grad_norm": 1.7975817576352944, + "learning_rate": 1.3480976076627617e-06, + "loss": 0.2087, + "step": 10575 + }, + { + "epoch": 0.8378688849277085, + "grad_norm": 1.6679796344416764, + "learning_rate": 1.3468111719042497e-06, + "loss": 0.2132, + "step": 10576 + }, + { + "epoch": 0.8379481085363438, + "grad_norm": 1.2110061635641807, + "learning_rate": 1.345525305916583e-06, + "loss": 0.0658, + "step": 10577 + }, + { + "epoch": 0.8380273321449792, + "grad_norm": 1.4521348615640655, + "learning_rate": 1.3442400097844344e-06, + "loss": 0.1209, + "step": 10578 + }, + { + "epoch": 0.8381065557536146, + "grad_norm": 1.4251480823392397, + "learning_rate": 1.342955283592432e-06, + "loss": 0.1274, + "step": 10579 + }, + { + "epoch": 0.8381857793622499, + "grad_norm": 1.563206602406747, + "learning_rate": 1.3416711274251671e-06, + "loss": 0.19, + "step": 10580 + }, + { + "epoch": 0.8382650029708854, + "grad_norm": 1.7772029117015589, + "learning_rate": 1.3403875413671997e-06, + "loss": 0.1569, + "step": 10581 + }, + { + "epoch": 0.8383442265795207, + "grad_norm": 2.0137756255469603, + "learning_rate": 1.3391045255030444e-06, + "loss": 0.1786, + "step": 10582 + }, + { + "epoch": 0.8384234501881561, + "grad_norm": 1.2991353238836698, + "learning_rate": 1.3378220799171815e-06, + "loss": 0.1282, + "step": 10583 + }, + { + "epoch": 0.8385026737967914, + "grad_norm": 1.6763781060671676, + "learning_rate": 1.3365402046940569e-06, + "loss": 0.2153, + "step": 10584 + }, + { + "epoch": 0.8385818974054268, + "grad_norm": 1.5906710242051936, + "learning_rate": 1.3352588999180726e-06, + "loss": 0.1684, + "step": 10585 + }, + { + "epoch": 0.8386611210140622, + "grad_norm": 1.5146384858409359, + "learning_rate": 1.3339781656735995e-06, + "loss": 0.1582, + "step": 10586 + }, + { + "epoch": 0.8387403446226975, + "grad_norm": 1.5384001888776324, + "learning_rate": 1.3326980020449621e-06, + "loss": 0.1332, + "step": 10587 + }, + { + "epoch": 0.838819568231333, + "grad_norm": 1.8285184477364784, + "learning_rate": 1.3314184091164605e-06, + "loss": 0.2016, + "step": 10588 + }, + { + "epoch": 0.8388987918399683, + "grad_norm": 1.9822735091743064, + "learning_rate": 1.3301393869723457e-06, + "loss": 0.1729, + "step": 10589 + }, + { + "epoch": 0.8389780154486037, + "grad_norm": 1.9670849238208123, + "learning_rate": 1.328860935696833e-06, + "loss": 0.1524, + "step": 10590 + }, + { + "epoch": 0.8390572390572391, + "grad_norm": 1.3845555195916062, + "learning_rate": 1.3275830553741066e-06, + "loss": 0.1004, + "step": 10591 + }, + { + "epoch": 0.8391364626658744, + "grad_norm": 1.5513397752911098, + "learning_rate": 1.3263057460883078e-06, + "loss": 0.0931, + "step": 10592 + }, + { + "epoch": 0.8392156862745098, + "grad_norm": 1.364158766351369, + "learning_rate": 1.3250290079235383e-06, + "loss": 0.1636, + "step": 10593 + }, + { + "epoch": 0.8392949098831451, + "grad_norm": 1.5542462651824454, + "learning_rate": 1.3237528409638688e-06, + "loss": 0.1314, + "step": 10594 + }, + { + "epoch": 0.8393741334917806, + "grad_norm": 1.8074584377236935, + "learning_rate": 1.3224772452933277e-06, + "loss": 0.2128, + "step": 10595 + }, + { + "epoch": 0.8394533571004159, + "grad_norm": 1.3108518698049427, + "learning_rate": 1.321202220995904e-06, + "loss": 0.1643, + "step": 10596 + }, + { + "epoch": 0.8395325807090513, + "grad_norm": 1.4929573229698696, + "learning_rate": 1.3199277681555578e-06, + "loss": 0.1231, + "step": 10597 + }, + { + "epoch": 0.8396118043176867, + "grad_norm": 1.7200546763013393, + "learning_rate": 1.3186538868562004e-06, + "loss": 0.1619, + "step": 10598 + }, + { + "epoch": 0.839691027926322, + "grad_norm": 1.434429725089167, + "learning_rate": 1.3173805771817138e-06, + "loss": 0.0887, + "step": 10599 + }, + { + "epoch": 0.8397702515349574, + "grad_norm": 1.700529937298641, + "learning_rate": 1.3161078392159355e-06, + "loss": 0.1974, + "step": 10600 + }, + { + "epoch": 0.8398494751435928, + "grad_norm": 1.322446418537356, + "learning_rate": 1.3148356730426737e-06, + "loss": 0.0995, + "step": 10601 + }, + { + "epoch": 0.8399286987522282, + "grad_norm": 1.4898986168307924, + "learning_rate": 1.3135640787456926e-06, + "loss": 0.172, + "step": 10602 + }, + { + "epoch": 0.8400079223608635, + "grad_norm": 1.659367245436722, + "learning_rate": 1.312293056408719e-06, + "loss": 0.1718, + "step": 10603 + }, + { + "epoch": 0.840087145969499, + "grad_norm": 1.6175794090539586, + "learning_rate": 1.3110226061154462e-06, + "loss": 0.1763, + "step": 10604 + }, + { + "epoch": 0.8401663695781343, + "grad_norm": 1.1190365769410344, + "learning_rate": 1.309752727949527e-06, + "loss": 0.0736, + "step": 10605 + }, + { + "epoch": 0.8402455931867696, + "grad_norm": 1.2906239326450222, + "learning_rate": 1.3084834219945731e-06, + "loss": 0.1134, + "step": 10606 + }, + { + "epoch": 0.840324816795405, + "grad_norm": 1.4438588750192283, + "learning_rate": 1.3072146883341675e-06, + "loss": 0.152, + "step": 10607 + }, + { + "epoch": 0.8404040404040404, + "grad_norm": 1.5742421031576537, + "learning_rate": 1.3059465270518469e-06, + "loss": 0.1367, + "step": 10608 + }, + { + "epoch": 0.8404832640126758, + "grad_norm": 1.6221856259207226, + "learning_rate": 1.3046789382311132e-06, + "loss": 0.1193, + "step": 10609 + }, + { + "epoch": 0.8405624876213111, + "grad_norm": 1.6041009773109047, + "learning_rate": 1.3034119219554341e-06, + "loss": 0.1351, + "step": 10610 + }, + { + "epoch": 0.8406417112299466, + "grad_norm": 1.092911996819517, + "learning_rate": 1.3021454783082344e-06, + "loss": 0.0845, + "step": 10611 + }, + { + "epoch": 0.8407209348385819, + "grad_norm": 1.4935547009489953, + "learning_rate": 1.3008796073729013e-06, + "loss": 0.2057, + "step": 10612 + }, + { + "epoch": 0.8408001584472172, + "grad_norm": 1.3958593359229354, + "learning_rate": 1.2996143092327906e-06, + "loss": 0.1326, + "step": 10613 + }, + { + "epoch": 0.8408793820558527, + "grad_norm": 2.121863250466487, + "learning_rate": 1.2983495839712146e-06, + "loss": 0.2957, + "step": 10614 + }, + { + "epoch": 0.840958605664488, + "grad_norm": 1.853954069617716, + "learning_rate": 1.2970854316714477e-06, + "loss": 0.1857, + "step": 10615 + }, + { + "epoch": 0.8410378292731234, + "grad_norm": 1.5661612646522565, + "learning_rate": 1.2958218524167288e-06, + "loss": 0.1331, + "step": 10616 + }, + { + "epoch": 0.8411170528817588, + "grad_norm": 1.6990134783980666, + "learning_rate": 1.2945588462902603e-06, + "loss": 0.1615, + "step": 10617 + }, + { + "epoch": 0.8411962764903942, + "grad_norm": 1.44080534395435, + "learning_rate": 1.2932964133752036e-06, + "loss": 0.1542, + "step": 10618 + }, + { + "epoch": 0.8412755000990295, + "grad_norm": 1.4644739533638742, + "learning_rate": 1.292034553754683e-06, + "loss": 0.1158, + "step": 10619 + }, + { + "epoch": 0.8413547237076648, + "grad_norm": 1.7982692887055565, + "learning_rate": 1.2907732675117878e-06, + "loss": 0.1912, + "step": 10620 + }, + { + "epoch": 0.8414339473163003, + "grad_norm": 1.5005404675239695, + "learning_rate": 1.2895125547295672e-06, + "loss": 0.1224, + "step": 10621 + }, + { + "epoch": 0.8415131709249356, + "grad_norm": 1.4940693568453378, + "learning_rate": 1.2882524154910314e-06, + "loss": 0.1688, + "step": 10622 + }, + { + "epoch": 0.841592394533571, + "grad_norm": 1.3976813593887059, + "learning_rate": 1.2869928498791572e-06, + "loss": 0.1359, + "step": 10623 + }, + { + "epoch": 0.8416716181422064, + "grad_norm": 1.22190997864856, + "learning_rate": 1.2857338579768796e-06, + "loss": 0.1067, + "step": 10624 + }, + { + "epoch": 0.8417508417508418, + "grad_norm": 1.720875838845788, + "learning_rate": 1.2844754398670954e-06, + "loss": 0.1808, + "step": 10625 + }, + { + "epoch": 0.8418300653594771, + "grad_norm": 1.7900478640331172, + "learning_rate": 1.2832175956326686e-06, + "loss": 0.1225, + "step": 10626 + }, + { + "epoch": 0.8419092889681125, + "grad_norm": 1.6646253019478188, + "learning_rate": 1.2819603253564206e-06, + "loss": 0.177, + "step": 10627 + }, + { + "epoch": 0.8419885125767479, + "grad_norm": 1.6835433134538447, + "learning_rate": 1.280703629121135e-06, + "loss": 0.172, + "step": 10628 + }, + { + "epoch": 0.8420677361853832, + "grad_norm": 1.74062051178521, + "learning_rate": 1.2794475070095624e-06, + "loss": 0.199, + "step": 10629 + }, + { + "epoch": 0.8421469597940187, + "grad_norm": 2.315707132980495, + "learning_rate": 1.2781919591044113e-06, + "loss": 0.2665, + "step": 10630 + }, + { + "epoch": 0.842226183402654, + "grad_norm": 1.580855381918244, + "learning_rate": 1.2769369854883528e-06, + "loss": 0.1344, + "step": 10631 + }, + { + "epoch": 0.8423054070112893, + "grad_norm": 1.5580590840778559, + "learning_rate": 1.2756825862440192e-06, + "loss": 0.146, + "step": 10632 + }, + { + "epoch": 0.8423846306199247, + "grad_norm": 1.3833071910128458, + "learning_rate": 1.2744287614540108e-06, + "loss": 0.1439, + "step": 10633 + }, + { + "epoch": 0.8424638542285601, + "grad_norm": 1.467236894245345, + "learning_rate": 1.2731755112008838e-06, + "loss": 0.1439, + "step": 10634 + }, + { + "epoch": 0.8425430778371955, + "grad_norm": 1.2835347226032316, + "learning_rate": 1.2719228355671576e-06, + "loss": 0.1028, + "step": 10635 + }, + { + "epoch": 0.8426223014458308, + "grad_norm": 1.2814085411095317, + "learning_rate": 1.2706707346353165e-06, + "loss": 0.0918, + "step": 10636 + }, + { + "epoch": 0.8427015250544663, + "grad_norm": 1.213247735202157, + "learning_rate": 1.2694192084878032e-06, + "loss": 0.1229, + "step": 10637 + }, + { + "epoch": 0.8427807486631016, + "grad_norm": 1.6758196082021466, + "learning_rate": 1.2681682572070275e-06, + "loss": 0.1772, + "step": 10638 + }, + { + "epoch": 0.8428599722717369, + "grad_norm": 1.4036752028238526, + "learning_rate": 1.2669178808753568e-06, + "loss": 0.1423, + "step": 10639 + }, + { + "epoch": 0.8429391958803724, + "grad_norm": 1.803465808092052, + "learning_rate": 1.265668079575124e-06, + "loss": 0.1929, + "step": 10640 + }, + { + "epoch": 0.8430184194890077, + "grad_norm": 1.3030318664980867, + "learning_rate": 1.264418853388618e-06, + "loss": 0.1173, + "step": 10641 + }, + { + "epoch": 0.8430976430976431, + "grad_norm": 1.3591640343531863, + "learning_rate": 1.2631702023980997e-06, + "loss": 0.0885, + "step": 10642 + }, + { + "epoch": 0.8431768667062784, + "grad_norm": 1.2844208331143927, + "learning_rate": 1.2619221266857851e-06, + "loss": 0.0997, + "step": 10643 + }, + { + "epoch": 0.8432560903149139, + "grad_norm": 1.473859767773709, + "learning_rate": 1.260674626333851e-06, + "loss": 0.1358, + "step": 10644 + }, + { + "epoch": 0.8433353139235492, + "grad_norm": 1.5754037872076125, + "learning_rate": 1.259427701424445e-06, + "loss": 0.2007, + "step": 10645 + }, + { + "epoch": 0.8434145375321845, + "grad_norm": 1.330050597190308, + "learning_rate": 1.2581813520396668e-06, + "loss": 0.1124, + "step": 10646 + }, + { + "epoch": 0.84349376114082, + "grad_norm": 1.4263162770836706, + "learning_rate": 1.256935578261581e-06, + "loss": 0.1177, + "step": 10647 + }, + { + "epoch": 0.8435729847494553, + "grad_norm": 3.0772168883343913, + "learning_rate": 1.255690380172222e-06, + "loss": 0.1519, + "step": 10648 + }, + { + "epoch": 0.8436522083580907, + "grad_norm": 1.831858704793479, + "learning_rate": 1.2544457578535764e-06, + "loss": 0.2082, + "step": 10649 + }, + { + "epoch": 0.8437314319667261, + "grad_norm": 1.3041486853598445, + "learning_rate": 1.253201711387594e-06, + "loss": 0.104, + "step": 10650 + }, + { + "epoch": 0.8438106555753615, + "grad_norm": 1.408366720003487, + "learning_rate": 1.2519582408561936e-06, + "loss": 0.1484, + "step": 10651 + }, + { + "epoch": 0.8438898791839968, + "grad_norm": 1.3085834716889044, + "learning_rate": 1.2507153463412513e-06, + "loss": 0.099, + "step": 10652 + }, + { + "epoch": 0.8439691027926322, + "grad_norm": 1.3676222409141765, + "learning_rate": 1.2494730279246014e-06, + "loss": 0.1312, + "step": 10653 + }, + { + "epoch": 0.8440483264012676, + "grad_norm": 1.4013296168784446, + "learning_rate": 1.2482312856880506e-06, + "loss": 0.1051, + "step": 10654 + }, + { + "epoch": 0.8441275500099029, + "grad_norm": 1.4668953876345308, + "learning_rate": 1.2469901197133582e-06, + "loss": 0.2093, + "step": 10655 + }, + { + "epoch": 0.8442067736185384, + "grad_norm": 1.5331215415510298, + "learning_rate": 1.2457495300822497e-06, + "loss": 0.1576, + "step": 10656 + }, + { + "epoch": 0.8442859972271737, + "grad_norm": 1.348251884135471, + "learning_rate": 1.244509516876411e-06, + "loss": 0.0886, + "step": 10657 + }, + { + "epoch": 0.8443652208358091, + "grad_norm": 1.658490860092014, + "learning_rate": 1.2432700801774923e-06, + "loss": 0.2101, + "step": 10658 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 1.897794198619597, + "learning_rate": 1.2420312200671048e-06, + "loss": 0.1657, + "step": 10659 + }, + { + "epoch": 0.8445236680530798, + "grad_norm": 1.855774330869629, + "learning_rate": 1.240792936626819e-06, + "loss": 0.199, + "step": 10660 + }, + { + "epoch": 0.8446028916617152, + "grad_norm": 1.4691100723579162, + "learning_rate": 1.2395552299381742e-06, + "loss": 0.1506, + "step": 10661 + }, + { + "epoch": 0.8446821152703505, + "grad_norm": 1.0930413770737535, + "learning_rate": 1.238318100082664e-06, + "loss": 0.0826, + "step": 10662 + }, + { + "epoch": 0.844761338878986, + "grad_norm": 1.7966270129389204, + "learning_rate": 1.2370815471417464e-06, + "loss": 0.2342, + "step": 10663 + }, + { + "epoch": 0.8448405624876213, + "grad_norm": 1.2941394872342418, + "learning_rate": 1.2358455711968463e-06, + "loss": 0.1273, + "step": 10664 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 1.6333518181697004, + "learning_rate": 1.2346101723293457e-06, + "loss": 0.1538, + "step": 10665 + }, + { + "epoch": 0.8449990097048921, + "grad_norm": 1.317741960859693, + "learning_rate": 1.233375350620587e-06, + "loss": 0.1282, + "step": 10666 + }, + { + "epoch": 0.8450782333135274, + "grad_norm": 1.7954736746770787, + "learning_rate": 1.2321411061518807e-06, + "loss": 0.1446, + "step": 10667 + }, + { + "epoch": 0.8451574569221628, + "grad_norm": 2.059515867163736, + "learning_rate": 1.2309074390044939e-06, + "loss": 0.1657, + "step": 10668 + }, + { + "epoch": 0.8452366805307981, + "grad_norm": 1.2631296994543926, + "learning_rate": 1.2296743492596587e-06, + "loss": 0.1166, + "step": 10669 + }, + { + "epoch": 0.8453159041394336, + "grad_norm": 1.626095521050016, + "learning_rate": 1.2284418369985651e-06, + "loss": 0.182, + "step": 10670 + }, + { + "epoch": 0.8453951277480689, + "grad_norm": 1.3158982902668888, + "learning_rate": 1.227209902302372e-06, + "loss": 0.1154, + "step": 10671 + }, + { + "epoch": 0.8454743513567043, + "grad_norm": 1.3754095669626523, + "learning_rate": 1.2259785452521956e-06, + "loss": 0.1439, + "step": 10672 + }, + { + "epoch": 0.8455535749653397, + "grad_norm": 1.4965944818283856, + "learning_rate": 1.2247477659291118e-06, + "loss": 0.1655, + "step": 10673 + }, + { + "epoch": 0.845632798573975, + "grad_norm": 1.3557962784722621, + "learning_rate": 1.223517564414166e-06, + "loss": 0.0949, + "step": 10674 + }, + { + "epoch": 0.8457120221826104, + "grad_norm": 1.543222769771887, + "learning_rate": 1.2222879407883592e-06, + "loss": 0.1262, + "step": 10675 + }, + { + "epoch": 0.8457912457912458, + "grad_norm": 1.5546244580120567, + "learning_rate": 1.2210588951326542e-06, + "loss": 0.1899, + "step": 10676 + }, + { + "epoch": 0.8458704693998812, + "grad_norm": 1.885352706403217, + "learning_rate": 1.2198304275279805e-06, + "loss": 0.1914, + "step": 10677 + }, + { + "epoch": 0.8459496930085165, + "grad_norm": 2.0336636813408924, + "learning_rate": 1.2186025380552259e-06, + "loss": 0.2013, + "step": 10678 + }, + { + "epoch": 0.846028916617152, + "grad_norm": 1.861662581942874, + "learning_rate": 1.2173752267952376e-06, + "loss": 0.2344, + "step": 10679 + }, + { + "epoch": 0.8461081402257873, + "grad_norm": 1.5795910758171352, + "learning_rate": 1.2161484938288348e-06, + "loss": 0.1673, + "step": 10680 + }, + { + "epoch": 0.8461873638344226, + "grad_norm": 1.5839058102081605, + "learning_rate": 1.214922339236788e-06, + "loss": 0.1903, + "step": 10681 + }, + { + "epoch": 0.846266587443058, + "grad_norm": 1.399027769939382, + "learning_rate": 1.213696763099832e-06, + "loss": 0.1301, + "step": 10682 + }, + { + "epoch": 0.8463458110516934, + "grad_norm": 1.348424042624097, + "learning_rate": 1.2124717654986695e-06, + "loss": 0.1279, + "step": 10683 + }, + { + "epoch": 0.8464250346603288, + "grad_norm": 1.9523961313473757, + "learning_rate": 1.2112473465139586e-06, + "loss": 0.2408, + "step": 10684 + }, + { + "epoch": 0.8465042582689641, + "grad_norm": 1.9523735953415242, + "learning_rate": 1.210023506226321e-06, + "loss": 0.2004, + "step": 10685 + }, + { + "epoch": 0.8465834818775996, + "grad_norm": 1.5478258924179662, + "learning_rate": 1.2088002447163383e-06, + "loss": 0.1649, + "step": 10686 + }, + { + "epoch": 0.8466627054862349, + "grad_norm": 1.520408475955764, + "learning_rate": 1.2075775620645613e-06, + "loss": 0.1288, + "step": 10687 + }, + { + "epoch": 0.8467419290948702, + "grad_norm": 1.6425088673968629, + "learning_rate": 1.2063554583514947e-06, + "loss": 0.1825, + "step": 10688 + }, + { + "epoch": 0.8468211527035057, + "grad_norm": 1.3383072044081166, + "learning_rate": 1.2051339336576074e-06, + "loss": 0.1113, + "step": 10689 + }, + { + "epoch": 0.846900376312141, + "grad_norm": 1.409507248217889, + "learning_rate": 1.203912988063335e-06, + "loss": 0.1407, + "step": 10690 + }, + { + "epoch": 0.8469795999207764, + "grad_norm": 1.6913524402417588, + "learning_rate": 1.2026926216490675e-06, + "loss": 0.2201, + "step": 10691 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 1.7662384992939641, + "learning_rate": 1.2014728344951587e-06, + "loss": 0.1677, + "step": 10692 + }, + { + "epoch": 0.8471380471380472, + "grad_norm": 1.3816241585909814, + "learning_rate": 1.2002536266819309e-06, + "loss": 0.1349, + "step": 10693 + }, + { + "epoch": 0.8472172707466825, + "grad_norm": 2.0017394082175546, + "learning_rate": 1.1990349982896598e-06, + "loss": 0.2601, + "step": 10694 + }, + { + "epoch": 0.8472964943553178, + "grad_norm": 1.4062860135652977, + "learning_rate": 1.1978169493985836e-06, + "loss": 0.1666, + "step": 10695 + }, + { + "epoch": 0.8473757179639533, + "grad_norm": 1.466595131423128, + "learning_rate": 1.1965994800889113e-06, + "loss": 0.1551, + "step": 10696 + }, + { + "epoch": 0.8474549415725886, + "grad_norm": 1.5917681099215155, + "learning_rate": 1.1953825904408033e-06, + "loss": 0.157, + "step": 10697 + }, + { + "epoch": 0.847534165181224, + "grad_norm": 1.6968631714051232, + "learning_rate": 1.1941662805343846e-06, + "loss": 0.1566, + "step": 10698 + }, + { + "epoch": 0.8476133887898594, + "grad_norm": 1.5197051484697242, + "learning_rate": 1.1929505504497464e-06, + "loss": 0.1716, + "step": 10699 + }, + { + "epoch": 0.8476926123984948, + "grad_norm": 1.860364215240847, + "learning_rate": 1.191735400266939e-06, + "loss": 0.1929, + "step": 10700 + }, + { + "epoch": 0.8477718360071301, + "grad_norm": 1.6662502298455804, + "learning_rate": 1.190520830065972e-06, + "loss": 0.1519, + "step": 10701 + }, + { + "epoch": 0.8478510596157655, + "grad_norm": 1.029289426591469, + "learning_rate": 1.189306839926818e-06, + "loss": 0.0955, + "step": 10702 + }, + { + "epoch": 0.8479302832244009, + "grad_norm": 1.3631240466217436, + "learning_rate": 1.1880934299294167e-06, + "loss": 0.1308, + "step": 10703 + }, + { + "epoch": 0.8480095068330362, + "grad_norm": 1.6788271571037947, + "learning_rate": 1.1868806001536625e-06, + "loss": 0.1381, + "step": 10704 + }, + { + "epoch": 0.8480887304416717, + "grad_norm": 1.746864661650298, + "learning_rate": 1.185668350679413e-06, + "loss": 0.1065, + "step": 10705 + }, + { + "epoch": 0.848167954050307, + "grad_norm": 1.5660316279931281, + "learning_rate": 1.1844566815864921e-06, + "loss": 0.1403, + "step": 10706 + }, + { + "epoch": 0.8482471776589424, + "grad_norm": 1.302487619894792, + "learning_rate": 1.1832455929546827e-06, + "loss": 0.0911, + "step": 10707 + }, + { + "epoch": 0.8483264012675777, + "grad_norm": 1.662286876416712, + "learning_rate": 1.182035084863724e-06, + "loss": 0.145, + "step": 10708 + }, + { + "epoch": 0.8484056248762131, + "grad_norm": 1.5309196113711163, + "learning_rate": 1.1808251573933272e-06, + "loss": 0.1571, + "step": 10709 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 1.3450744293873835, + "learning_rate": 1.1796158106231603e-06, + "loss": 0.1353, + "step": 10710 + }, + { + "epoch": 0.8485640720934838, + "grad_norm": 1.380369853613987, + "learning_rate": 1.1784070446328477e-06, + "loss": 0.0908, + "step": 10711 + }, + { + "epoch": 0.8486432957021193, + "grad_norm": 1.6753488480979988, + "learning_rate": 1.177198859501989e-06, + "loss": 0.1583, + "step": 10712 + }, + { + "epoch": 0.8487225193107546, + "grad_norm": 1.5865528090853542, + "learning_rate": 1.1759912553101316e-06, + "loss": 0.2, + "step": 10713 + }, + { + "epoch": 0.8488017429193899, + "grad_norm": 1.1312338996565505, + "learning_rate": 1.1747842321367886e-06, + "loss": 0.1251, + "step": 10714 + }, + { + "epoch": 0.8488809665280254, + "grad_norm": 1.3844549861810165, + "learning_rate": 1.173577790061442e-06, + "loss": 0.1447, + "step": 10715 + }, + { + "epoch": 0.8489601901366607, + "grad_norm": 1.6629610526972813, + "learning_rate": 1.1723719291635272e-06, + "loss": 0.1592, + "step": 10716 + }, + { + "epoch": 0.8490394137452961, + "grad_norm": 1.899786902491622, + "learning_rate": 1.171166649522444e-06, + "loss": 0.2097, + "step": 10717 + }, + { + "epoch": 0.8491186373539314, + "grad_norm": 1.4779685483111693, + "learning_rate": 1.1699619512175563e-06, + "loss": 0.155, + "step": 10718 + }, + { + "epoch": 0.8491978609625669, + "grad_norm": 1.5631792328393774, + "learning_rate": 1.168757834328188e-06, + "loss": 0.1201, + "step": 10719 + }, + { + "epoch": 0.8492770845712022, + "grad_norm": 2.621245318854018, + "learning_rate": 1.1675542989336208e-06, + "loss": 0.1776, + "step": 10720 + }, + { + "epoch": 0.8493563081798375, + "grad_norm": 1.6246803734351898, + "learning_rate": 1.1663513451131047e-06, + "loss": 0.1479, + "step": 10721 + }, + { + "epoch": 0.849435531788473, + "grad_norm": 1.3830803311759983, + "learning_rate": 1.1651489729458487e-06, + "loss": 0.0968, + "step": 10722 + }, + { + "epoch": 0.8495147553971083, + "grad_norm": 1.4377400573625967, + "learning_rate": 1.1639471825110205e-06, + "loss": 0.123, + "step": 10723 + }, + { + "epoch": 0.8495939790057437, + "grad_norm": 1.5542109076718642, + "learning_rate": 1.1627459738877557e-06, + "loss": 0.1659, + "step": 10724 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 1.6332324553844815, + "learning_rate": 1.1615453471551462e-06, + "loss": 0.1268, + "step": 10725 + }, + { + "epoch": 0.8497524262230145, + "grad_norm": 1.2870130611851327, + "learning_rate": 1.1603453023922473e-06, + "loss": 0.0965, + "step": 10726 + }, + { + "epoch": 0.8498316498316498, + "grad_norm": 1.9095991502783456, + "learning_rate": 1.1591458396780753e-06, + "loss": 0.251, + "step": 10727 + }, + { + "epoch": 0.8499108734402852, + "grad_norm": 1.396403365650525, + "learning_rate": 1.1579469590916125e-06, + "loss": 0.17, + "step": 10728 + }, + { + "epoch": 0.8499900970489206, + "grad_norm": 1.782881100550267, + "learning_rate": 1.156748660711796e-06, + "loss": 0.1983, + "step": 10729 + }, + { + "epoch": 0.8500693206575559, + "grad_norm": 2.0620842280042146, + "learning_rate": 1.1555509446175284e-06, + "loss": 0.2231, + "step": 10730 + }, + { + "epoch": 0.8501485442661914, + "grad_norm": 2.0084242249029116, + "learning_rate": 1.1543538108876751e-06, + "loss": 0.2524, + "step": 10731 + }, + { + "epoch": 0.8502277678748267, + "grad_norm": 1.6925261180509055, + "learning_rate": 1.153157259601062e-06, + "loss": 0.1565, + "step": 10732 + }, + { + "epoch": 0.8503069914834621, + "grad_norm": 1.5909196224828253, + "learning_rate": 1.1519612908364718e-06, + "loss": 0.1663, + "step": 10733 + }, + { + "epoch": 0.8503862150920974, + "grad_norm": 1.2773661543164585, + "learning_rate": 1.1507659046726605e-06, + "loss": 0.1069, + "step": 10734 + }, + { + "epoch": 0.8504654387007328, + "grad_norm": 1.7757580336150474, + "learning_rate": 1.1495711011883325e-06, + "loss": 0.1587, + "step": 10735 + }, + { + "epoch": 0.8505446623093682, + "grad_norm": 1.4649717153794204, + "learning_rate": 1.148376880462161e-06, + "loss": 0.1371, + "step": 10736 + }, + { + "epoch": 0.8506238859180035, + "grad_norm": 1.4027923989333666, + "learning_rate": 1.1471832425727825e-06, + "loss": 0.1273, + "step": 10737 + }, + { + "epoch": 0.850703109526639, + "grad_norm": 2.2080231813661464, + "learning_rate": 1.14599018759879e-06, + "loss": 0.237, + "step": 10738 + }, + { + "epoch": 0.8507823331352743, + "grad_norm": 1.1946025165554786, + "learning_rate": 1.1447977156187395e-06, + "loss": 0.098, + "step": 10739 + }, + { + "epoch": 0.8508615567439097, + "grad_norm": 1.5738243768106932, + "learning_rate": 1.1436058267111527e-06, + "loss": 0.1262, + "step": 10740 + }, + { + "epoch": 0.8509407803525451, + "grad_norm": 1.451179334028792, + "learning_rate": 1.1424145209545079e-06, + "loss": 0.1459, + "step": 10741 + }, + { + "epoch": 0.8510200039611804, + "grad_norm": 1.3704756152187496, + "learning_rate": 1.1412237984272467e-06, + "loss": 0.0951, + "step": 10742 + }, + { + "epoch": 0.8510992275698158, + "grad_norm": 1.4400951994687616, + "learning_rate": 1.140033659207771e-06, + "loss": 0.1094, + "step": 10743 + }, + { + "epoch": 0.8511784511784511, + "grad_norm": 1.841590275935626, + "learning_rate": 1.1388441033744502e-06, + "loss": 0.1607, + "step": 10744 + }, + { + "epoch": 0.8512576747870866, + "grad_norm": 1.588652919642301, + "learning_rate": 1.1376551310056073e-06, + "loss": 0.1519, + "step": 10745 + }, + { + "epoch": 0.8513368983957219, + "grad_norm": 1.4371990395365755, + "learning_rate": 1.1364667421795283e-06, + "loss": 0.1583, + "step": 10746 + }, + { + "epoch": 0.8514161220043573, + "grad_norm": 1.4173112946760453, + "learning_rate": 1.1352789369744688e-06, + "loss": 0.1133, + "step": 10747 + }, + { + "epoch": 0.8514953456129927, + "grad_norm": 1.477009545268548, + "learning_rate": 1.134091715468636e-06, + "loss": 0.1484, + "step": 10748 + }, + { + "epoch": 0.851574569221628, + "grad_norm": 2.156278001422682, + "learning_rate": 1.132905077740203e-06, + "loss": 0.2004, + "step": 10749 + }, + { + "epoch": 0.8516537928302634, + "grad_norm": 2.384420320272095, + "learning_rate": 1.131719023867306e-06, + "loss": 0.2268, + "step": 10750 + }, + { + "epoch": 0.8517330164388988, + "grad_norm": 1.6216524252611697, + "learning_rate": 1.1305335539280392e-06, + "loss": 0.1669, + "step": 10751 + }, + { + "epoch": 0.8518122400475342, + "grad_norm": 1.402160304943576, + "learning_rate": 1.1293486680004607e-06, + "loss": 0.1135, + "step": 10752 + }, + { + "epoch": 0.8518914636561695, + "grad_norm": 1.7332955130269427, + "learning_rate": 1.1281643661625896e-06, + "loss": 0.1824, + "step": 10753 + }, + { + "epoch": 0.851970687264805, + "grad_norm": 1.6749409114378597, + "learning_rate": 1.1269806484924072e-06, + "loss": 0.1665, + "step": 10754 + }, + { + "epoch": 0.8520499108734403, + "grad_norm": 1.6215578409924865, + "learning_rate": 1.1257975150678557e-06, + "loss": 0.1649, + "step": 10755 + }, + { + "epoch": 0.8521291344820756, + "grad_norm": 2.0761668365764474, + "learning_rate": 1.124614965966835e-06, + "loss": 0.1978, + "step": 10756 + }, + { + "epoch": 0.852208358090711, + "grad_norm": 1.3136853998749878, + "learning_rate": 1.1234330012672146e-06, + "loss": 0.0878, + "step": 10757 + }, + { + "epoch": 0.8522875816993464, + "grad_norm": 1.592019295297516, + "learning_rate": 1.1222516210468204e-06, + "loss": 0.1363, + "step": 10758 + }, + { + "epoch": 0.8523668053079818, + "grad_norm": 1.1758758040708581, + "learning_rate": 1.121070825383438e-06, + "loss": 0.096, + "step": 10759 + }, + { + "epoch": 0.8524460289166171, + "grad_norm": 1.631126614257944, + "learning_rate": 1.1198906143548216e-06, + "loss": 0.1193, + "step": 10760 + }, + { + "epoch": 0.8525252525252526, + "grad_norm": 1.4731571300171946, + "learning_rate": 1.1187109880386794e-06, + "loss": 0.1313, + "step": 10761 + }, + { + "epoch": 0.8526044761338879, + "grad_norm": 1.5096728444822427, + "learning_rate": 1.117531946512682e-06, + "loss": 0.1284, + "step": 10762 + }, + { + "epoch": 0.8526836997425232, + "grad_norm": 1.449893354010232, + "learning_rate": 1.1163534898544692e-06, + "loss": 0.1503, + "step": 10763 + }, + { + "epoch": 0.8527629233511587, + "grad_norm": 1.3353347513699871, + "learning_rate": 1.1151756181416328e-06, + "loss": 0.1392, + "step": 10764 + }, + { + "epoch": 0.852842146959794, + "grad_norm": 1.2333673607803275, + "learning_rate": 1.1139983314517288e-06, + "loss": 0.0815, + "step": 10765 + }, + { + "epoch": 0.8529213705684294, + "grad_norm": 1.131887101651616, + "learning_rate": 1.1128216298622808e-06, + "loss": 0.0838, + "step": 10766 + }, + { + "epoch": 0.8530005941770648, + "grad_norm": 1.4841197255398926, + "learning_rate": 1.1116455134507665e-06, + "loss": 0.1256, + "step": 10767 + }, + { + "epoch": 0.8530798177857002, + "grad_norm": 2.0210254513643395, + "learning_rate": 1.110469982294624e-06, + "loss": 0.1639, + "step": 10768 + }, + { + "epoch": 0.8531590413943355, + "grad_norm": 1.3508216702846854, + "learning_rate": 1.1092950364712617e-06, + "loss": 0.1139, + "step": 10769 + }, + { + "epoch": 0.8532382650029708, + "grad_norm": 1.9734629410394948, + "learning_rate": 1.1081206760580422e-06, + "loss": 0.185, + "step": 10770 + }, + { + "epoch": 0.8533174886116063, + "grad_norm": 1.4024693287905854, + "learning_rate": 1.1069469011322908e-06, + "loss": 0.1416, + "step": 10771 + }, + { + "epoch": 0.8533967122202416, + "grad_norm": 1.6091626929897471, + "learning_rate": 1.1057737117712941e-06, + "loss": 0.158, + "step": 10772 + }, + { + "epoch": 0.853475935828877, + "grad_norm": 1.3352103199377978, + "learning_rate": 1.1046011080523034e-06, + "loss": 0.1424, + "step": 10773 + }, + { + "epoch": 0.8535551594375124, + "grad_norm": 1.4384358195654565, + "learning_rate": 1.1034290900525279e-06, + "loss": 0.176, + "step": 10774 + }, + { + "epoch": 0.8536343830461478, + "grad_norm": 1.2364957322053722, + "learning_rate": 1.1022576578491372e-06, + "loss": 0.1267, + "step": 10775 + }, + { + "epoch": 0.8537136066547831, + "grad_norm": 1.5226341616040144, + "learning_rate": 1.1010868115192696e-06, + "loss": 0.1675, + "step": 10776 + }, + { + "epoch": 0.8537928302634185, + "grad_norm": 1.4993668987531505, + "learning_rate": 1.0999165511400157e-06, + "loss": 0.1191, + "step": 10777 + }, + { + "epoch": 0.8538720538720539, + "grad_norm": 1.616445639301569, + "learning_rate": 1.09874687678843e-06, + "loss": 0.1332, + "step": 10778 + }, + { + "epoch": 0.8539512774806892, + "grad_norm": 1.7291532180765008, + "learning_rate": 1.097577788541535e-06, + "loss": 0.1963, + "step": 10779 + }, + { + "epoch": 0.8540305010893247, + "grad_norm": 1.9633627835641465, + "learning_rate": 1.0964092864763065e-06, + "loss": 0.1586, + "step": 10780 + }, + { + "epoch": 0.85410972469796, + "grad_norm": 1.8994142774003182, + "learning_rate": 1.095241370669684e-06, + "loss": 0.1987, + "step": 10781 + }, + { + "epoch": 0.8541889483065954, + "grad_norm": 1.213025168452882, + "learning_rate": 1.0940740411985718e-06, + "loss": 0.1057, + "step": 10782 + }, + { + "epoch": 0.8542681719152307, + "grad_norm": 1.3279690628374319, + "learning_rate": 1.0929072981398313e-06, + "loss": 0.1029, + "step": 10783 + }, + { + "epoch": 0.8543473955238661, + "grad_norm": 1.7370033958933218, + "learning_rate": 1.091741141570285e-06, + "loss": 0.1757, + "step": 10784 + }, + { + "epoch": 0.8544266191325015, + "grad_norm": 1.5499041717521618, + "learning_rate": 1.0905755715667222e-06, + "loss": 0.1648, + "step": 10785 + }, + { + "epoch": 0.8545058427411368, + "grad_norm": 1.6940885929497413, + "learning_rate": 1.0894105882058891e-06, + "loss": 0.1804, + "step": 10786 + }, + { + "epoch": 0.8545850663497723, + "grad_norm": 2.006176652490394, + "learning_rate": 1.0882461915644936e-06, + "loss": 0.1417, + "step": 10787 + }, + { + "epoch": 0.8546642899584076, + "grad_norm": 2.2155747054148796, + "learning_rate": 1.0870823817192045e-06, + "loss": 0.2456, + "step": 10788 + }, + { + "epoch": 0.854743513567043, + "grad_norm": 1.3053202447701555, + "learning_rate": 1.0859191587466556e-06, + "loss": 0.0999, + "step": 10789 + }, + { + "epoch": 0.8548227371756784, + "grad_norm": 1.5984221308445026, + "learning_rate": 1.0847565227234392e-06, + "loss": 0.1523, + "step": 10790 + }, + { + "epoch": 0.8549019607843137, + "grad_norm": 1.5777703577484765, + "learning_rate": 1.0835944737261072e-06, + "loss": 0.1643, + "step": 10791 + }, + { + "epoch": 0.8549811843929491, + "grad_norm": 1.4860269615427253, + "learning_rate": 1.0824330118311765e-06, + "loss": 0.1364, + "step": 10792 + }, + { + "epoch": 0.8550604080015844, + "grad_norm": 1.9153429872493002, + "learning_rate": 1.0812721371151213e-06, + "loss": 0.1675, + "step": 10793 + }, + { + "epoch": 0.8551396316102199, + "grad_norm": 1.2205180080888183, + "learning_rate": 1.080111849654384e-06, + "loss": 0.0926, + "step": 10794 + }, + { + "epoch": 0.8552188552188552, + "grad_norm": 1.7285872323238727, + "learning_rate": 1.078952149525362e-06, + "loss": 0.1365, + "step": 10795 + }, + { + "epoch": 0.8552980788274905, + "grad_norm": 1.3793799193755691, + "learning_rate": 1.0777930368044143e-06, + "loss": 0.1207, + "step": 10796 + }, + { + "epoch": 0.855377302436126, + "grad_norm": 1.8215710165403955, + "learning_rate": 1.0766345115678633e-06, + "loss": 0.1572, + "step": 10797 + }, + { + "epoch": 0.8554565260447613, + "grad_norm": 1.5703175832446667, + "learning_rate": 1.0754765738919947e-06, + "loss": 0.1833, + "step": 10798 + }, + { + "epoch": 0.8555357496533967, + "grad_norm": 1.4602348802495926, + "learning_rate": 1.074319223853052e-06, + "loss": 0.1968, + "step": 10799 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 1.1861156951345262, + "learning_rate": 1.0731624615272385e-06, + "loss": 0.0817, + "step": 10800 + }, + { + "epoch": 0.8556941968706675, + "grad_norm": 1.2166464024514778, + "learning_rate": 1.0720062869907255e-06, + "loss": 0.1146, + "step": 10801 + }, + { + "epoch": 0.8557734204793028, + "grad_norm": 1.4642846957734983, + "learning_rate": 1.07085070031964e-06, + "loss": 0.1175, + "step": 10802 + }, + { + "epoch": 0.8558526440879382, + "grad_norm": 1.5136991003757623, + "learning_rate": 1.06969570159007e-06, + "loss": 0.229, + "step": 10803 + }, + { + "epoch": 0.8559318676965736, + "grad_norm": 1.7684758075250464, + "learning_rate": 1.0685412908780702e-06, + "loss": 0.2779, + "step": 10804 + }, + { + "epoch": 0.8560110913052089, + "grad_norm": 1.6070700715321955, + "learning_rate": 1.0673874682596497e-06, + "loss": 0.2305, + "step": 10805 + }, + { + "epoch": 0.8560903149138444, + "grad_norm": 1.16904737079037, + "learning_rate": 1.0662342338107823e-06, + "loss": 0.0889, + "step": 10806 + }, + { + "epoch": 0.8561695385224797, + "grad_norm": 1.9044992815805222, + "learning_rate": 1.065081587607406e-06, + "loss": 0.2386, + "step": 10807 + }, + { + "epoch": 0.8562487621311151, + "grad_norm": 1.6172652451385783, + "learning_rate": 1.0639295297254149e-06, + "loss": 0.1262, + "step": 10808 + }, + { + "epoch": 0.8563279857397504, + "grad_norm": 1.4735378412564195, + "learning_rate": 1.0627780602406656e-06, + "loss": 0.1347, + "step": 10809 + }, + { + "epoch": 0.8564072093483858, + "grad_norm": 1.3239435164978808, + "learning_rate": 1.061627179228979e-06, + "loss": 0.1508, + "step": 10810 + }, + { + "epoch": 0.8564864329570212, + "grad_norm": 1.9948249869847379, + "learning_rate": 1.0604768867661342e-06, + "loss": 0.1812, + "step": 10811 + }, + { + "epoch": 0.8565656565656565, + "grad_norm": 1.9830625571650182, + "learning_rate": 1.0593271829278718e-06, + "loss": 0.2253, + "step": 10812 + }, + { + "epoch": 0.856644880174292, + "grad_norm": 1.4365093643391995, + "learning_rate": 1.0581780677898924e-06, + "loss": 0.1588, + "step": 10813 + }, + { + "epoch": 0.8567241037829273, + "grad_norm": 1.8006801448401626, + "learning_rate": 1.0570295414278642e-06, + "loss": 0.1818, + "step": 10814 + }, + { + "epoch": 0.8568033273915627, + "grad_norm": 1.3889140714159647, + "learning_rate": 1.0558816039174102e-06, + "loss": 0.14, + "step": 10815 + }, + { + "epoch": 0.8568825510001981, + "grad_norm": 2.2535281048472844, + "learning_rate": 1.0547342553341144e-06, + "loss": 0.1794, + "step": 10816 + }, + { + "epoch": 0.8569617746088334, + "grad_norm": 1.3078616228040925, + "learning_rate": 1.0535874957535275e-06, + "loss": 0.1385, + "step": 10817 + }, + { + "epoch": 0.8570409982174688, + "grad_norm": 1.3735011857183317, + "learning_rate": 1.0524413252511567e-06, + "loss": 0.1045, + "step": 10818 + }, + { + "epoch": 0.8571202218261041, + "grad_norm": 1.5673797396505862, + "learning_rate": 1.0512957439024697e-06, + "loss": 0.141, + "step": 10819 + }, + { + "epoch": 0.8571994454347396, + "grad_norm": 1.9024765870579567, + "learning_rate": 1.0501507517829012e-06, + "loss": 0.1314, + "step": 10820 + }, + { + "epoch": 0.8572786690433749, + "grad_norm": 1.5778784050280112, + "learning_rate": 1.0490063489678427e-06, + "loss": 0.1269, + "step": 10821 + }, + { + "epoch": 0.8573578926520103, + "grad_norm": 1.4779781148025237, + "learning_rate": 1.0478625355326445e-06, + "loss": 0.1351, + "step": 10822 + }, + { + "epoch": 0.8574371162606457, + "grad_norm": 1.320203920000287, + "learning_rate": 1.0467193115526254e-06, + "loss": 0.1101, + "step": 10823 + }, + { + "epoch": 0.857516339869281, + "grad_norm": 1.5433091785593682, + "learning_rate": 1.0455766771030585e-06, + "loss": 0.16, + "step": 10824 + }, + { + "epoch": 0.8575955634779164, + "grad_norm": 1.8562092527200387, + "learning_rate": 1.0444346322591804e-06, + "loss": 0.2233, + "step": 10825 + }, + { + "epoch": 0.8576747870865518, + "grad_norm": 1.6462935814332227, + "learning_rate": 1.0432931770961907e-06, + "loss": 0.1618, + "step": 10826 + }, + { + "epoch": 0.8577540106951872, + "grad_norm": 1.5957889532741771, + "learning_rate": 1.0421523116892496e-06, + "loss": 0.1251, + "step": 10827 + }, + { + "epoch": 0.8578332343038225, + "grad_norm": 1.304027012229241, + "learning_rate": 1.0410120361134767e-06, + "loss": 0.126, + "step": 10828 + }, + { + "epoch": 0.857912457912458, + "grad_norm": 1.9042148690092953, + "learning_rate": 1.0398723504439512e-06, + "loss": 0.1829, + "step": 10829 + }, + { + "epoch": 0.8579916815210933, + "grad_norm": 2.342898273114089, + "learning_rate": 1.0387332547557194e-06, + "loss": 0.1685, + "step": 10830 + }, + { + "epoch": 0.8580709051297286, + "grad_norm": 1.7327325448050124, + "learning_rate": 1.0375947491237836e-06, + "loss": 0.1534, + "step": 10831 + }, + { + "epoch": 0.858150128738364, + "grad_norm": 1.5784442859851633, + "learning_rate": 1.0364568336231085e-06, + "loss": 0.1815, + "step": 10832 + }, + { + "epoch": 0.8582293523469994, + "grad_norm": 1.195464172406603, + "learning_rate": 1.0353195083286226e-06, + "loss": 0.0939, + "step": 10833 + }, + { + "epoch": 0.8583085759556348, + "grad_norm": 1.5450215768030042, + "learning_rate": 1.034182773315211e-06, + "loss": 0.147, + "step": 10834 + }, + { + "epoch": 0.8583877995642701, + "grad_norm": 1.8235731676005325, + "learning_rate": 1.0330466286577224e-06, + "loss": 0.1759, + "step": 10835 + }, + { + "epoch": 0.8584670231729056, + "grad_norm": 1.8986666570621906, + "learning_rate": 1.031911074430968e-06, + "loss": 0.1597, + "step": 10836 + }, + { + "epoch": 0.8585462467815409, + "grad_norm": 1.4176442126967628, + "learning_rate": 1.030776110709718e-06, + "loss": 0.1732, + "step": 10837 + }, + { + "epoch": 0.8586254703901762, + "grad_norm": 1.7690483511991986, + "learning_rate": 1.0296417375687017e-06, + "loss": 0.1517, + "step": 10838 + }, + { + "epoch": 0.8587046939988117, + "grad_norm": 1.3487759139396924, + "learning_rate": 1.0285079550826172e-06, + "loss": 0.1507, + "step": 10839 + }, + { + "epoch": 0.858783917607447, + "grad_norm": 1.398186009240261, + "learning_rate": 1.0273747633261144e-06, + "loss": 0.1148, + "step": 10840 + }, + { + "epoch": 0.8588631412160824, + "grad_norm": 1.4749994500332302, + "learning_rate": 1.0262421623738105e-06, + "loss": 0.1728, + "step": 10841 + }, + { + "epoch": 0.8589423648247178, + "grad_norm": 1.5673828105074443, + "learning_rate": 1.0251101523002805e-06, + "loss": 0.1124, + "step": 10842 + }, + { + "epoch": 0.8590215884333532, + "grad_norm": 1.2554095222069959, + "learning_rate": 1.0239787331800632e-06, + "loss": 0.0925, + "step": 10843 + }, + { + "epoch": 0.8591008120419885, + "grad_norm": 1.2759442326899917, + "learning_rate": 1.022847905087656e-06, + "loss": 0.1163, + "step": 10844 + }, + { + "epoch": 0.8591800356506238, + "grad_norm": 2.366365731786112, + "learning_rate": 1.0217176680975183e-06, + "loss": 0.1802, + "step": 10845 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 1.4738024555333302, + "learning_rate": 1.0205880222840726e-06, + "loss": 0.1724, + "step": 10846 + }, + { + "epoch": 0.8593384828678946, + "grad_norm": 1.4613123930405818, + "learning_rate": 1.0194589677216992e-06, + "loss": 0.1366, + "step": 10847 + }, + { + "epoch": 0.85941770647653, + "grad_norm": 1.7077496912084549, + "learning_rate": 1.0183305044847402e-06, + "loss": 0.1232, + "step": 10848 + }, + { + "epoch": 0.8594969300851654, + "grad_norm": 1.4517121254553849, + "learning_rate": 1.0172026326475016e-06, + "loss": 0.152, + "step": 10849 + }, + { + "epoch": 0.8595761536938008, + "grad_norm": 1.4148203809648374, + "learning_rate": 1.0160753522842482e-06, + "loss": 0.1736, + "step": 10850 + }, + { + "epoch": 0.8596553773024361, + "grad_norm": 2.086715696888453, + "learning_rate": 1.0149486634692019e-06, + "loss": 0.255, + "step": 10851 + }, + { + "epoch": 0.8597346009110715, + "grad_norm": 1.9728700402459152, + "learning_rate": 1.0138225662765555e-06, + "loss": 0.1897, + "step": 10852 + }, + { + "epoch": 0.8598138245197069, + "grad_norm": 1.7380816456086252, + "learning_rate": 1.0126970607804532e-06, + "loss": 0.1037, + "step": 10853 + }, + { + "epoch": 0.8598930481283422, + "grad_norm": 1.0552491580208538, + "learning_rate": 1.0115721470550045e-06, + "loss": 0.1065, + "step": 10854 + }, + { + "epoch": 0.8599722717369777, + "grad_norm": 1.7852307757394277, + "learning_rate": 1.0104478251742822e-06, + "loss": 0.2004, + "step": 10855 + }, + { + "epoch": 0.860051495345613, + "grad_norm": 1.643207040178242, + "learning_rate": 1.009324095212315e-06, + "loss": 0.1728, + "step": 10856 + }, + { + "epoch": 0.8601307189542484, + "grad_norm": 1.648967191646453, + "learning_rate": 1.0082009572430963e-06, + "loss": 0.1906, + "step": 10857 + }, + { + "epoch": 0.8602099425628837, + "grad_norm": 1.654034385543417, + "learning_rate": 1.0070784113405763e-06, + "loss": 0.1747, + "step": 10858 + }, + { + "epoch": 0.8602891661715191, + "grad_norm": 1.5233594064053226, + "learning_rate": 1.005956457578675e-06, + "loss": 0.1309, + "step": 10859 + }, + { + "epoch": 0.8603683897801545, + "grad_norm": 1.3149657327820967, + "learning_rate": 1.0048350960312637e-06, + "loss": 0.1086, + "step": 10860 + }, + { + "epoch": 0.8604476133887898, + "grad_norm": 1.2705457344890998, + "learning_rate": 1.003714326772176e-06, + "loss": 0.1007, + "step": 10861 + }, + { + "epoch": 0.8605268369974253, + "grad_norm": 1.6777246883902823, + "learning_rate": 1.0025941498752167e-06, + "loss": 0.1468, + "step": 10862 + }, + { + "epoch": 0.8606060606060606, + "grad_norm": 1.6829933122991079, + "learning_rate": 1.001474565414139e-06, + "loss": 0.2187, + "step": 10863 + }, + { + "epoch": 0.860685284214696, + "grad_norm": 1.4023549832523587, + "learning_rate": 1.0003555734626603e-06, + "loss": 0.1048, + "step": 10864 + }, + { + "epoch": 0.8607645078233314, + "grad_norm": 1.447306692629291, + "learning_rate": 9.992371740944663e-07, + "loss": 0.1227, + "step": 10865 + }, + { + "epoch": 0.8608437314319667, + "grad_norm": 1.1714256896915287, + "learning_rate": 9.981193673831946e-07, + "loss": 0.1149, + "step": 10866 + }, + { + "epoch": 0.8609229550406021, + "grad_norm": 2.1047922878558203, + "learning_rate": 9.970021534024476e-07, + "loss": 0.1971, + "step": 10867 + }, + { + "epoch": 0.8610021786492374, + "grad_norm": 1.4249657713444897, + "learning_rate": 9.958855322257922e-07, + "loss": 0.1609, + "step": 10868 + }, + { + "epoch": 0.8610814022578729, + "grad_norm": 1.599016886892758, + "learning_rate": 9.94769503926748e-07, + "loss": 0.1421, + "step": 10869 + }, + { + "epoch": 0.8611606258665082, + "grad_norm": 1.237440408740556, + "learning_rate": 9.936540685787998e-07, + "loss": 0.1082, + "step": 10870 + }, + { + "epoch": 0.8612398494751435, + "grad_norm": 1.6487295026807114, + "learning_rate": 9.925392262553968e-07, + "loss": 0.1617, + "step": 10871 + }, + { + "epoch": 0.861319073083779, + "grad_norm": 1.741309506424304, + "learning_rate": 9.914249770299445e-07, + "loss": 0.1306, + "step": 10872 + }, + { + "epoch": 0.8613982966924143, + "grad_norm": 1.279019951543846, + "learning_rate": 9.903113209758098e-07, + "loss": 0.1404, + "step": 10873 + }, + { + "epoch": 0.8614775203010497, + "grad_norm": 1.3963363639152515, + "learning_rate": 9.89198258166324e-07, + "loss": 0.1525, + "step": 10874 + }, + { + "epoch": 0.8615567439096851, + "grad_norm": 1.8041897769278583, + "learning_rate": 9.880857886747753e-07, + "loss": 0.1598, + "step": 10875 + }, + { + "epoch": 0.8616359675183205, + "grad_norm": 1.7045084859332988, + "learning_rate": 9.869739125744138e-07, + "loss": 0.1602, + "step": 10876 + }, + { + "epoch": 0.8617151911269558, + "grad_norm": 2.042644580505003, + "learning_rate": 9.858626299384532e-07, + "loss": 0.1366, + "step": 10877 + }, + { + "epoch": 0.8617944147355912, + "grad_norm": 1.5019622331554636, + "learning_rate": 9.847519408400663e-07, + "loss": 0.1378, + "step": 10878 + }, + { + "epoch": 0.8618736383442266, + "grad_norm": 1.3394970830297506, + "learning_rate": 9.836418453523833e-07, + "loss": 0.1108, + "step": 10879 + }, + { + "epoch": 0.8619528619528619, + "grad_norm": 1.1974992808467884, + "learning_rate": 9.825323435485024e-07, + "loss": 0.1237, + "step": 10880 + }, + { + "epoch": 0.8620320855614974, + "grad_norm": 1.366608349419337, + "learning_rate": 9.814234355014774e-07, + "loss": 0.1505, + "step": 10881 + }, + { + "epoch": 0.8621113091701327, + "grad_norm": 1.8277586847796512, + "learning_rate": 9.803151212843253e-07, + "loss": 0.1458, + "step": 10882 + }, + { + "epoch": 0.8621905327787681, + "grad_norm": 1.3973806595432972, + "learning_rate": 9.792074009700192e-07, + "loss": 0.1222, + "step": 10883 + }, + { + "epoch": 0.8622697563874034, + "grad_norm": 1.330393299093837, + "learning_rate": 9.781002746315039e-07, + "loss": 0.1544, + "step": 10884 + }, + { + "epoch": 0.8623489799960388, + "grad_norm": 1.3206180880977487, + "learning_rate": 9.769937423416741e-07, + "loss": 0.1195, + "step": 10885 + }, + { + "epoch": 0.8624282036046742, + "grad_norm": 1.7941559545091346, + "learning_rate": 9.758878041733877e-07, + "loss": 0.1668, + "step": 10886 + }, + { + "epoch": 0.8625074272133095, + "grad_norm": 1.219754197377734, + "learning_rate": 9.747824601994715e-07, + "loss": 0.1284, + "step": 10887 + }, + { + "epoch": 0.862586650821945, + "grad_norm": 1.6937680598776363, + "learning_rate": 9.73677710492703e-07, + "loss": 0.1795, + "step": 10888 + }, + { + "epoch": 0.8626658744305803, + "grad_norm": 1.1020138151894168, + "learning_rate": 9.725735551258241e-07, + "loss": 0.0877, + "step": 10889 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 1.7430455854755134, + "learning_rate": 9.7146999417154e-07, + "loss": 0.1866, + "step": 10890 + }, + { + "epoch": 0.8628243216478511, + "grad_norm": 1.6091114606406487, + "learning_rate": 9.703670277025158e-07, + "loss": 0.1385, + "step": 10891 + }, + { + "epoch": 0.8629035452564864, + "grad_norm": 1.6923879144965714, + "learning_rate": 9.69264655791372e-07, + "loss": 0.1383, + "step": 10892 + }, + { + "epoch": 0.8629827688651218, + "grad_norm": 1.3407640947865838, + "learning_rate": 9.681628785107e-07, + "loss": 0.1172, + "step": 10893 + }, + { + "epoch": 0.8630619924737571, + "grad_norm": 2.0665182299967375, + "learning_rate": 9.670616959330437e-07, + "loss": 0.1897, + "step": 10894 + }, + { + "epoch": 0.8631412160823926, + "grad_norm": 1.6023769922756916, + "learning_rate": 9.659611081309095e-07, + "loss": 0.1356, + "step": 10895 + }, + { + "epoch": 0.8632204396910279, + "grad_norm": 1.533479378923773, + "learning_rate": 9.648611151767683e-07, + "loss": 0.1318, + "step": 10896 + }, + { + "epoch": 0.8632996632996633, + "grad_norm": 1.684501421004511, + "learning_rate": 9.637617171430492e-07, + "loss": 0.1501, + "step": 10897 + }, + { + "epoch": 0.8633788869082987, + "grad_norm": 1.3926314796341706, + "learning_rate": 9.626629141021414e-07, + "loss": 0.14, + "step": 10898 + }, + { + "epoch": 0.863458110516934, + "grad_norm": 1.9107749568752463, + "learning_rate": 9.615647061263933e-07, + "loss": 0.1826, + "step": 10899 + }, + { + "epoch": 0.8635373341255694, + "grad_norm": 1.5473046932735017, + "learning_rate": 9.604670932881211e-07, + "loss": 0.1558, + "step": 10900 + }, + { + "epoch": 0.8636165577342048, + "grad_norm": 1.5360773646610772, + "learning_rate": 9.593700756595958e-07, + "loss": 0.1263, + "step": 10901 + }, + { + "epoch": 0.8636957813428402, + "grad_norm": 1.2362728298280266, + "learning_rate": 9.582736533130488e-07, + "loss": 0.1155, + "step": 10902 + }, + { + "epoch": 0.8637750049514755, + "grad_norm": 1.630227533659458, + "learning_rate": 9.571778263206767e-07, + "loss": 0.1688, + "step": 10903 + }, + { + "epoch": 0.863854228560111, + "grad_norm": 1.2581368232886827, + "learning_rate": 9.560825947546337e-07, + "loss": 0.0715, + "step": 10904 + }, + { + "epoch": 0.8639334521687463, + "grad_norm": 1.4397242223122362, + "learning_rate": 9.549879586870336e-07, + "loss": 0.1084, + "step": 10905 + }, + { + "epoch": 0.8640126757773816, + "grad_norm": 1.935895679477906, + "learning_rate": 9.538939181899565e-07, + "loss": 0.1921, + "step": 10906 + }, + { + "epoch": 0.864091899386017, + "grad_norm": 1.8147061921474954, + "learning_rate": 9.528004733354379e-07, + "loss": 0.224, + "step": 10907 + }, + { + "epoch": 0.8641711229946524, + "grad_norm": 1.637831248536567, + "learning_rate": 9.517076241954737e-07, + "loss": 0.1346, + "step": 10908 + }, + { + "epoch": 0.8642503466032878, + "grad_norm": 1.3868415411157224, + "learning_rate": 9.506153708420263e-07, + "loss": 0.1589, + "step": 10909 + }, + { + "epoch": 0.8643295702119231, + "grad_norm": 1.8401082024487943, + "learning_rate": 9.495237133470148e-07, + "loss": 0.1527, + "step": 10910 + }, + { + "epoch": 0.8644087938205586, + "grad_norm": 1.7338112602095568, + "learning_rate": 9.484326517823173e-07, + "loss": 0.1157, + "step": 10911 + }, + { + "epoch": 0.8644880174291939, + "grad_norm": 1.3311708290987254, + "learning_rate": 9.473421862197751e-07, + "loss": 0.0921, + "step": 10912 + }, + { + "epoch": 0.8645672410378292, + "grad_norm": 1.38500024275264, + "learning_rate": 9.462523167311943e-07, + "loss": 0.1236, + "step": 10913 + }, + { + "epoch": 0.8646464646464647, + "grad_norm": 1.3587894687926279, + "learning_rate": 9.45163043388333e-07, + "loss": 0.1406, + "step": 10914 + }, + { + "epoch": 0.8647256882551, + "grad_norm": 1.4853222950267773, + "learning_rate": 9.440743662629149e-07, + "loss": 0.119, + "step": 10915 + }, + { + "epoch": 0.8648049118637354, + "grad_norm": 1.3161393839214954, + "learning_rate": 9.429862854266281e-07, + "loss": 0.1298, + "step": 10916 + }, + { + "epoch": 0.8648841354723708, + "grad_norm": 1.30155870209671, + "learning_rate": 9.418988009511143e-07, + "loss": 0.1877, + "step": 10917 + }, + { + "epoch": 0.8649633590810062, + "grad_norm": 1.3772659044981126, + "learning_rate": 9.408119129079774e-07, + "loss": 0.1074, + "step": 10918 + }, + { + "epoch": 0.8650425826896415, + "grad_norm": 1.8152303268814725, + "learning_rate": 9.397256213687877e-07, + "loss": 0.1727, + "step": 10919 + }, + { + "epoch": 0.8651218062982768, + "grad_norm": 1.630963268026159, + "learning_rate": 9.386399264050705e-07, + "loss": 0.1516, + "step": 10920 + }, + { + "epoch": 0.8652010299069123, + "grad_norm": 1.7559384345626894, + "learning_rate": 9.375548280883129e-07, + "loss": 0.1752, + "step": 10921 + }, + { + "epoch": 0.8652802535155476, + "grad_norm": 1.0052979564283187, + "learning_rate": 9.364703264899655e-07, + "loss": 0.0694, + "step": 10922 + }, + { + "epoch": 0.865359477124183, + "grad_norm": 1.4307741989888934, + "learning_rate": 9.353864216814356e-07, + "loss": 0.1152, + "step": 10923 + }, + { + "epoch": 0.8654387007328184, + "grad_norm": 1.5714513861751531, + "learning_rate": 9.34303113734093e-07, + "loss": 0.1613, + "step": 10924 + }, + { + "epoch": 0.8655179243414538, + "grad_norm": 1.6158667691647737, + "learning_rate": 9.332204027192693e-07, + "loss": 0.162, + "step": 10925 + }, + { + "epoch": 0.8655971479500891, + "grad_norm": 1.9475152060317027, + "learning_rate": 9.321382887082564e-07, + "loss": 0.1849, + "step": 10926 + }, + { + "epoch": 0.8656763715587245, + "grad_norm": 1.4501968352367012, + "learning_rate": 9.310567717723063e-07, + "loss": 0.1395, + "step": 10927 + }, + { + "epoch": 0.8657555951673599, + "grad_norm": 2.023412276534783, + "learning_rate": 9.299758519826274e-07, + "loss": 0.1785, + "step": 10928 + }, + { + "epoch": 0.8658348187759952, + "grad_norm": 1.8156694064351233, + "learning_rate": 9.288955294103996e-07, + "loss": 0.1752, + "step": 10929 + }, + { + "epoch": 0.8659140423846307, + "grad_norm": 1.419405780459077, + "learning_rate": 9.278158041267526e-07, + "loss": 0.1097, + "step": 10930 + }, + { + "epoch": 0.865993265993266, + "grad_norm": 1.3306470307918916, + "learning_rate": 9.267366762027818e-07, + "loss": 0.1122, + "step": 10931 + }, + { + "epoch": 0.8660724896019014, + "grad_norm": 1.4799954587417474, + "learning_rate": 9.256581457095437e-07, + "loss": 0.09, + "step": 10932 + }, + { + "epoch": 0.8661517132105367, + "grad_norm": 1.618289662372727, + "learning_rate": 9.245802127180547e-07, + "loss": 0.1318, + "step": 10933 + }, + { + "epoch": 0.8662309368191721, + "grad_norm": 1.8771465956869073, + "learning_rate": 9.235028772992883e-07, + "loss": 0.2053, + "step": 10934 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 1.4435619988728658, + "learning_rate": 9.224261395241862e-07, + "loss": 0.1198, + "step": 10935 + }, + { + "epoch": 0.8663893840364428, + "grad_norm": 1.4124381688410097, + "learning_rate": 9.213499994636443e-07, + "loss": 0.1343, + "step": 10936 + }, + { + "epoch": 0.8664686076450783, + "grad_norm": 1.4718364248482843, + "learning_rate": 9.202744571885191e-07, + "loss": 0.1486, + "step": 10937 + }, + { + "epoch": 0.8665478312537136, + "grad_norm": 1.3611804032830201, + "learning_rate": 9.19199512769634e-07, + "loss": 0.1545, + "step": 10938 + }, + { + "epoch": 0.866627054862349, + "grad_norm": 1.5509965022058763, + "learning_rate": 9.181251662777668e-07, + "loss": 0.1542, + "step": 10939 + }, + { + "epoch": 0.8667062784709844, + "grad_norm": 1.4241131746379252, + "learning_rate": 9.170514177836565e-07, + "loss": 0.1179, + "step": 10940 + }, + { + "epoch": 0.8667855020796197, + "grad_norm": 1.4927536056643087, + "learning_rate": 9.159782673580075e-07, + "loss": 0.0852, + "step": 10941 + }, + { + "epoch": 0.8668647256882551, + "grad_norm": 1.7781950234357393, + "learning_rate": 9.149057150714802e-07, + "loss": 0.1503, + "step": 10942 + }, + { + "epoch": 0.8669439492968904, + "grad_norm": 1.3215882219520056, + "learning_rate": 9.138337609946979e-07, + "loss": 0.1348, + "step": 10943 + }, + { + "epoch": 0.8670231729055259, + "grad_norm": 1.615757534741732, + "learning_rate": 9.127624051982398e-07, + "loss": 0.1485, + "step": 10944 + }, + { + "epoch": 0.8671023965141612, + "grad_norm": 1.411099802195532, + "learning_rate": 9.116916477526539e-07, + "loss": 0.1247, + "step": 10945 + }, + { + "epoch": 0.8671816201227966, + "grad_norm": 1.609359406492289, + "learning_rate": 9.106214887284437e-07, + "loss": 0.1871, + "step": 10946 + }, + { + "epoch": 0.867260843731432, + "grad_norm": 1.5128977160429287, + "learning_rate": 9.095519281960729e-07, + "loss": 0.1332, + "step": 10947 + }, + { + "epoch": 0.8673400673400673, + "grad_norm": 1.4739064857942477, + "learning_rate": 9.084829662259665e-07, + "loss": 0.1428, + "step": 10948 + }, + { + "epoch": 0.8674192909487027, + "grad_norm": 2.0304058916317365, + "learning_rate": 9.0741460288851e-07, + "loss": 0.2366, + "step": 10949 + }, + { + "epoch": 0.8674985145573381, + "grad_norm": 1.4281256566306535, + "learning_rate": 9.06346838254053e-07, + "loss": 0.1284, + "step": 10950 + }, + { + "epoch": 0.8675777381659735, + "grad_norm": 1.4461723891510856, + "learning_rate": 9.052796723929002e-07, + "loss": 0.1362, + "step": 10951 + }, + { + "epoch": 0.8676569617746088, + "grad_norm": 1.71993145427732, + "learning_rate": 9.042131053753211e-07, + "loss": 0.1475, + "step": 10952 + }, + { + "epoch": 0.8677361853832442, + "grad_norm": 1.8596572122954313, + "learning_rate": 9.031471372715405e-07, + "loss": 0.1775, + "step": 10953 + }, + { + "epoch": 0.8678154089918796, + "grad_norm": 1.6769984517695262, + "learning_rate": 9.020817681517513e-07, + "loss": 0.1508, + "step": 10954 + }, + { + "epoch": 0.8678946326005149, + "grad_norm": 1.4244549219414049, + "learning_rate": 9.010169980861005e-07, + "loss": 0.1495, + "step": 10955 + }, + { + "epoch": 0.8679738562091504, + "grad_norm": 1.505802156717825, + "learning_rate": 8.999528271446989e-07, + "loss": 0.1039, + "step": 10956 + }, + { + "epoch": 0.8680530798177857, + "grad_norm": 1.4340402783786517, + "learning_rate": 8.988892553976169e-07, + "loss": 0.099, + "step": 10957 + }, + { + "epoch": 0.8681323034264211, + "grad_norm": 2.115247406856523, + "learning_rate": 8.978262829148876e-07, + "loss": 0.1915, + "step": 10958 + }, + { + "epoch": 0.8682115270350564, + "grad_norm": 1.8304690097727192, + "learning_rate": 8.96763909766497e-07, + "loss": 0.1898, + "step": 10959 + }, + { + "epoch": 0.8682907506436918, + "grad_norm": 1.5608265902746414, + "learning_rate": 8.957021360224039e-07, + "loss": 0.1723, + "step": 10960 + }, + { + "epoch": 0.8683699742523272, + "grad_norm": 1.8236701291473523, + "learning_rate": 8.946409617525175e-07, + "loss": 0.1876, + "step": 10961 + }, + { + "epoch": 0.8684491978609625, + "grad_norm": 1.8918107502369663, + "learning_rate": 8.935803870267101e-07, + "loss": 0.1825, + "step": 10962 + }, + { + "epoch": 0.868528421469598, + "grad_norm": 1.3430174678914524, + "learning_rate": 8.925204119148189e-07, + "loss": 0.1137, + "step": 10963 + }, + { + "epoch": 0.8686076450782333, + "grad_norm": 1.5633821790368312, + "learning_rate": 8.914610364866361e-07, + "loss": 0.2083, + "step": 10964 + }, + { + "epoch": 0.8686868686868687, + "grad_norm": 1.6862680967035257, + "learning_rate": 8.904022608119145e-07, + "loss": 0.1839, + "step": 10965 + }, + { + "epoch": 0.868766092295504, + "grad_norm": 1.3817487535325526, + "learning_rate": 8.89344084960374e-07, + "loss": 0.161, + "step": 10966 + }, + { + "epoch": 0.8688453159041394, + "grad_norm": 1.3764737775294633, + "learning_rate": 8.882865090016868e-07, + "loss": 0.1423, + "step": 10967 + }, + { + "epoch": 0.8689245395127748, + "grad_norm": 2.0387122250176666, + "learning_rate": 8.872295330054915e-07, + "loss": 0.2511, + "step": 10968 + }, + { + "epoch": 0.8690037631214101, + "grad_norm": 1.7417491095967168, + "learning_rate": 8.861731570413801e-07, + "loss": 0.1435, + "step": 10969 + }, + { + "epoch": 0.8690829867300456, + "grad_norm": 1.8245586735407713, + "learning_rate": 8.85117381178916e-07, + "loss": 0.1719, + "step": 10970 + }, + { + "epoch": 0.8691622103386809, + "grad_norm": 1.4348621479945678, + "learning_rate": 8.840622054876147e-07, + "loss": 0.1018, + "step": 10971 + }, + { + "epoch": 0.8692414339473163, + "grad_norm": 1.4071228506766478, + "learning_rate": 8.830076300369517e-07, + "loss": 0.1069, + "step": 10972 + }, + { + "epoch": 0.8693206575559517, + "grad_norm": 1.4981608182394808, + "learning_rate": 8.819536548963703e-07, + "loss": 0.1099, + "step": 10973 + }, + { + "epoch": 0.869399881164587, + "grad_norm": 1.5620643659361901, + "learning_rate": 8.809002801352673e-07, + "loss": 0.127, + "step": 10974 + }, + { + "epoch": 0.8694791047732224, + "grad_norm": 1.197118461905713, + "learning_rate": 8.798475058230005e-07, + "loss": 0.0705, + "step": 10975 + }, + { + "epoch": 0.8695583283818578, + "grad_norm": 2.010391264621694, + "learning_rate": 8.787953320288945e-07, + "loss": 0.1983, + "step": 10976 + }, + { + "epoch": 0.8696375519904932, + "grad_norm": 1.3910331028974718, + "learning_rate": 8.777437588222271e-07, + "loss": 0.1286, + "step": 10977 + }, + { + "epoch": 0.8697167755991285, + "grad_norm": 1.985523289622962, + "learning_rate": 8.766927862722374e-07, + "loss": 0.1563, + "step": 10978 + }, + { + "epoch": 0.869795999207764, + "grad_norm": 1.6063611268815923, + "learning_rate": 8.756424144481313e-07, + "loss": 0.1513, + "step": 10979 + }, + { + "epoch": 0.8698752228163993, + "grad_norm": 1.5963887677771678, + "learning_rate": 8.745926434190688e-07, + "loss": 0.1044, + "step": 10980 + }, + { + "epoch": 0.8699544464250346, + "grad_norm": 1.9144467977065878, + "learning_rate": 8.735434732541704e-07, + "loss": 0.1634, + "step": 10981 + }, + { + "epoch": 0.87003367003367, + "grad_norm": 1.6848805101341802, + "learning_rate": 8.724949040225217e-07, + "loss": 0.1141, + "step": 10982 + }, + { + "epoch": 0.8701128936423054, + "grad_norm": 1.3423442401763854, + "learning_rate": 8.714469357931654e-07, + "loss": 0.0825, + "step": 10983 + }, + { + "epoch": 0.8701921172509408, + "grad_norm": 1.5686682865637216, + "learning_rate": 8.703995686351041e-07, + "loss": 0.1458, + "step": 10984 + }, + { + "epoch": 0.8702713408595761, + "grad_norm": 1.437883277949371, + "learning_rate": 8.693528026173015e-07, + "loss": 0.1175, + "step": 10985 + }, + { + "epoch": 0.8703505644682116, + "grad_norm": 1.6538878190964008, + "learning_rate": 8.683066378086846e-07, + "loss": 0.1655, + "step": 10986 + }, + { + "epoch": 0.8704297880768469, + "grad_norm": 1.8758151922182287, + "learning_rate": 8.672610742781363e-07, + "loss": 0.1794, + "step": 10987 + }, + { + "epoch": 0.8705090116854822, + "grad_norm": 1.5345979598187736, + "learning_rate": 8.662161120945e-07, + "loss": 0.1065, + "step": 10988 + }, + { + "epoch": 0.8705882352941177, + "grad_norm": 1.8064827340296348, + "learning_rate": 8.651717513265867e-07, + "loss": 0.187, + "step": 10989 + }, + { + "epoch": 0.870667458902753, + "grad_norm": 1.5954980427273124, + "learning_rate": 8.641279920431589e-07, + "loss": 0.1521, + "step": 10990 + }, + { + "epoch": 0.8707466825113884, + "grad_norm": 1.306287513997135, + "learning_rate": 8.630848343129417e-07, + "loss": 0.1018, + "step": 10991 + }, + { + "epoch": 0.8708259061200238, + "grad_norm": 1.7290935975526414, + "learning_rate": 8.620422782046268e-07, + "loss": 0.1897, + "step": 10992 + }, + { + "epoch": 0.8709051297286592, + "grad_norm": 1.943586483832113, + "learning_rate": 8.61000323786858e-07, + "loss": 0.1669, + "step": 10993 + }, + { + "epoch": 0.8709843533372945, + "grad_norm": 1.783851774803501, + "learning_rate": 8.599589711282419e-07, + "loss": 0.1933, + "step": 10994 + }, + { + "epoch": 0.8710635769459298, + "grad_norm": 1.5047946982353315, + "learning_rate": 8.589182202973512e-07, + "loss": 0.1061, + "step": 10995 + }, + { + "epoch": 0.8711428005545653, + "grad_norm": 1.851562489959027, + "learning_rate": 8.578780713627111e-07, + "loss": 0.1466, + "step": 10996 + }, + { + "epoch": 0.8712220241632006, + "grad_norm": 1.5071103075973546, + "learning_rate": 8.568385243928112e-07, + "loss": 0.1472, + "step": 10997 + }, + { + "epoch": 0.871301247771836, + "grad_norm": 1.8438473502919566, + "learning_rate": 8.55799579456098e-07, + "loss": 0.1521, + "step": 10998 + }, + { + "epoch": 0.8713804713804714, + "grad_norm": 1.5013749162864427, + "learning_rate": 8.547612366209856e-07, + "loss": 0.117, + "step": 10999 + }, + { + "epoch": 0.8714596949891068, + "grad_norm": 1.6431376866387268, + "learning_rate": 8.537234959558416e-07, + "loss": 0.1588, + "step": 11000 + }, + { + "epoch": 0.8715389185977421, + "grad_norm": 1.4852220193625887, + "learning_rate": 8.526863575289945e-07, + "loss": 0.0998, + "step": 11001 + }, + { + "epoch": 0.8716181422063775, + "grad_norm": 1.5706793268387085, + "learning_rate": 8.516498214087387e-07, + "loss": 0.1736, + "step": 11002 + }, + { + "epoch": 0.8716973658150129, + "grad_norm": 1.598329115451917, + "learning_rate": 8.50613887663323e-07, + "loss": 0.1409, + "step": 11003 + }, + { + "epoch": 0.8717765894236482, + "grad_norm": 1.7190647850687741, + "learning_rate": 8.495785563609571e-07, + "loss": 0.1634, + "step": 11004 + }, + { + "epoch": 0.8718558130322837, + "grad_norm": 1.1667699969143297, + "learning_rate": 8.485438275698154e-07, + "loss": 0.0918, + "step": 11005 + }, + { + "epoch": 0.871935036640919, + "grad_norm": 1.7757176440618103, + "learning_rate": 8.475097013580292e-07, + "loss": 0.2365, + "step": 11006 + }, + { + "epoch": 0.8720142602495544, + "grad_norm": 1.976556341946614, + "learning_rate": 8.46476177793688e-07, + "loss": 0.2056, + "step": 11007 + }, + { + "epoch": 0.8720934838581897, + "grad_norm": 1.5208400080275073, + "learning_rate": 8.454432569448489e-07, + "loss": 0.1599, + "step": 11008 + }, + { + "epoch": 0.8721727074668251, + "grad_norm": 1.4873676647060463, + "learning_rate": 8.444109388795218e-07, + "loss": 0.1527, + "step": 11009 + }, + { + "epoch": 0.8722519310754605, + "grad_norm": 1.9450527855229645, + "learning_rate": 8.43379223665679e-07, + "loss": 0.1575, + "step": 11010 + }, + { + "epoch": 0.8723311546840958, + "grad_norm": 1.7083899811408323, + "learning_rate": 8.423481113712573e-07, + "loss": 0.1901, + "step": 11011 + }, + { + "epoch": 0.8724103782927313, + "grad_norm": 1.7986818100625264, + "learning_rate": 8.413176020641489e-07, + "loss": 0.1876, + "step": 11012 + }, + { + "epoch": 0.8724896019013666, + "grad_norm": 1.9088068931451552, + "learning_rate": 8.402876958122075e-07, + "loss": 0.1677, + "step": 11013 + }, + { + "epoch": 0.872568825510002, + "grad_norm": 1.2644182697903634, + "learning_rate": 8.392583926832454e-07, + "loss": 0.1064, + "step": 11014 + }, + { + "epoch": 0.8726480491186374, + "grad_norm": 1.3243914060493218, + "learning_rate": 8.382296927450417e-07, + "loss": 0.1341, + "step": 11015 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 1.405091237630225, + "learning_rate": 8.37201596065329e-07, + "loss": 0.1585, + "step": 11016 + }, + { + "epoch": 0.8728064963359081, + "grad_norm": 1.6326457541248793, + "learning_rate": 8.361741027118009e-07, + "loss": 0.1368, + "step": 11017 + }, + { + "epoch": 0.8728857199445434, + "grad_norm": 1.4428408014739877, + "learning_rate": 8.351472127521166e-07, + "loss": 0.1571, + "step": 11018 + }, + { + "epoch": 0.8729649435531789, + "grad_norm": 1.2730069115241511, + "learning_rate": 8.341209262538896e-07, + "loss": 0.129, + "step": 11019 + }, + { + "epoch": 0.8730441671618142, + "grad_norm": 1.421899450309346, + "learning_rate": 8.330952432846939e-07, + "loss": 0.1286, + "step": 11020 + }, + { + "epoch": 0.8731233907704496, + "grad_norm": 2.3210584919098376, + "learning_rate": 8.320701639120709e-07, + "loss": 0.1806, + "step": 11021 + }, + { + "epoch": 0.873202614379085, + "grad_norm": 1.4065445920443453, + "learning_rate": 8.310456882035145e-07, + "loss": 0.1561, + "step": 11022 + }, + { + "epoch": 0.8732818379877203, + "grad_norm": 1.9764765631166432, + "learning_rate": 8.300218162264783e-07, + "loss": 0.1898, + "step": 11023 + }, + { + "epoch": 0.8733610615963557, + "grad_norm": 1.9887064023906842, + "learning_rate": 8.289985480483864e-07, + "loss": 0.1657, + "step": 11024 + }, + { + "epoch": 0.8734402852049911, + "grad_norm": 1.9699979560114873, + "learning_rate": 8.279758837366103e-07, + "loss": 0.1978, + "step": 11025 + }, + { + "epoch": 0.8735195088136265, + "grad_norm": 1.2423101171407702, + "learning_rate": 8.269538233584884e-07, + "loss": 0.0916, + "step": 11026 + }, + { + "epoch": 0.8735987324222618, + "grad_norm": 1.6238746212862867, + "learning_rate": 8.259323669813202e-07, + "loss": 0.1076, + "step": 11027 + }, + { + "epoch": 0.8736779560308973, + "grad_norm": 1.232171295620495, + "learning_rate": 8.24911514672363e-07, + "loss": 0.1082, + "step": 11028 + }, + { + "epoch": 0.8737571796395326, + "grad_norm": 1.6016744884010354, + "learning_rate": 8.23891266498833e-07, + "loss": 0.1668, + "step": 11029 + }, + { + "epoch": 0.8738364032481679, + "grad_norm": 1.502720769662286, + "learning_rate": 8.228716225279121e-07, + "loss": 0.1299, + "step": 11030 + }, + { + "epoch": 0.8739156268568034, + "grad_norm": 1.4484411193736395, + "learning_rate": 8.218525828267377e-07, + "loss": 0.1459, + "step": 11031 + }, + { + "epoch": 0.8739948504654387, + "grad_norm": 1.748556094345839, + "learning_rate": 8.208341474624071e-07, + "loss": 0.1867, + "step": 11032 + }, + { + "epoch": 0.8740740740740741, + "grad_norm": 2.0911540876283157, + "learning_rate": 8.198163165019812e-07, + "loss": 0.2293, + "step": 11033 + }, + { + "epoch": 0.8741532976827094, + "grad_norm": 1.3533978724512903, + "learning_rate": 8.187990900124787e-07, + "loss": 0.1198, + "step": 11034 + }, + { + "epoch": 0.8742325212913448, + "grad_norm": 1.200006291736161, + "learning_rate": 8.177824680608781e-07, + "loss": 0.1085, + "step": 11035 + }, + { + "epoch": 0.8743117448999802, + "grad_norm": 1.617583492298631, + "learning_rate": 8.167664507141215e-07, + "loss": 0.1895, + "step": 11036 + }, + { + "epoch": 0.8743909685086155, + "grad_norm": 1.548224389199469, + "learning_rate": 8.157510380391065e-07, + "loss": 0.1409, + "step": 11037 + }, + { + "epoch": 0.874470192117251, + "grad_norm": 1.9670762647277398, + "learning_rate": 8.14736230102694e-07, + "loss": 0.1696, + "step": 11038 + }, + { + "epoch": 0.8745494157258863, + "grad_norm": 1.4834681256023374, + "learning_rate": 8.137220269717028e-07, + "loss": 0.1302, + "step": 11039 + }, + { + "epoch": 0.8746286393345217, + "grad_norm": 1.893756388731371, + "learning_rate": 8.127084287129161e-07, + "loss": 0.1842, + "step": 11040 + }, + { + "epoch": 0.874707862943157, + "grad_norm": 2.016710640036495, + "learning_rate": 8.116954353930728e-07, + "loss": 0.2187, + "step": 11041 + }, + { + "epoch": 0.8747870865517924, + "grad_norm": 1.4940586794469284, + "learning_rate": 8.106830470788729e-07, + "loss": 0.1309, + "step": 11042 + }, + { + "epoch": 0.8748663101604278, + "grad_norm": 1.6039098671434404, + "learning_rate": 8.096712638369797e-07, + "loss": 0.1453, + "step": 11043 + }, + { + "epoch": 0.8749455337690631, + "grad_norm": 1.1653177222697704, + "learning_rate": 8.086600857340121e-07, + "loss": 0.1157, + "step": 11044 + }, + { + "epoch": 0.8750247573776986, + "grad_norm": 1.3985778556104098, + "learning_rate": 8.076495128365502e-07, + "loss": 0.1258, + "step": 11045 + }, + { + "epoch": 0.8751039809863339, + "grad_norm": 1.1897375410971311, + "learning_rate": 8.066395452111387e-07, + "loss": 0.1324, + "step": 11046 + }, + { + "epoch": 0.8751832045949693, + "grad_norm": 1.329847737991157, + "learning_rate": 8.056301829242785e-07, + "loss": 0.1371, + "step": 11047 + }, + { + "epoch": 0.8752624282036047, + "grad_norm": 1.553353020837809, + "learning_rate": 8.046214260424279e-07, + "loss": 0.1613, + "step": 11048 + }, + { + "epoch": 0.87534165181224, + "grad_norm": 1.3751073872323294, + "learning_rate": 8.036132746320125e-07, + "loss": 0.1331, + "step": 11049 + }, + { + "epoch": 0.8754208754208754, + "grad_norm": 1.5327896495551423, + "learning_rate": 8.026057287594136e-07, + "loss": 0.158, + "step": 11050 + }, + { + "epoch": 0.8755000990295108, + "grad_norm": 1.5121267916789705, + "learning_rate": 8.015987884909692e-07, + "loss": 0.1677, + "step": 11051 + }, + { + "epoch": 0.8755793226381462, + "grad_norm": 1.4042188522204424, + "learning_rate": 8.005924538929877e-07, + "loss": 0.1324, + "step": 11052 + }, + { + "epoch": 0.8756585462467815, + "grad_norm": 1.328277647115414, + "learning_rate": 7.99586725031728e-07, + "loss": 0.1106, + "step": 11053 + }, + { + "epoch": 0.875737769855417, + "grad_norm": 1.8239494505271974, + "learning_rate": 7.985816019734127e-07, + "loss": 0.1903, + "step": 11054 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 1.3782876359957164, + "learning_rate": 7.975770847842234e-07, + "loss": 0.1229, + "step": 11055 + }, + { + "epoch": 0.8758962170726876, + "grad_norm": 1.8534888944301684, + "learning_rate": 7.965731735303051e-07, + "loss": 0.17, + "step": 11056 + }, + { + "epoch": 0.875975440681323, + "grad_norm": 1.2458916719109903, + "learning_rate": 7.955698682777601e-07, + "loss": 0.1014, + "step": 11057 + }, + { + "epoch": 0.8760546642899584, + "grad_norm": 1.1449972517787468, + "learning_rate": 7.945671690926471e-07, + "loss": 0.0885, + "step": 11058 + }, + { + "epoch": 0.8761338878985938, + "grad_norm": 1.6426687320259323, + "learning_rate": 7.935650760409952e-07, + "loss": 0.2175, + "step": 11059 + }, + { + "epoch": 0.8762131115072291, + "grad_norm": 1.5807434425690674, + "learning_rate": 7.925635891887839e-07, + "loss": 0.1533, + "step": 11060 + }, + { + "epoch": 0.8762923351158646, + "grad_norm": 1.5697158031136984, + "learning_rate": 7.915627086019561e-07, + "loss": 0.1276, + "step": 11061 + }, + { + "epoch": 0.8763715587244999, + "grad_norm": 1.8997840016244778, + "learning_rate": 7.905624343464169e-07, + "loss": 0.1517, + "step": 11062 + }, + { + "epoch": 0.8764507823331352, + "grad_norm": 1.3513186481944488, + "learning_rate": 7.895627664880278e-07, + "loss": 0.1231, + "step": 11063 + }, + { + "epoch": 0.8765300059417707, + "grad_norm": 1.1353838374362546, + "learning_rate": 7.88563705092612e-07, + "loss": 0.0876, + "step": 11064 + }, + { + "epoch": 0.876609229550406, + "grad_norm": 1.3803019825379057, + "learning_rate": 7.875652502259545e-07, + "loss": 0.1442, + "step": 11065 + }, + { + "epoch": 0.8766884531590414, + "grad_norm": 1.123428824245722, + "learning_rate": 7.865674019537983e-07, + "loss": 0.0914, + "step": 11066 + }, + { + "epoch": 0.8767676767676768, + "grad_norm": 1.5637562392873046, + "learning_rate": 7.855701603418442e-07, + "loss": 0.1787, + "step": 11067 + }, + { + "epoch": 0.8768469003763122, + "grad_norm": 1.3514842481495226, + "learning_rate": 7.845735254557608e-07, + "loss": 0.1138, + "step": 11068 + }, + { + "epoch": 0.8769261239849475, + "grad_norm": 1.4883758503748221, + "learning_rate": 7.835774973611687e-07, + "loss": 0.1294, + "step": 11069 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 1.8425176573630073, + "learning_rate": 7.825820761236514e-07, + "loss": 0.2123, + "step": 11070 + }, + { + "epoch": 0.8770845712022183, + "grad_norm": 1.5354551841523785, + "learning_rate": 7.815872618087506e-07, + "loss": 0.138, + "step": 11071 + }, + { + "epoch": 0.8771637948108536, + "grad_norm": 1.2943596722900061, + "learning_rate": 7.805930544819751e-07, + "loss": 0.1532, + "step": 11072 + }, + { + "epoch": 0.877243018419489, + "grad_norm": 1.5506195840527082, + "learning_rate": 7.795994542087859e-07, + "loss": 0.1807, + "step": 11073 + }, + { + "epoch": 0.8773222420281244, + "grad_norm": 1.456548003620456, + "learning_rate": 7.786064610546051e-07, + "loss": 0.1704, + "step": 11074 + }, + { + "epoch": 0.8774014656367598, + "grad_norm": 1.8466227504958324, + "learning_rate": 7.776140750848205e-07, + "loss": 0.1915, + "step": 11075 + }, + { + "epoch": 0.8774806892453951, + "grad_norm": 1.5899723841286948, + "learning_rate": 7.766222963647729e-07, + "loss": 0.1586, + "step": 11076 + }, + { + "epoch": 0.8775599128540305, + "grad_norm": 1.5812415549782357, + "learning_rate": 7.756311249597659e-07, + "loss": 0.1289, + "step": 11077 + }, + { + "epoch": 0.8776391364626659, + "grad_norm": 1.1045692898606165, + "learning_rate": 7.746405609350661e-07, + "loss": 0.0926, + "step": 11078 + }, + { + "epoch": 0.8777183600713012, + "grad_norm": 1.7299878193532292, + "learning_rate": 7.736506043558956e-07, + "loss": 0.2077, + "step": 11079 + }, + { + "epoch": 0.8777975836799367, + "grad_norm": 1.675653079315312, + "learning_rate": 7.726612552874368e-07, + "loss": 0.1581, + "step": 11080 + }, + { + "epoch": 0.877876807288572, + "grad_norm": 1.3068524458677393, + "learning_rate": 7.716725137948366e-07, + "loss": 0.1147, + "step": 11081 + }, + { + "epoch": 0.8779560308972074, + "grad_norm": 1.4682485026277445, + "learning_rate": 7.706843799431985e-07, + "loss": 0.1436, + "step": 11082 + }, + { + "epoch": 0.8780352545058427, + "grad_norm": 1.2990961209575327, + "learning_rate": 7.696968537975847e-07, + "loss": 0.1382, + "step": 11083 + }, + { + "epoch": 0.8781144781144781, + "grad_norm": 1.7159384905615305, + "learning_rate": 7.687099354230177e-07, + "loss": 0.1943, + "step": 11084 + }, + { + "epoch": 0.8781937017231135, + "grad_norm": 1.5114893507133043, + "learning_rate": 7.677236248844855e-07, + "loss": 0.1439, + "step": 11085 + }, + { + "epoch": 0.8782729253317488, + "grad_norm": 1.440527015874435, + "learning_rate": 7.667379222469295e-07, + "loss": 0.1762, + "step": 11086 + }, + { + "epoch": 0.8783521489403843, + "grad_norm": 1.3369850074725866, + "learning_rate": 7.657528275752524e-07, + "loss": 0.0848, + "step": 11087 + }, + { + "epoch": 0.8784313725490196, + "grad_norm": 1.7124503510272913, + "learning_rate": 7.647683409343198e-07, + "loss": 0.1582, + "step": 11088 + }, + { + "epoch": 0.878510596157655, + "grad_norm": 1.4861977833770101, + "learning_rate": 7.637844623889557e-07, + "loss": 0.1229, + "step": 11089 + }, + { + "epoch": 0.8785898197662904, + "grad_norm": 1.3899642530828875, + "learning_rate": 7.628011920039414e-07, + "loss": 0.0837, + "step": 11090 + }, + { + "epoch": 0.8786690433749257, + "grad_norm": 1.4446096629750547, + "learning_rate": 7.618185298440239e-07, + "loss": 0.1438, + "step": 11091 + }, + { + "epoch": 0.8787482669835611, + "grad_norm": 1.1484021972326586, + "learning_rate": 7.608364759739039e-07, + "loss": 0.1127, + "step": 11092 + }, + { + "epoch": 0.8788274905921964, + "grad_norm": 1.2636044929676284, + "learning_rate": 7.598550304582453e-07, + "loss": 0.0945, + "step": 11093 + }, + { + "epoch": 0.8789067142008319, + "grad_norm": 1.6738270481531605, + "learning_rate": 7.588741933616728e-07, + "loss": 0.1621, + "step": 11094 + }, + { + "epoch": 0.8789859378094672, + "grad_norm": 1.4038465432917206, + "learning_rate": 7.578939647487705e-07, + "loss": 0.1474, + "step": 11095 + }, + { + "epoch": 0.8790651614181026, + "grad_norm": 2.3264592524034136, + "learning_rate": 7.569143446840776e-07, + "loss": 0.131, + "step": 11096 + }, + { + "epoch": 0.879144385026738, + "grad_norm": 1.342160553010363, + "learning_rate": 7.559353332321029e-07, + "loss": 0.0931, + "step": 11097 + }, + { + "epoch": 0.8792236086353733, + "grad_norm": 1.4823900789567563, + "learning_rate": 7.549569304573057e-07, + "loss": 0.222, + "step": 11098 + }, + { + "epoch": 0.8793028322440087, + "grad_norm": 2.0199311663698385, + "learning_rate": 7.539791364241111e-07, + "loss": 0.1746, + "step": 11099 + }, + { + "epoch": 0.8793820558526441, + "grad_norm": 1.604095758236551, + "learning_rate": 7.530019511969e-07, + "loss": 0.1492, + "step": 11100 + }, + { + "epoch": 0.8794612794612795, + "grad_norm": 1.3453499361938412, + "learning_rate": 7.520253748400175e-07, + "loss": 0.1199, + "step": 11101 + }, + { + "epoch": 0.8795405030699148, + "grad_norm": 1.5124738624809815, + "learning_rate": 7.510494074177666e-07, + "loss": 0.1048, + "step": 11102 + }, + { + "epoch": 0.8796197266785503, + "grad_norm": 1.6276717268046157, + "learning_rate": 7.500740489944092e-07, + "loss": 0.1262, + "step": 11103 + }, + { + "epoch": 0.8796989502871856, + "grad_norm": 1.2660817562754592, + "learning_rate": 7.490992996341662e-07, + "loss": 0.1038, + "step": 11104 + }, + { + "epoch": 0.8797781738958209, + "grad_norm": 1.4808194266546642, + "learning_rate": 7.481251594012218e-07, + "loss": 0.1525, + "step": 11105 + }, + { + "epoch": 0.8798573975044564, + "grad_norm": 1.1035835785590915, + "learning_rate": 7.471516283597191e-07, + "loss": 0.0901, + "step": 11106 + }, + { + "epoch": 0.8799366211130917, + "grad_norm": 1.25726160744421, + "learning_rate": 7.461787065737602e-07, + "loss": 0.1165, + "step": 11107 + }, + { + "epoch": 0.8800158447217271, + "grad_norm": 1.3486256848014806, + "learning_rate": 7.452063941074073e-07, + "loss": 0.1158, + "step": 11108 + }, + { + "epoch": 0.8800950683303624, + "grad_norm": 1.2517075362925056, + "learning_rate": 7.442346910246801e-07, + "loss": 0.136, + "step": 11109 + }, + { + "epoch": 0.8801742919389978, + "grad_norm": 1.2111080557371396, + "learning_rate": 7.432635973895652e-07, + "loss": 0.0863, + "step": 11110 + }, + { + "epoch": 0.8802535155476332, + "grad_norm": 1.7483078900025897, + "learning_rate": 7.422931132660005e-07, + "loss": 0.1899, + "step": 11111 + }, + { + "epoch": 0.8803327391562685, + "grad_norm": 1.5807034312357622, + "learning_rate": 7.413232387178882e-07, + "loss": 0.118, + "step": 11112 + }, + { + "epoch": 0.880411962764904, + "grad_norm": 1.9485706575073423, + "learning_rate": 7.403539738090914e-07, + "loss": 0.1106, + "step": 11113 + }, + { + "epoch": 0.8804911863735393, + "grad_norm": 1.6873901742328379, + "learning_rate": 7.393853186034316e-07, + "loss": 0.1899, + "step": 11114 + }, + { + "epoch": 0.8805704099821747, + "grad_norm": 1.9597737852281722, + "learning_rate": 7.384172731646877e-07, + "loss": 0.158, + "step": 11115 + }, + { + "epoch": 0.88064963359081, + "grad_norm": 1.8619822635570302, + "learning_rate": 7.374498375566042e-07, + "loss": 0.1961, + "step": 11116 + }, + { + "epoch": 0.8807288571994454, + "grad_norm": 1.5695146854586293, + "learning_rate": 7.364830118428801e-07, + "loss": 0.2133, + "step": 11117 + }, + { + "epoch": 0.8808080808080808, + "grad_norm": 1.478800109186462, + "learning_rate": 7.355167960871745e-07, + "loss": 0.1624, + "step": 11118 + }, + { + "epoch": 0.8808873044167161, + "grad_norm": 1.6214009776786245, + "learning_rate": 7.345511903531122e-07, + "loss": 0.1399, + "step": 11119 + }, + { + "epoch": 0.8809665280253516, + "grad_norm": 1.528845052740751, + "learning_rate": 7.335861947042711e-07, + "loss": 0.1222, + "step": 11120 + }, + { + "epoch": 0.8810457516339869, + "grad_norm": 1.4993479716755802, + "learning_rate": 7.326218092041903e-07, + "loss": 0.133, + "step": 11121 + }, + { + "epoch": 0.8811249752426223, + "grad_norm": 1.6820809362683793, + "learning_rate": 7.316580339163736e-07, + "loss": 0.1335, + "step": 11122 + }, + { + "epoch": 0.8812041988512577, + "grad_norm": 1.2079859122053844, + "learning_rate": 7.306948689042792e-07, + "loss": 0.0947, + "step": 11123 + }, + { + "epoch": 0.881283422459893, + "grad_norm": 1.5348029362662943, + "learning_rate": 7.297323142313262e-07, + "loss": 0.1198, + "step": 11124 + }, + { + "epoch": 0.8813626460685284, + "grad_norm": 1.49480804702477, + "learning_rate": 7.287703699608928e-07, + "loss": 0.1517, + "step": 11125 + }, + { + "epoch": 0.8814418696771638, + "grad_norm": 1.5221241334792392, + "learning_rate": 7.278090361563228e-07, + "loss": 0.1443, + "step": 11126 + }, + { + "epoch": 0.8815210932857992, + "grad_norm": 1.940163117579796, + "learning_rate": 7.268483128809122e-07, + "loss": 0.205, + "step": 11127 + }, + { + "epoch": 0.8816003168944345, + "grad_norm": 1.5362766393192315, + "learning_rate": 7.258882001979184e-07, + "loss": 0.1637, + "step": 11128 + }, + { + "epoch": 0.88167954050307, + "grad_norm": 1.5644007245495675, + "learning_rate": 7.24928698170565e-07, + "loss": 0.1374, + "step": 11129 + }, + { + "epoch": 0.8817587641117053, + "grad_norm": 2.004489483898855, + "learning_rate": 7.239698068620272e-07, + "loss": 0.2701, + "step": 11130 + }, + { + "epoch": 0.8818379877203406, + "grad_norm": 1.3244070864876418, + "learning_rate": 7.230115263354431e-07, + "loss": 0.1344, + "step": 11131 + }, + { + "epoch": 0.881917211328976, + "grad_norm": 1.679983697823597, + "learning_rate": 7.220538566539137e-07, + "loss": 0.1734, + "step": 11132 + }, + { + "epoch": 0.8819964349376114, + "grad_norm": 1.786821865799786, + "learning_rate": 7.21096797880495e-07, + "loss": 0.1768, + "step": 11133 + }, + { + "epoch": 0.8820756585462468, + "grad_norm": 1.8551648073826383, + "learning_rate": 7.201403500782034e-07, + "loss": 0.2055, + "step": 11134 + }, + { + "epoch": 0.8821548821548821, + "grad_norm": 1.7889618735972586, + "learning_rate": 7.191845133100195e-07, + "loss": 0.1697, + "step": 11135 + }, + { + "epoch": 0.8822341057635176, + "grad_norm": 1.397606079260697, + "learning_rate": 7.182292876388785e-07, + "loss": 0.1418, + "step": 11136 + }, + { + "epoch": 0.8823133293721529, + "grad_norm": 1.6054860990465039, + "learning_rate": 7.17274673127677e-07, + "loss": 0.135, + "step": 11137 + }, + { + "epoch": 0.8823925529807882, + "grad_norm": 1.7195520923106866, + "learning_rate": 7.163206698392744e-07, + "loss": 0.1376, + "step": 11138 + }, + { + "epoch": 0.8824717765894237, + "grad_norm": 1.2884564375815863, + "learning_rate": 7.153672778364851e-07, + "loss": 0.1284, + "step": 11139 + }, + { + "epoch": 0.882551000198059, + "grad_norm": 1.5600589826964066, + "learning_rate": 7.144144971820855e-07, + "loss": 0.1781, + "step": 11140 + }, + { + "epoch": 0.8826302238066944, + "grad_norm": 1.7889125989457015, + "learning_rate": 7.134623279388098e-07, + "loss": 0.1577, + "step": 11141 + }, + { + "epoch": 0.8827094474153298, + "grad_norm": 1.7736935452385398, + "learning_rate": 7.12510770169359e-07, + "loss": 0.1369, + "step": 11142 + }, + { + "epoch": 0.8827886710239652, + "grad_norm": 1.6159978180694412, + "learning_rate": 7.115598239363842e-07, + "loss": 0.1781, + "step": 11143 + }, + { + "epoch": 0.8828678946326005, + "grad_norm": 1.8947987201075354, + "learning_rate": 7.106094893025006e-07, + "loss": 0.2934, + "step": 11144 + }, + { + "epoch": 0.8829471182412358, + "grad_norm": 1.6120085550912424, + "learning_rate": 7.096597663302862e-07, + "loss": 0.1452, + "step": 11145 + }, + { + "epoch": 0.8830263418498713, + "grad_norm": 1.5986575421728753, + "learning_rate": 7.087106550822731e-07, + "loss": 0.1601, + "step": 11146 + }, + { + "epoch": 0.8831055654585066, + "grad_norm": 1.0178386973477769, + "learning_rate": 7.077621556209557e-07, + "loss": 0.0728, + "step": 11147 + }, + { + "epoch": 0.883184789067142, + "grad_norm": 1.4218344936942204, + "learning_rate": 7.068142680087909e-07, + "loss": 0.1493, + "step": 11148 + }, + { + "epoch": 0.8832640126757774, + "grad_norm": 1.4753371306919423, + "learning_rate": 7.058669923081896e-07, + "loss": 0.1556, + "step": 11149 + }, + { + "epoch": 0.8833432362844128, + "grad_norm": 1.5490919797379965, + "learning_rate": 7.049203285815253e-07, + "loss": 0.1173, + "step": 11150 + }, + { + "epoch": 0.8834224598930481, + "grad_norm": 1.5942410600236587, + "learning_rate": 7.03974276891134e-07, + "loss": 0.1265, + "step": 11151 + }, + { + "epoch": 0.8835016835016835, + "grad_norm": 1.4656320898371784, + "learning_rate": 7.030288372993066e-07, + "loss": 0.1468, + "step": 11152 + }, + { + "epoch": 0.8835809071103189, + "grad_norm": 1.4680545887161833, + "learning_rate": 7.020840098682968e-07, + "loss": 0.1234, + "step": 11153 + }, + { + "epoch": 0.8836601307189542, + "grad_norm": 1.389056483511112, + "learning_rate": 7.011397946603138e-07, + "loss": 0.1237, + "step": 11154 + }, + { + "epoch": 0.8837393543275897, + "grad_norm": 1.7126887232272259, + "learning_rate": 7.001961917375344e-07, + "loss": 0.198, + "step": 11155 + }, + { + "epoch": 0.883818577936225, + "grad_norm": 1.1415614785280985, + "learning_rate": 6.992532011620878e-07, + "loss": 0.1159, + "step": 11156 + }, + { + "epoch": 0.8838978015448604, + "grad_norm": 1.4461974354079816, + "learning_rate": 6.983108229960633e-07, + "loss": 0.1034, + "step": 11157 + }, + { + "epoch": 0.8839770251534957, + "grad_norm": 2.1506525876897937, + "learning_rate": 6.973690573015168e-07, + "loss": 0.2679, + "step": 11158 + }, + { + "epoch": 0.8840562487621311, + "grad_norm": 1.6475841358886705, + "learning_rate": 6.964279041404553e-07, + "loss": 0.162, + "step": 11159 + }, + { + "epoch": 0.8841354723707665, + "grad_norm": 1.4850362469694653, + "learning_rate": 6.954873635748493e-07, + "loss": 0.1441, + "step": 11160 + }, + { + "epoch": 0.8842146959794018, + "grad_norm": 1.606768367852246, + "learning_rate": 6.945474356666326e-07, + "loss": 0.195, + "step": 11161 + }, + { + "epoch": 0.8842939195880373, + "grad_norm": 1.8304760132245828, + "learning_rate": 6.936081204776913e-07, + "loss": 0.1573, + "step": 11162 + }, + { + "epoch": 0.8843731431966726, + "grad_norm": 1.4679915803362746, + "learning_rate": 6.926694180698734e-07, + "loss": 0.1745, + "step": 11163 + }, + { + "epoch": 0.884452366805308, + "grad_norm": 1.717960598890273, + "learning_rate": 6.917313285049931e-07, + "loss": 0.2052, + "step": 11164 + }, + { + "epoch": 0.8845315904139434, + "grad_norm": 1.2154139342917198, + "learning_rate": 6.907938518448154e-07, + "loss": 0.0808, + "step": 11165 + }, + { + "epoch": 0.8846108140225787, + "grad_norm": 1.7571815308144139, + "learning_rate": 6.898569881510686e-07, + "loss": 0.1876, + "step": 11166 + }, + { + "epoch": 0.8846900376312141, + "grad_norm": 1.3074702335973698, + "learning_rate": 6.889207374854434e-07, + "loss": 0.0875, + "step": 11167 + }, + { + "epoch": 0.8847692612398494, + "grad_norm": 1.4915280470452914, + "learning_rate": 6.879850999095849e-07, + "loss": 0.1169, + "step": 11168 + }, + { + "epoch": 0.8848484848484849, + "grad_norm": 1.5858128281599722, + "learning_rate": 6.870500754851017e-07, + "loss": 0.1644, + "step": 11169 + }, + { + "epoch": 0.8849277084571202, + "grad_norm": 1.6199546298355267, + "learning_rate": 6.861156642735578e-07, + "loss": 0.1831, + "step": 11170 + }, + { + "epoch": 0.8850069320657556, + "grad_norm": 1.815086789074067, + "learning_rate": 6.851818663364839e-07, + "loss": 0.1278, + "step": 11171 + }, + { + "epoch": 0.885086155674391, + "grad_norm": 1.489717338249987, + "learning_rate": 6.842486817353633e-07, + "loss": 0.1287, + "step": 11172 + }, + { + "epoch": 0.8851653792830263, + "grad_norm": 1.3484807114867106, + "learning_rate": 6.833161105316421e-07, + "loss": 0.099, + "step": 11173 + }, + { + "epoch": 0.8852446028916617, + "grad_norm": 1.380817401369569, + "learning_rate": 6.823841527867259e-07, + "loss": 0.0877, + "step": 11174 + }, + { + "epoch": 0.8853238265002971, + "grad_norm": 1.3627900815025338, + "learning_rate": 6.814528085619809e-07, + "loss": 0.1395, + "step": 11175 + }, + { + "epoch": 0.8854030501089325, + "grad_norm": 1.7115600357121634, + "learning_rate": 6.805220779187293e-07, + "loss": 0.2051, + "step": 11176 + }, + { + "epoch": 0.8854822737175678, + "grad_norm": 1.908196926567162, + "learning_rate": 6.795919609182566e-07, + "loss": 0.1698, + "step": 11177 + }, + { + "epoch": 0.8855614973262033, + "grad_norm": 1.326031720268804, + "learning_rate": 6.78662457621807e-07, + "loss": 0.112, + "step": 11178 + }, + { + "epoch": 0.8856407209348386, + "grad_norm": 1.7568740812652275, + "learning_rate": 6.777335680905817e-07, + "loss": 0.1754, + "step": 11179 + }, + { + "epoch": 0.8857199445434739, + "grad_norm": 1.7513806225288808, + "learning_rate": 6.768052923857482e-07, + "loss": 0.2034, + "step": 11180 + }, + { + "epoch": 0.8857991681521094, + "grad_norm": 1.3711975828429002, + "learning_rate": 6.758776305684245e-07, + "loss": 0.1335, + "step": 11181 + }, + { + "epoch": 0.8858783917607447, + "grad_norm": 1.4007591843055125, + "learning_rate": 6.749505826996927e-07, + "loss": 0.117, + "step": 11182 + }, + { + "epoch": 0.8859576153693801, + "grad_norm": 1.5552915496574455, + "learning_rate": 6.740241488405963e-07, + "loss": 0.1246, + "step": 11183 + }, + { + "epoch": 0.8860368389780154, + "grad_norm": 1.7304594683069021, + "learning_rate": 6.730983290521365e-07, + "loss": 0.1918, + "step": 11184 + }, + { + "epoch": 0.8861160625866509, + "grad_norm": 1.4540912158424129, + "learning_rate": 6.721731233952722e-07, + "loss": 0.1399, + "step": 11185 + }, + { + "epoch": 0.8861952861952862, + "grad_norm": 1.5808172287444398, + "learning_rate": 6.712485319309258e-07, + "loss": 0.1468, + "step": 11186 + }, + { + "epoch": 0.8862745098039215, + "grad_norm": 1.7907370149774915, + "learning_rate": 6.703245547199777e-07, + "loss": 0.1875, + "step": 11187 + }, + { + "epoch": 0.886353733412557, + "grad_norm": 1.46345190390323, + "learning_rate": 6.694011918232635e-07, + "loss": 0.1342, + "step": 11188 + }, + { + "epoch": 0.8864329570211923, + "grad_norm": 2.262595414517965, + "learning_rate": 6.684784433015867e-07, + "loss": 0.2245, + "step": 11189 + }, + { + "epoch": 0.8865121806298277, + "grad_norm": 1.932419712110693, + "learning_rate": 6.675563092157044e-07, + "loss": 0.2003, + "step": 11190 + }, + { + "epoch": 0.886591404238463, + "grad_norm": 1.6139038501004332, + "learning_rate": 6.666347896263326e-07, + "loss": 0.1703, + "step": 11191 + }, + { + "epoch": 0.8866706278470984, + "grad_norm": 1.6083030141776613, + "learning_rate": 6.657138845941524e-07, + "loss": 0.1518, + "step": 11192 + }, + { + "epoch": 0.8867498514557338, + "grad_norm": 1.5460532694260236, + "learning_rate": 6.64793594179799e-07, + "loss": 0.13, + "step": 11193 + }, + { + "epoch": 0.8868290750643691, + "grad_norm": 1.4099721470608344, + "learning_rate": 6.638739184438681e-07, + "loss": 0.1645, + "step": 11194 + }, + { + "epoch": 0.8869082986730046, + "grad_norm": 1.7513948458341688, + "learning_rate": 6.629548574469169e-07, + "loss": 0.1453, + "step": 11195 + }, + { + "epoch": 0.8869875222816399, + "grad_norm": 1.561618827569234, + "learning_rate": 6.620364112494627e-07, + "loss": 0.2136, + "step": 11196 + }, + { + "epoch": 0.8870667458902753, + "grad_norm": 1.5661793984465628, + "learning_rate": 6.611185799119791e-07, + "loss": 0.1937, + "step": 11197 + }, + { + "epoch": 0.8871459694989107, + "grad_norm": 1.4580213914665436, + "learning_rate": 6.602013634949001e-07, + "loss": 0.1156, + "step": 11198 + }, + { + "epoch": 0.887225193107546, + "grad_norm": 1.329351429338928, + "learning_rate": 6.592847620586217e-07, + "loss": 0.1309, + "step": 11199 + }, + { + "epoch": 0.8873044167161814, + "grad_norm": 1.6875267310544861, + "learning_rate": 6.583687756634982e-07, + "loss": 0.1955, + "step": 11200 + }, + { + "epoch": 0.8873836403248168, + "grad_norm": 1.5599368515170422, + "learning_rate": 6.574534043698399e-07, + "loss": 0.122, + "step": 11201 + }, + { + "epoch": 0.8874628639334522, + "grad_norm": 1.3859195779507871, + "learning_rate": 6.565386482379221e-07, + "loss": 0.1327, + "step": 11202 + }, + { + "epoch": 0.8875420875420875, + "grad_norm": 1.5138946910295334, + "learning_rate": 6.556245073279777e-07, + "loss": 0.163, + "step": 11203 + }, + { + "epoch": 0.887621311150723, + "grad_norm": 1.3326850825856658, + "learning_rate": 6.547109817001951e-07, + "loss": 0.1668, + "step": 11204 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 1.6015704736011787, + "learning_rate": 6.537980714147285e-07, + "loss": 0.1388, + "step": 11205 + }, + { + "epoch": 0.8877797583679936, + "grad_norm": 1.8286157952647148, + "learning_rate": 6.528857765316887e-07, + "loss": 0.1454, + "step": 11206 + }, + { + "epoch": 0.887858981976629, + "grad_norm": 1.522324509793647, + "learning_rate": 6.519740971111432e-07, + "loss": 0.1599, + "step": 11207 + }, + { + "epoch": 0.8879382055852644, + "grad_norm": 1.525973904566212, + "learning_rate": 6.510630332131262e-07, + "loss": 0.1706, + "step": 11208 + }, + { + "epoch": 0.8880174291938998, + "grad_norm": 1.4522070152451843, + "learning_rate": 6.501525848976231e-07, + "loss": 0.1331, + "step": 11209 + }, + { + "epoch": 0.8880966528025351, + "grad_norm": 1.495099835222104, + "learning_rate": 6.492427522245836e-07, + "loss": 0.1555, + "step": 11210 + }, + { + "epoch": 0.8881758764111706, + "grad_norm": 1.2192352004704508, + "learning_rate": 6.483335352539144e-07, + "loss": 0.0837, + "step": 11211 + }, + { + "epoch": 0.8882551000198059, + "grad_norm": 1.7804550544788196, + "learning_rate": 6.474249340454874e-07, + "loss": 0.1429, + "step": 11212 + }, + { + "epoch": 0.8883343236284412, + "grad_norm": 1.5833327282776872, + "learning_rate": 6.46516948659125e-07, + "loss": 0.1629, + "step": 11213 + }, + { + "epoch": 0.8884135472370767, + "grad_norm": 1.5123457353031424, + "learning_rate": 6.456095791546147e-07, + "loss": 0.1292, + "step": 11214 + }, + { + "epoch": 0.888492770845712, + "grad_norm": 1.4086814871923838, + "learning_rate": 6.447028255917054e-07, + "loss": 0.1151, + "step": 11215 + }, + { + "epoch": 0.8885719944543474, + "grad_norm": 1.5087089422295907, + "learning_rate": 6.437966880300995e-07, + "loss": 0.1115, + "step": 11216 + }, + { + "epoch": 0.8886512180629828, + "grad_norm": 1.2840662545304073, + "learning_rate": 6.428911665294601e-07, + "loss": 0.0922, + "step": 11217 + }, + { + "epoch": 0.8887304416716182, + "grad_norm": 1.3695665174865892, + "learning_rate": 6.419862611494165e-07, + "loss": 0.1217, + "step": 11218 + }, + { + "epoch": 0.8888096652802535, + "grad_norm": 1.6259332497392347, + "learning_rate": 6.410819719495498e-07, + "loss": 0.1554, + "step": 11219 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.6804022880656255, + "learning_rate": 6.401782989894012e-07, + "loss": 0.188, + "step": 11220 + }, + { + "epoch": 0.8889681124975243, + "grad_norm": 1.7283123861924126, + "learning_rate": 6.392752423284765e-07, + "loss": 0.1592, + "step": 11221 + }, + { + "epoch": 0.8890473361061596, + "grad_norm": 1.7761924947340235, + "learning_rate": 6.383728020262359e-07, + "loss": 0.1403, + "step": 11222 + }, + { + "epoch": 0.889126559714795, + "grad_norm": 1.3749939235057573, + "learning_rate": 6.374709781420995e-07, + "loss": 0.1337, + "step": 11223 + }, + { + "epoch": 0.8892057833234304, + "grad_norm": 1.5242141328141736, + "learning_rate": 6.365697707354512e-07, + "loss": 0.1444, + "step": 11224 + }, + { + "epoch": 0.8892850069320658, + "grad_norm": 2.4773199167645084, + "learning_rate": 6.3566917986563e-07, + "loss": 0.1877, + "step": 11225 + }, + { + "epoch": 0.8893642305407011, + "grad_norm": 1.1579857686049366, + "learning_rate": 6.347692055919353e-07, + "loss": 0.0776, + "step": 11226 + }, + { + "epoch": 0.8894434541493365, + "grad_norm": 1.636475076877616, + "learning_rate": 6.338698479736227e-07, + "loss": 0.1753, + "step": 11227 + }, + { + "epoch": 0.8895226777579719, + "grad_norm": 1.2199580781257486, + "learning_rate": 6.329711070699162e-07, + "loss": 0.1069, + "step": 11228 + }, + { + "epoch": 0.8896019013666072, + "grad_norm": 1.6847169976544392, + "learning_rate": 6.320729829399918e-07, + "loss": 0.2067, + "step": 11229 + }, + { + "epoch": 0.8896811249752427, + "grad_norm": 2.10032103768669, + "learning_rate": 6.311754756429833e-07, + "loss": 0.1863, + "step": 11230 + }, + { + "epoch": 0.889760348583878, + "grad_norm": 1.8610953453560823, + "learning_rate": 6.302785852379911e-07, + "loss": 0.254, + "step": 11231 + }, + { + "epoch": 0.8898395721925134, + "grad_norm": 1.8218724451059767, + "learning_rate": 6.293823117840703e-07, + "loss": 0.1762, + "step": 11232 + }, + { + "epoch": 0.8899187958011487, + "grad_norm": 1.6441741701038273, + "learning_rate": 6.284866553402347e-07, + "loss": 0.2197, + "step": 11233 + }, + { + "epoch": 0.8899980194097841, + "grad_norm": 1.5854646926362082, + "learning_rate": 6.275916159654616e-07, + "loss": 0.121, + "step": 11234 + }, + { + "epoch": 0.8900772430184195, + "grad_norm": 1.7377751986544356, + "learning_rate": 6.266971937186827e-07, + "loss": 0.1548, + "step": 11235 + }, + { + "epoch": 0.8901564666270548, + "grad_norm": 1.4061315713175284, + "learning_rate": 6.258033886587911e-07, + "loss": 0.1495, + "step": 11236 + }, + { + "epoch": 0.8902356902356903, + "grad_norm": 1.4998357311709074, + "learning_rate": 6.249102008446418e-07, + "loss": 0.1431, + "step": 11237 + }, + { + "epoch": 0.8903149138443256, + "grad_norm": 1.6175872568826988, + "learning_rate": 6.240176303350453e-07, + "loss": 0.1837, + "step": 11238 + }, + { + "epoch": 0.890394137452961, + "grad_norm": 1.246091915795264, + "learning_rate": 6.231256771887739e-07, + "loss": 0.0959, + "step": 11239 + }, + { + "epoch": 0.8904733610615964, + "grad_norm": 1.6529702667844364, + "learning_rate": 6.222343414645571e-07, + "loss": 0.1791, + "step": 11240 + }, + { + "epoch": 0.8905525846702317, + "grad_norm": 1.5090258437887851, + "learning_rate": 6.213436232210868e-07, + "loss": 0.1907, + "step": 11241 + }, + { + "epoch": 0.8906318082788671, + "grad_norm": 2.095791854337218, + "learning_rate": 6.204535225170116e-07, + "loss": 0.1741, + "step": 11242 + }, + { + "epoch": 0.8907110318875024, + "grad_norm": 1.876696982929106, + "learning_rate": 6.195640394109393e-07, + "loss": 0.1649, + "step": 11243 + }, + { + "epoch": 0.8907902554961379, + "grad_norm": 1.4199290772900632, + "learning_rate": 6.186751739614405e-07, + "loss": 0.1433, + "step": 11244 + }, + { + "epoch": 0.8908694791047732, + "grad_norm": 1.7431171687864346, + "learning_rate": 6.177869262270419e-07, + "loss": 0.1284, + "step": 11245 + }, + { + "epoch": 0.8909487027134086, + "grad_norm": 2.195327519966475, + "learning_rate": 6.168992962662279e-07, + "loss": 0.1708, + "step": 11246 + }, + { + "epoch": 0.891027926322044, + "grad_norm": 1.7763622960297851, + "learning_rate": 6.160122841374482e-07, + "loss": 0.2129, + "step": 11247 + }, + { + "epoch": 0.8911071499306793, + "grad_norm": 1.9501774640286642, + "learning_rate": 6.151258898991064e-07, + "loss": 0.2469, + "step": 11248 + }, + { + "epoch": 0.8911863735393147, + "grad_norm": 1.851749359678267, + "learning_rate": 6.142401136095666e-07, + "loss": 0.1727, + "step": 11249 + }, + { + "epoch": 0.8912655971479501, + "grad_norm": 1.0887285254803596, + "learning_rate": 6.133549553271556e-07, + "loss": 0.1168, + "step": 11250 + }, + { + "epoch": 0.8913448207565855, + "grad_norm": 1.4013356163225112, + "learning_rate": 6.124704151101546e-07, + "loss": 0.0999, + "step": 11251 + }, + { + "epoch": 0.8914240443652208, + "grad_norm": 1.694376069402616, + "learning_rate": 6.115864930168058e-07, + "loss": 0.2033, + "step": 11252 + }, + { + "epoch": 0.8915032679738563, + "grad_norm": 1.8458441488883788, + "learning_rate": 6.107031891053139e-07, + "loss": 0.1726, + "step": 11253 + }, + { + "epoch": 0.8915824915824916, + "grad_norm": 1.159657815093472, + "learning_rate": 6.098205034338378e-07, + "loss": 0.0735, + "step": 11254 + }, + { + "epoch": 0.8916617151911269, + "grad_norm": 1.4033220964352697, + "learning_rate": 6.089384360605e-07, + "loss": 0.1828, + "step": 11255 + }, + { + "epoch": 0.8917409387997624, + "grad_norm": 1.7516159313909134, + "learning_rate": 6.080569870433773e-07, + "loss": 0.1345, + "step": 11256 + }, + { + "epoch": 0.8918201624083977, + "grad_norm": 1.6714637132205032, + "learning_rate": 6.071761564405121e-07, + "loss": 0.1865, + "step": 11257 + }, + { + "epoch": 0.8918993860170331, + "grad_norm": 1.7742440519366174, + "learning_rate": 6.062959443099014e-07, + "loss": 0.205, + "step": 11258 + }, + { + "epoch": 0.8919786096256684, + "grad_norm": 2.0297418055836185, + "learning_rate": 6.054163507095035e-07, + "loss": 0.2431, + "step": 11259 + }, + { + "epoch": 0.8920578332343039, + "grad_norm": 1.3322366360320181, + "learning_rate": 6.04537375697235e-07, + "loss": 0.1114, + "step": 11260 + }, + { + "epoch": 0.8921370568429392, + "grad_norm": 1.577470855731298, + "learning_rate": 6.036590193309711e-07, + "loss": 0.1358, + "step": 11261 + }, + { + "epoch": 0.8922162804515745, + "grad_norm": 1.4110589792298522, + "learning_rate": 6.027812816685497e-07, + "loss": 0.1431, + "step": 11262 + }, + { + "epoch": 0.89229550406021, + "grad_norm": 1.5870613425665359, + "learning_rate": 6.019041627677635e-07, + "loss": 0.152, + "step": 11263 + }, + { + "epoch": 0.8923747276688453, + "grad_norm": 2.017644087815337, + "learning_rate": 6.010276626863687e-07, + "loss": 0.2127, + "step": 11264 + }, + { + "epoch": 0.8924539512774807, + "grad_norm": 1.9355662399539144, + "learning_rate": 6.001517814820757e-07, + "loss": 0.1656, + "step": 11265 + }, + { + "epoch": 0.892533174886116, + "grad_norm": 1.7887324927696548, + "learning_rate": 5.992765192125594e-07, + "loss": 0.1672, + "step": 11266 + }, + { + "epoch": 0.8926123984947515, + "grad_norm": 1.8308073852761952, + "learning_rate": 5.984018759354515e-07, + "loss": 0.1837, + "step": 11267 + }, + { + "epoch": 0.8926916221033868, + "grad_norm": 1.5167061556868657, + "learning_rate": 5.975278517083405e-07, + "loss": 0.1456, + "step": 11268 + }, + { + "epoch": 0.8927708457120221, + "grad_norm": 1.3132974669812354, + "learning_rate": 5.966544465887803e-07, + "loss": 0.1294, + "step": 11269 + }, + { + "epoch": 0.8928500693206576, + "grad_norm": 1.2186063300854928, + "learning_rate": 5.957816606342792e-07, + "loss": 0.1101, + "step": 11270 + }, + { + "epoch": 0.8929292929292929, + "grad_norm": 1.846243225219194, + "learning_rate": 5.949094939023037e-07, + "loss": 0.1587, + "step": 11271 + }, + { + "epoch": 0.8930085165379283, + "grad_norm": 1.5181098948173455, + "learning_rate": 5.940379464502854e-07, + "loss": 0.0929, + "step": 11272 + }, + { + "epoch": 0.8930877401465637, + "grad_norm": 1.8059894076668794, + "learning_rate": 5.931670183356097e-07, + "loss": 0.2006, + "step": 11273 + }, + { + "epoch": 0.893166963755199, + "grad_norm": 1.30408373614937, + "learning_rate": 5.922967096156218e-07, + "loss": 0.1283, + "step": 11274 + }, + { + "epoch": 0.8932461873638344, + "grad_norm": 1.6571374410903401, + "learning_rate": 5.914270203476291e-07, + "loss": 0.1757, + "step": 11275 + }, + { + "epoch": 0.8933254109724698, + "grad_norm": 1.4526104683390844, + "learning_rate": 5.90557950588897e-07, + "loss": 0.1023, + "step": 11276 + }, + { + "epoch": 0.8934046345811052, + "grad_norm": 1.7298944643987848, + "learning_rate": 5.896895003966463e-07, + "loss": 0.143, + "step": 11277 + }, + { + "epoch": 0.8934838581897405, + "grad_norm": 1.2827752791043883, + "learning_rate": 5.888216698280646e-07, + "loss": 0.109, + "step": 11278 + }, + { + "epoch": 0.893563081798376, + "grad_norm": 1.3578336180229134, + "learning_rate": 5.879544589402919e-07, + "loss": 0.1232, + "step": 11279 + }, + { + "epoch": 0.8936423054070113, + "grad_norm": 1.849784040165139, + "learning_rate": 5.870878677904302e-07, + "loss": 0.2735, + "step": 11280 + }, + { + "epoch": 0.8937215290156466, + "grad_norm": 1.8077700092476061, + "learning_rate": 5.862218964355382e-07, + "loss": 0.1994, + "step": 11281 + }, + { + "epoch": 0.893800752624282, + "grad_norm": 1.9009916428864504, + "learning_rate": 5.853565449326404e-07, + "loss": 0.1723, + "step": 11282 + }, + { + "epoch": 0.8938799762329174, + "grad_norm": 1.5682009846048015, + "learning_rate": 5.844918133387134e-07, + "loss": 0.1355, + "step": 11283 + }, + { + "epoch": 0.8939591998415528, + "grad_norm": 1.2714334361564996, + "learning_rate": 5.836277017106951e-07, + "loss": 0.1071, + "step": 11284 + }, + { + "epoch": 0.8940384234501881, + "grad_norm": 1.591084559490241, + "learning_rate": 5.827642101054854e-07, + "loss": 0.1345, + "step": 11285 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 1.9704366449407094, + "learning_rate": 5.819013385799388e-07, + "loss": 0.1989, + "step": 11286 + }, + { + "epoch": 0.8941968706674589, + "grad_norm": 1.9061566520705489, + "learning_rate": 5.810390871908711e-07, + "loss": 0.1741, + "step": 11287 + }, + { + "epoch": 0.8942760942760942, + "grad_norm": 1.6121238486483267, + "learning_rate": 5.801774559950591e-07, + "loss": 0.1263, + "step": 11288 + }, + { + "epoch": 0.8943553178847297, + "grad_norm": 1.32607166595525, + "learning_rate": 5.793164450492372e-07, + "loss": 0.087, + "step": 11289 + }, + { + "epoch": 0.894434541493365, + "grad_norm": 2.392354843710851, + "learning_rate": 5.784560544100959e-07, + "loss": 0.2312, + "step": 11290 + }, + { + "epoch": 0.8945137651020004, + "grad_norm": 1.547948366456263, + "learning_rate": 5.775962841342919e-07, + "loss": 0.1689, + "step": 11291 + }, + { + "epoch": 0.8945929887106358, + "grad_norm": 2.27664957216012, + "learning_rate": 5.767371342784345e-07, + "loss": 0.1949, + "step": 11292 + }, + { + "epoch": 0.8946722123192712, + "grad_norm": 1.838662399074391, + "learning_rate": 5.758786048990939e-07, + "loss": 0.1854, + "step": 11293 + }, + { + "epoch": 0.8947514359279065, + "grad_norm": 1.2599419099330276, + "learning_rate": 5.750206960528027e-07, + "loss": 0.1108, + "step": 11294 + }, + { + "epoch": 0.8948306595365418, + "grad_norm": 1.7598522187783212, + "learning_rate": 5.741634077960479e-07, + "loss": 0.1982, + "step": 11295 + }, + { + "epoch": 0.8949098831451773, + "grad_norm": 1.517262601487664, + "learning_rate": 5.733067401852788e-07, + "loss": 0.2039, + "step": 11296 + }, + { + "epoch": 0.8949891067538126, + "grad_norm": 1.6145889972294232, + "learning_rate": 5.724506932769014e-07, + "loss": 0.1695, + "step": 11297 + }, + { + "epoch": 0.895068330362448, + "grad_norm": 1.3094398597015489, + "learning_rate": 5.71595267127284e-07, + "loss": 0.0688, + "step": 11298 + }, + { + "epoch": 0.8951475539710834, + "grad_norm": 1.6896357084447566, + "learning_rate": 5.707404617927526e-07, + "loss": 0.1168, + "step": 11299 + }, + { + "epoch": 0.8952267775797188, + "grad_norm": 1.6449875147626893, + "learning_rate": 5.698862773295888e-07, + "loss": 0.1499, + "step": 11300 + }, + { + "epoch": 0.8953060011883541, + "grad_norm": 1.6897512766602445, + "learning_rate": 5.69032713794041e-07, + "loss": 0.1442, + "step": 11301 + }, + { + "epoch": 0.8953852247969895, + "grad_norm": 1.3085370897928394, + "learning_rate": 5.681797712423099e-07, + "loss": 0.1084, + "step": 11302 + }, + { + "epoch": 0.8954644484056249, + "grad_norm": 1.3669161558332743, + "learning_rate": 5.673274497305559e-07, + "loss": 0.1333, + "step": 11303 + }, + { + "epoch": 0.8955436720142602, + "grad_norm": 2.0376865016263155, + "learning_rate": 5.664757493149042e-07, + "loss": 0.2081, + "step": 11304 + }, + { + "epoch": 0.8956228956228957, + "grad_norm": 1.571977400576846, + "learning_rate": 5.656246700514323e-07, + "loss": 0.1898, + "step": 11305 + }, + { + "epoch": 0.895702119231531, + "grad_norm": 1.9198876385521253, + "learning_rate": 5.647742119961797e-07, + "loss": 0.1674, + "step": 11306 + }, + { + "epoch": 0.8957813428401664, + "grad_norm": 1.3299311659699893, + "learning_rate": 5.639243752051482e-07, + "loss": 0.1105, + "step": 11307 + }, + { + "epoch": 0.8958605664488017, + "grad_norm": 1.592886373269954, + "learning_rate": 5.630751597342921e-07, + "loss": 0.1071, + "step": 11308 + }, + { + "epoch": 0.8959397900574371, + "grad_norm": 1.546762085242033, + "learning_rate": 5.622265656395276e-07, + "loss": 0.1426, + "step": 11309 + }, + { + "epoch": 0.8960190136660725, + "grad_norm": 1.6661356466787718, + "learning_rate": 5.613785929767335e-07, + "loss": 0.1744, + "step": 11310 + }, + { + "epoch": 0.8960982372747078, + "grad_norm": 1.9130021091242595, + "learning_rate": 5.605312418017439e-07, + "loss": 0.2088, + "step": 11311 + }, + { + "epoch": 0.8961774608833433, + "grad_norm": 1.3767088898138513, + "learning_rate": 5.59684512170352e-07, + "loss": 0.1461, + "step": 11312 + }, + { + "epoch": 0.8962566844919786, + "grad_norm": 1.473560398664012, + "learning_rate": 5.588384041383089e-07, + "loss": 0.1184, + "step": 11313 + }, + { + "epoch": 0.896335908100614, + "grad_norm": 1.5160233684633657, + "learning_rate": 5.579929177613308e-07, + "loss": 0.1623, + "step": 11314 + }, + { + "epoch": 0.8964151317092494, + "grad_norm": 1.3391100552727728, + "learning_rate": 5.571480530950879e-07, + "loss": 0.1407, + "step": 11315 + }, + { + "epoch": 0.8964943553178847, + "grad_norm": 1.652836812915285, + "learning_rate": 5.563038101952067e-07, + "loss": 0.2171, + "step": 11316 + }, + { + "epoch": 0.8965735789265201, + "grad_norm": 1.3700753634234208, + "learning_rate": 5.554601891172817e-07, + "loss": 0.1119, + "step": 11317 + }, + { + "epoch": 0.8966528025351554, + "grad_norm": 1.4839320395216817, + "learning_rate": 5.546171899168595e-07, + "loss": 0.1419, + "step": 11318 + }, + { + "epoch": 0.8967320261437909, + "grad_norm": 1.8921895587968263, + "learning_rate": 5.537748126494446e-07, + "loss": 0.2277, + "step": 11319 + }, + { + "epoch": 0.8968112497524262, + "grad_norm": 1.4042344796227255, + "learning_rate": 5.529330573705083e-07, + "loss": 0.1294, + "step": 11320 + }, + { + "epoch": 0.8968904733610616, + "grad_norm": 1.7041968714851021, + "learning_rate": 5.520919241354728e-07, + "loss": 0.1673, + "step": 11321 + }, + { + "epoch": 0.896969696969697, + "grad_norm": 1.5651800598353514, + "learning_rate": 5.512514129997227e-07, + "loss": 0.1575, + "step": 11322 + }, + { + "epoch": 0.8970489205783323, + "grad_norm": 1.4110693909785013, + "learning_rate": 5.504115240186048e-07, + "loss": 0.092, + "step": 11323 + }, + { + "epoch": 0.8971281441869677, + "grad_norm": 1.5541585819536432, + "learning_rate": 5.495722572474183e-07, + "loss": 0.1799, + "step": 11324 + }, + { + "epoch": 0.8972073677956031, + "grad_norm": 1.594218021174744, + "learning_rate": 5.487336127414267e-07, + "loss": 0.1556, + "step": 11325 + }, + { + "epoch": 0.8972865914042385, + "grad_norm": 1.57926160174065, + "learning_rate": 5.478955905558491e-07, + "loss": 0.1688, + "step": 11326 + }, + { + "epoch": 0.8973658150128738, + "grad_norm": 1.2120721334678117, + "learning_rate": 5.470581907458672e-07, + "loss": 0.0978, + "step": 11327 + }, + { + "epoch": 0.8974450386215093, + "grad_norm": 1.4914908757731382, + "learning_rate": 5.462214133666189e-07, + "loss": 0.11, + "step": 11328 + }, + { + "epoch": 0.8975242622301446, + "grad_norm": 1.7671302458626819, + "learning_rate": 5.453852584732e-07, + "loss": 0.1428, + "step": 11329 + }, + { + "epoch": 0.8976034858387799, + "grad_norm": 1.7867272378550434, + "learning_rate": 5.4454972612067e-07, + "loss": 0.1282, + "step": 11330 + }, + { + "epoch": 0.8976827094474154, + "grad_norm": 1.4743379797773373, + "learning_rate": 5.437148163640449e-07, + "loss": 0.1755, + "step": 11331 + }, + { + "epoch": 0.8977619330560507, + "grad_norm": 2.016628735552289, + "learning_rate": 5.428805292582973e-07, + "loss": 0.2047, + "step": 11332 + }, + { + "epoch": 0.8978411566646861, + "grad_norm": 1.2649879451604544, + "learning_rate": 5.420468648583621e-07, + "loss": 0.0848, + "step": 11333 + }, + { + "epoch": 0.8979203802733214, + "grad_norm": 1.5785850638119205, + "learning_rate": 5.412138232191333e-07, + "loss": 0.0966, + "step": 11334 + }, + { + "epoch": 0.8979996038819569, + "grad_norm": 1.9203816553077167, + "learning_rate": 5.403814043954592e-07, + "loss": 0.1588, + "step": 11335 + }, + { + "epoch": 0.8980788274905922, + "grad_norm": 1.3511975089074213, + "learning_rate": 5.39549608442157e-07, + "loss": 0.0872, + "step": 11336 + }, + { + "epoch": 0.8981580510992275, + "grad_norm": 1.6294520420028766, + "learning_rate": 5.387184354139896e-07, + "loss": 0.1017, + "step": 11337 + }, + { + "epoch": 0.898237274707863, + "grad_norm": 1.4732938690481243, + "learning_rate": 5.378878853656877e-07, + "loss": 0.1177, + "step": 11338 + }, + { + "epoch": 0.8983164983164983, + "grad_norm": 1.9978839265823294, + "learning_rate": 5.370579583519409e-07, + "loss": 0.1499, + "step": 11339 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 1.604379496127464, + "learning_rate": 5.362286544273942e-07, + "loss": 0.1529, + "step": 11340 + }, + { + "epoch": 0.898474945533769, + "grad_norm": 1.483344368761494, + "learning_rate": 5.353999736466531e-07, + "loss": 0.1232, + "step": 11341 + }, + { + "epoch": 0.8985541691424045, + "grad_norm": 1.2542500492553612, + "learning_rate": 5.345719160642848e-07, + "loss": 0.1157, + "step": 11342 + }, + { + "epoch": 0.8986333927510398, + "grad_norm": 1.4386845890827769, + "learning_rate": 5.337444817348103e-07, + "loss": 0.0905, + "step": 11343 + }, + { + "epoch": 0.8987126163596751, + "grad_norm": 1.5307143742529814, + "learning_rate": 5.329176707127115e-07, + "loss": 0.1478, + "step": 11344 + }, + { + "epoch": 0.8987918399683106, + "grad_norm": 1.321029166404186, + "learning_rate": 5.320914830524337e-07, + "loss": 0.1347, + "step": 11345 + }, + { + "epoch": 0.8988710635769459, + "grad_norm": 1.5876254737255515, + "learning_rate": 5.312659188083746e-07, + "loss": 0.11, + "step": 11346 + }, + { + "epoch": 0.8989502871855813, + "grad_norm": 1.6432029337053642, + "learning_rate": 5.304409780348919e-07, + "loss": 0.1774, + "step": 11347 + }, + { + "epoch": 0.8990295107942167, + "grad_norm": 1.5641320811554813, + "learning_rate": 5.296166607863085e-07, + "loss": 0.197, + "step": 11348 + }, + { + "epoch": 0.899108734402852, + "grad_norm": 1.5244923754489452, + "learning_rate": 5.287929671168989e-07, + "loss": 0.0989, + "step": 11349 + }, + { + "epoch": 0.8991879580114874, + "grad_norm": 1.7158970641907811, + "learning_rate": 5.279698970809011e-07, + "loss": 0.1801, + "step": 11350 + }, + { + "epoch": 0.8992671816201228, + "grad_norm": 1.4183447696963687, + "learning_rate": 5.271474507325058e-07, + "loss": 0.1377, + "step": 11351 + }, + { + "epoch": 0.8993464052287582, + "grad_norm": 1.4773141886665055, + "learning_rate": 5.263256281258733e-07, + "loss": 0.1103, + "step": 11352 + }, + { + "epoch": 0.8994256288373935, + "grad_norm": 1.6218057438215137, + "learning_rate": 5.255044293151135e-07, + "loss": 0.1844, + "step": 11353 + }, + { + "epoch": 0.899504852446029, + "grad_norm": 1.4933641227010577, + "learning_rate": 5.246838543542964e-07, + "loss": 0.1397, + "step": 11354 + }, + { + "epoch": 0.8995840760546643, + "grad_norm": 1.5396011767264581, + "learning_rate": 5.23863903297458e-07, + "loss": 0.1236, + "step": 11355 + }, + { + "epoch": 0.8996632996632996, + "grad_norm": 1.8919777521382155, + "learning_rate": 5.230445761985836e-07, + "loss": 0.1955, + "step": 11356 + }, + { + "epoch": 0.899742523271935, + "grad_norm": 1.8568798319959627, + "learning_rate": 5.222258731116237e-07, + "loss": 0.1766, + "step": 11357 + }, + { + "epoch": 0.8998217468805704, + "grad_norm": 1.9133468366897972, + "learning_rate": 5.214077940904872e-07, + "loss": 0.1658, + "step": 11358 + }, + { + "epoch": 0.8999009704892058, + "grad_norm": 1.6425193973081886, + "learning_rate": 5.205903391890387e-07, + "loss": 0.1523, + "step": 11359 + }, + { + "epoch": 0.8999801940978411, + "grad_norm": 1.2474483394012947, + "learning_rate": 5.197735084611033e-07, + "loss": 0.1387, + "step": 11360 + }, + { + "epoch": 0.9000594177064766, + "grad_norm": 1.3462790343248248, + "learning_rate": 5.189573019604676e-07, + "loss": 0.1126, + "step": 11361 + }, + { + "epoch": 0.9001386413151119, + "grad_norm": 1.545323003050307, + "learning_rate": 5.181417197408733e-07, + "loss": 0.1697, + "step": 11362 + }, + { + "epoch": 0.9002178649237472, + "grad_norm": 1.5548820157271666, + "learning_rate": 5.173267618560229e-07, + "loss": 0.17, + "step": 11363 + }, + { + "epoch": 0.9002970885323827, + "grad_norm": 1.1205615384995606, + "learning_rate": 5.165124283595779e-07, + "loss": 0.0736, + "step": 11364 + }, + { + "epoch": 0.900376312141018, + "grad_norm": 1.98818053162733, + "learning_rate": 5.156987193051577e-07, + "loss": 0.2034, + "step": 11365 + }, + { + "epoch": 0.9004555357496534, + "grad_norm": 1.6542828239285352, + "learning_rate": 5.148856347463416e-07, + "loss": 0.1141, + "step": 11366 + }, + { + "epoch": 0.9005347593582887, + "grad_norm": 1.5942589914927283, + "learning_rate": 5.140731747366656e-07, + "loss": 0.1243, + "step": 11367 + }, + { + "epoch": 0.9006139829669242, + "grad_norm": 1.4300086113167827, + "learning_rate": 5.132613393296293e-07, + "loss": 0.1411, + "step": 11368 + }, + { + "epoch": 0.9006932065755595, + "grad_norm": 1.6178021745768005, + "learning_rate": 5.124501285786865e-07, + "loss": 0.1196, + "step": 11369 + }, + { + "epoch": 0.9007724301841948, + "grad_norm": 1.9545184399558575, + "learning_rate": 5.1163954253725e-07, + "loss": 0.1976, + "step": 11370 + }, + { + "epoch": 0.9008516537928303, + "grad_norm": 1.3406387089035658, + "learning_rate": 5.108295812586961e-07, + "loss": 0.1307, + "step": 11371 + }, + { + "epoch": 0.9009308774014656, + "grad_norm": 2.082724295705734, + "learning_rate": 5.100202447963553e-07, + "loss": 0.1879, + "step": 11372 + }, + { + "epoch": 0.901010101010101, + "grad_norm": 1.661762069352776, + "learning_rate": 5.092115332035163e-07, + "loss": 0.1544, + "step": 11373 + }, + { + "epoch": 0.9010893246187364, + "grad_norm": 1.5043254295374404, + "learning_rate": 5.084034465334342e-07, + "loss": 0.112, + "step": 11374 + }, + { + "epoch": 0.9011685482273718, + "grad_norm": 1.8103321437180646, + "learning_rate": 5.07595984839313e-07, + "loss": 0.1764, + "step": 11375 + }, + { + "epoch": 0.9012477718360071, + "grad_norm": 1.6908820620018818, + "learning_rate": 5.067891481743203e-07, + "loss": 0.1258, + "step": 11376 + }, + { + "epoch": 0.9013269954446425, + "grad_norm": 1.8555870826772165, + "learning_rate": 5.059829365915859e-07, + "loss": 0.2098, + "step": 11377 + }, + { + "epoch": 0.9014062190532779, + "grad_norm": 1.7446096382305152, + "learning_rate": 5.051773501441926e-07, + "loss": 0.1563, + "step": 11378 + }, + { + "epoch": 0.9014854426619132, + "grad_norm": 1.7967718562152633, + "learning_rate": 5.043723888851837e-07, + "loss": 0.1808, + "step": 11379 + }, + { + "epoch": 0.9015646662705487, + "grad_norm": 1.8487202047221145, + "learning_rate": 5.035680528675635e-07, + "loss": 0.1806, + "step": 11380 + }, + { + "epoch": 0.901643889879184, + "grad_norm": 2.020903434076147, + "learning_rate": 5.027643421442929e-07, + "loss": 0.169, + "step": 11381 + }, + { + "epoch": 0.9017231134878194, + "grad_norm": 1.5976327872226603, + "learning_rate": 5.01961256768293e-07, + "loss": 0.12, + "step": 11382 + }, + { + "epoch": 0.9018023370964547, + "grad_norm": 1.202659850687613, + "learning_rate": 5.011587967924414e-07, + "loss": 0.0739, + "step": 11383 + }, + { + "epoch": 0.9018815607050901, + "grad_norm": 1.2199042086708365, + "learning_rate": 5.003569622695792e-07, + "loss": 0.0919, + "step": 11384 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 1.5701709718738008, + "learning_rate": 4.99555753252502e-07, + "loss": 0.1662, + "step": 11385 + }, + { + "epoch": 0.9020400079223608, + "grad_norm": 1.6548432815623688, + "learning_rate": 4.987551697939629e-07, + "loss": 0.1702, + "step": 11386 + }, + { + "epoch": 0.9021192315309963, + "grad_norm": 1.0829761303403709, + "learning_rate": 4.979552119466802e-07, + "loss": 0.0932, + "step": 11387 + }, + { + "epoch": 0.9021984551396316, + "grad_norm": 1.6308302635424, + "learning_rate": 4.971558797633258e-07, + "loss": 0.1834, + "step": 11388 + }, + { + "epoch": 0.902277678748267, + "grad_norm": 1.4169782185345832, + "learning_rate": 4.963571732965311e-07, + "loss": 0.1214, + "step": 11389 + }, + { + "epoch": 0.9023569023569024, + "grad_norm": 2.114433740553322, + "learning_rate": 4.955590925988896e-07, + "loss": 0.2454, + "step": 11390 + }, + { + "epoch": 0.9024361259655377, + "grad_norm": 1.8675460581321455, + "learning_rate": 4.947616377229492e-07, + "loss": 0.1358, + "step": 11391 + }, + { + "epoch": 0.9025153495741731, + "grad_norm": 1.7578970333362844, + "learning_rate": 4.939648087212168e-07, + "loss": 0.1745, + "step": 11392 + }, + { + "epoch": 0.9025945731828084, + "grad_norm": 1.7722405255081999, + "learning_rate": 4.931686056461626e-07, + "loss": 0.1781, + "step": 11393 + }, + { + "epoch": 0.9026737967914439, + "grad_norm": 2.0153409981016512, + "learning_rate": 4.923730285502126e-07, + "loss": 0.1848, + "step": 11394 + }, + { + "epoch": 0.9027530204000792, + "grad_norm": 1.6568347891034174, + "learning_rate": 4.915780774857504e-07, + "loss": 0.1856, + "step": 11395 + }, + { + "epoch": 0.9028322440087146, + "grad_norm": 1.4383764201952762, + "learning_rate": 4.907837525051196e-07, + "loss": 0.1231, + "step": 11396 + }, + { + "epoch": 0.90291146761735, + "grad_norm": 2.0260323706516106, + "learning_rate": 4.89990053660624e-07, + "loss": 0.2478, + "step": 11397 + }, + { + "epoch": 0.9029906912259853, + "grad_norm": 1.6566336805540895, + "learning_rate": 4.891969810045239e-07, + "loss": 0.1579, + "step": 11398 + }, + { + "epoch": 0.9030699148346207, + "grad_norm": 1.4074813418733512, + "learning_rate": 4.884045345890387e-07, + "loss": 0.1366, + "step": 11399 + }, + { + "epoch": 0.9031491384432561, + "grad_norm": 1.964438753026122, + "learning_rate": 4.87612714466349e-07, + "loss": 0.2293, + "step": 11400 + }, + { + "epoch": 0.9032283620518915, + "grad_norm": 2.0421436854789223, + "learning_rate": 4.868215206885918e-07, + "loss": 0.1583, + "step": 11401 + }, + { + "epoch": 0.9033075856605268, + "grad_norm": 1.2624632556187516, + "learning_rate": 4.860309533078611e-07, + "loss": 0.0778, + "step": 11402 + }, + { + "epoch": 0.9033868092691623, + "grad_norm": 2.155488995237254, + "learning_rate": 4.852410123762164e-07, + "loss": 0.2024, + "step": 11403 + }, + { + "epoch": 0.9034660328777976, + "grad_norm": 1.4810079746012292, + "learning_rate": 4.844516979456671e-07, + "loss": 0.128, + "step": 11404 + }, + { + "epoch": 0.9035452564864329, + "grad_norm": 1.4881442493700057, + "learning_rate": 4.836630100681872e-07, + "loss": 0.1067, + "step": 11405 + }, + { + "epoch": 0.9036244800950683, + "grad_norm": 1.9283246715174054, + "learning_rate": 4.828749487957097e-07, + "loss": 0.2044, + "step": 11406 + }, + { + "epoch": 0.9037037037037037, + "grad_norm": 1.363813160298884, + "learning_rate": 4.82087514180124e-07, + "loss": 0.1322, + "step": 11407 + }, + { + "epoch": 0.9037829273123391, + "grad_norm": 1.7902310414222058, + "learning_rate": 4.813007062732756e-07, + "loss": 0.1482, + "step": 11408 + }, + { + "epoch": 0.9038621509209744, + "grad_norm": 1.2489883691533643, + "learning_rate": 4.805145251269772e-07, + "loss": 0.1039, + "step": 11409 + }, + { + "epoch": 0.9039413745296099, + "grad_norm": 1.3935651931034179, + "learning_rate": 4.797289707929919e-07, + "loss": 0.1136, + "step": 11410 + }, + { + "epoch": 0.9040205981382452, + "grad_norm": 1.406768123138463, + "learning_rate": 4.789440433230452e-07, + "loss": 0.1278, + "step": 11411 + }, + { + "epoch": 0.9040998217468805, + "grad_norm": 1.619004642502583, + "learning_rate": 4.781597427688189e-07, + "loss": 0.1398, + "step": 11412 + }, + { + "epoch": 0.904179045355516, + "grad_norm": 1.6282128895076493, + "learning_rate": 4.773760691819596e-07, + "loss": 0.1499, + "step": 11413 + }, + { + "epoch": 0.9042582689641513, + "grad_norm": 2.2494812738014933, + "learning_rate": 4.765930226140658e-07, + "loss": 0.1706, + "step": 11414 + }, + { + "epoch": 0.9043374925727867, + "grad_norm": 1.526250178082704, + "learning_rate": 4.7581060311669757e-07, + "loss": 0.1186, + "step": 11415 + }, + { + "epoch": 0.904416716181422, + "grad_norm": 1.3764706450447917, + "learning_rate": 4.7502881074137476e-07, + "loss": 0.1432, + "step": 11416 + }, + { + "epoch": 0.9044959397900575, + "grad_norm": 1.4298051863304106, + "learning_rate": 4.742476455395706e-07, + "loss": 0.1373, + "step": 11417 + }, + { + "epoch": 0.9045751633986928, + "grad_norm": 1.4785058799251485, + "learning_rate": 4.734671075627262e-07, + "loss": 0.1298, + "step": 11418 + }, + { + "epoch": 0.9046543870073281, + "grad_norm": 1.555622667199011, + "learning_rate": 4.726871968622337e-07, + "loss": 0.1513, + "step": 11419 + }, + { + "epoch": 0.9047336106159636, + "grad_norm": 1.6606247073598899, + "learning_rate": 4.7190791348944777e-07, + "loss": 0.1493, + "step": 11420 + }, + { + "epoch": 0.9048128342245989, + "grad_norm": 1.2205852258418255, + "learning_rate": 4.711292574956772e-07, + "loss": 0.1191, + "step": 11421 + }, + { + "epoch": 0.9048920578332343, + "grad_norm": 1.9490785495485299, + "learning_rate": 4.7035122893219653e-07, + "loss": 0.1741, + "step": 11422 + }, + { + "epoch": 0.9049712814418697, + "grad_norm": 3.575777555965547, + "learning_rate": 4.695738278502338e-07, + "loss": 0.1204, + "step": 11423 + }, + { + "epoch": 0.9050505050505051, + "grad_norm": 1.8982510466699347, + "learning_rate": 4.6879705430097566e-07, + "loss": 0.1712, + "step": 11424 + }, + { + "epoch": 0.9051297286591404, + "grad_norm": 1.5860373350741164, + "learning_rate": 4.6802090833557136e-07, + "loss": 0.1518, + "step": 11425 + }, + { + "epoch": 0.9052089522677758, + "grad_norm": 1.0487312466854741, + "learning_rate": 4.6724539000512546e-07, + "loss": 0.0756, + "step": 11426 + }, + { + "epoch": 0.9052881758764112, + "grad_norm": 1.3378899060647163, + "learning_rate": 4.6647049936070054e-07, + "loss": 0.1318, + "step": 11427 + }, + { + "epoch": 0.9053673994850465, + "grad_norm": 1.4264166642309284, + "learning_rate": 4.656962364533224e-07, + "loss": 0.0955, + "step": 11428 + }, + { + "epoch": 0.905446623093682, + "grad_norm": 1.2982669978194084, + "learning_rate": 4.649226013339703e-07, + "loss": 0.1542, + "step": 11429 + }, + { + "epoch": 0.9055258467023173, + "grad_norm": 1.356803060594413, + "learning_rate": 4.641495940535845e-07, + "loss": 0.1386, + "step": 11430 + }, + { + "epoch": 0.9056050703109526, + "grad_norm": 1.5557342496371263, + "learning_rate": 4.633772146630655e-07, + "loss": 0.1591, + "step": 11431 + }, + { + "epoch": 0.905684293919588, + "grad_norm": 1.8995539995481001, + "learning_rate": 4.626054632132693e-07, + "loss": 0.1821, + "step": 11432 + }, + { + "epoch": 0.9057635175282234, + "grad_norm": 1.700038313275163, + "learning_rate": 4.6183433975501067e-07, + "loss": 0.1451, + "step": 11433 + }, + { + "epoch": 0.9058427411368588, + "grad_norm": 1.832993371380585, + "learning_rate": 4.61063844339068e-07, + "loss": 0.1174, + "step": 11434 + }, + { + "epoch": 0.9059219647454941, + "grad_norm": 1.65751923386094, + "learning_rate": 4.6029397701617296e-07, + "loss": 0.1291, + "step": 11435 + }, + { + "epoch": 0.9060011883541296, + "grad_norm": 1.9533427523696023, + "learning_rate": 4.595247378370171e-07, + "loss": 0.2368, + "step": 11436 + }, + { + "epoch": 0.9060804119627649, + "grad_norm": 1.519366410392257, + "learning_rate": 4.5875612685225e-07, + "loss": 0.1234, + "step": 11437 + }, + { + "epoch": 0.9061596355714002, + "grad_norm": 1.731676213932306, + "learning_rate": 4.5798814411248336e-07, + "loss": 0.1264, + "step": 11438 + }, + { + "epoch": 0.9062388591800357, + "grad_norm": 1.6178598376552586, + "learning_rate": 4.5722078966828455e-07, + "loss": 0.1556, + "step": 11439 + }, + { + "epoch": 0.906318082788671, + "grad_norm": 1.6351938104257702, + "learning_rate": 4.5645406357017865e-07, + "loss": 0.1929, + "step": 11440 + }, + { + "epoch": 0.9063973063973064, + "grad_norm": 1.4601227302114204, + "learning_rate": 4.5568796586865304e-07, + "loss": 0.1198, + "step": 11441 + }, + { + "epoch": 0.9064765300059417, + "grad_norm": 2.4113144449324, + "learning_rate": 4.5492249661415077e-07, + "loss": 0.1908, + "step": 11442 + }, + { + "epoch": 0.9065557536145772, + "grad_norm": 1.7621537032939296, + "learning_rate": 4.541576558570726e-07, + "loss": 0.1689, + "step": 11443 + }, + { + "epoch": 0.9066349772232125, + "grad_norm": 1.329967633141681, + "learning_rate": 4.533934436477827e-07, + "loss": 0.0894, + "step": 11444 + }, + { + "epoch": 0.9067142008318478, + "grad_norm": 1.3935119403098621, + "learning_rate": 4.526298600365997e-07, + "loss": 0.1822, + "step": 11445 + }, + { + "epoch": 0.9067934244404833, + "grad_norm": 1.358613009555309, + "learning_rate": 4.5186690507379894e-07, + "loss": 0.1313, + "step": 11446 + }, + { + "epoch": 0.9068726480491186, + "grad_norm": 1.2676754987561336, + "learning_rate": 4.5110457880962246e-07, + "loss": 0.0775, + "step": 11447 + }, + { + "epoch": 0.906951871657754, + "grad_norm": 1.663215876255719, + "learning_rate": 4.503428812942623e-07, + "loss": 0.143, + "step": 11448 + }, + { + "epoch": 0.9070310952663894, + "grad_norm": 1.7666869367452651, + "learning_rate": 4.495818125778717e-07, + "loss": 0.1901, + "step": 11449 + }, + { + "epoch": 0.9071103188750248, + "grad_norm": 1.2659551483511309, + "learning_rate": 4.488213727105672e-07, + "loss": 0.086, + "step": 11450 + }, + { + "epoch": 0.9071895424836601, + "grad_norm": 1.8523283017604075, + "learning_rate": 4.4806156174241776e-07, + "loss": 0.2006, + "step": 11451 + }, + { + "epoch": 0.9072687660922955, + "grad_norm": 1.2893120423432343, + "learning_rate": 4.4730237972345326e-07, + "loss": 0.1062, + "step": 11452 + }, + { + "epoch": 0.9073479897009309, + "grad_norm": 1.5218562161802534, + "learning_rate": 4.465438267036604e-07, + "loss": 0.1368, + "step": 11453 + }, + { + "epoch": 0.9074272133095662, + "grad_norm": 1.5222837280333117, + "learning_rate": 4.4578590273299027e-07, + "loss": 0.129, + "step": 11454 + }, + { + "epoch": 0.9075064369182017, + "grad_norm": 2.0373974163545068, + "learning_rate": 4.4502860786134747e-07, + "loss": 0.1235, + "step": 11455 + }, + { + "epoch": 0.907585660526837, + "grad_norm": 1.7761946526984598, + "learning_rate": 4.4427194213859216e-07, + "loss": 0.1409, + "step": 11456 + }, + { + "epoch": 0.9076648841354724, + "grad_norm": 1.6236772582510532, + "learning_rate": 4.435159056145533e-07, + "loss": 0.1334, + "step": 11457 + }, + { + "epoch": 0.9077441077441077, + "grad_norm": 1.4811407774522563, + "learning_rate": 4.427604983390077e-07, + "loss": 0.1735, + "step": 11458 + }, + { + "epoch": 0.9078233313527431, + "grad_norm": 2.018968731941394, + "learning_rate": 4.420057203616956e-07, + "loss": 0.1982, + "step": 11459 + }, + { + "epoch": 0.9079025549613785, + "grad_norm": 1.4590780705199156, + "learning_rate": 4.4125157173231847e-07, + "loss": 0.0875, + "step": 11460 + }, + { + "epoch": 0.9079817785700138, + "grad_norm": 1.6296435205133828, + "learning_rate": 4.40498052500532e-07, + "loss": 0.2, + "step": 11461 + }, + { + "epoch": 0.9080610021786493, + "grad_norm": 1.7082663538612024, + "learning_rate": 4.397451627159499e-07, + "loss": 0.1283, + "step": 11462 + }, + { + "epoch": 0.9081402257872846, + "grad_norm": 1.954016097680302, + "learning_rate": 4.389929024281492e-07, + "loss": 0.2037, + "step": 11463 + }, + { + "epoch": 0.90821944939592, + "grad_norm": 1.4045344721135806, + "learning_rate": 4.382412716866602e-07, + "loss": 0.1595, + "step": 11464 + }, + { + "epoch": 0.9082986730045554, + "grad_norm": 1.3395634336756332, + "learning_rate": 4.374902705409745e-07, + "loss": 0.1343, + "step": 11465 + }, + { + "epoch": 0.9083778966131907, + "grad_norm": 1.280373094394481, + "learning_rate": 4.367398990405447e-07, + "loss": 0.0947, + "step": 11466 + }, + { + "epoch": 0.9084571202218261, + "grad_norm": 1.4916047142841655, + "learning_rate": 4.359901572347758e-07, + "loss": 0.1688, + "step": 11467 + }, + { + "epoch": 0.9085363438304614, + "grad_norm": 1.3628814348833307, + "learning_rate": 4.3524104517303714e-07, + "loss": 0.127, + "step": 11468 + }, + { + "epoch": 0.9086155674390969, + "grad_norm": 2.6665253956652473, + "learning_rate": 4.3449256290465035e-07, + "loss": 0.1734, + "step": 11469 + }, + { + "epoch": 0.9086947910477322, + "grad_norm": 1.488086262716902, + "learning_rate": 4.3374471047890497e-07, + "loss": 0.1562, + "step": 11470 + }, + { + "epoch": 0.9087740146563676, + "grad_norm": 1.53872033396023, + "learning_rate": 4.329974879450394e-07, + "loss": 0.1611, + "step": 11471 + }, + { + "epoch": 0.908853238265003, + "grad_norm": 1.552253372938291, + "learning_rate": 4.3225089535225415e-07, + "loss": 0.1553, + "step": 11472 + }, + { + "epoch": 0.9089324618736383, + "grad_norm": 1.516760891347147, + "learning_rate": 4.3150493274971227e-07, + "loss": 0.1644, + "step": 11473 + }, + { + "epoch": 0.9090116854822737, + "grad_norm": 1.494441336588106, + "learning_rate": 4.3075960018652995e-07, + "loss": 0.1122, + "step": 11474 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.6144792076667926, + "learning_rate": 4.300148977117824e-07, + "loss": 0.1662, + "step": 11475 + }, + { + "epoch": 0.9091701326995445, + "grad_norm": 1.2885401572931912, + "learning_rate": 4.2927082537450705e-07, + "loss": 0.1025, + "step": 11476 + }, + { + "epoch": 0.9092493563081798, + "grad_norm": 1.3474983101967246, + "learning_rate": 4.285273832236969e-07, + "loss": 0.1176, + "step": 11477 + }, + { + "epoch": 0.9093285799168153, + "grad_norm": 1.875344140418163, + "learning_rate": 4.277845713083018e-07, + "loss": 0.2049, + "step": 11478 + }, + { + "epoch": 0.9094078035254506, + "grad_norm": 1.615629943363044, + "learning_rate": 4.2704238967723574e-07, + "loss": 0.1634, + "step": 11479 + }, + { + "epoch": 0.9094870271340859, + "grad_norm": 1.4123259716174827, + "learning_rate": 4.2630083837936654e-07, + "loss": 0.115, + "step": 11480 + }, + { + "epoch": 0.9095662507427213, + "grad_norm": 1.5196796232562162, + "learning_rate": 4.2555991746352054e-07, + "loss": 0.1865, + "step": 11481 + }, + { + "epoch": 0.9096454743513567, + "grad_norm": 2.110429034207658, + "learning_rate": 4.2481962697848323e-07, + "loss": 0.1659, + "step": 11482 + }, + { + "epoch": 0.9097246979599921, + "grad_norm": 1.789308279589873, + "learning_rate": 4.240799669730034e-07, + "loss": 0.2024, + "step": 11483 + }, + { + "epoch": 0.9098039215686274, + "grad_norm": 1.7360816472168967, + "learning_rate": 4.2334093749577975e-07, + "loss": 0.1565, + "step": 11484 + }, + { + "epoch": 0.9098831451772629, + "grad_norm": 1.5890408322521405, + "learning_rate": 4.226025385954746e-07, + "loss": 0.1324, + "step": 11485 + }, + { + "epoch": 0.9099623687858982, + "grad_norm": 1.260621694056252, + "learning_rate": 4.218647703207113e-07, + "loss": 0.0799, + "step": 11486 + }, + { + "epoch": 0.9100415923945335, + "grad_norm": 1.4641042877468629, + "learning_rate": 4.211276327200642e-07, + "loss": 0.1397, + "step": 11487 + }, + { + "epoch": 0.910120816003169, + "grad_norm": 1.7766762268382965, + "learning_rate": 4.203911258420712e-07, + "loss": 0.162, + "step": 11488 + }, + { + "epoch": 0.9102000396118043, + "grad_norm": 1.9630476454092027, + "learning_rate": 4.196552497352302e-07, + "loss": 0.1117, + "step": 11489 + }, + { + "epoch": 0.9102792632204397, + "grad_norm": 1.3637213618537847, + "learning_rate": 4.189200044479924e-07, + "loss": 0.1432, + "step": 11490 + }, + { + "epoch": 0.910358486829075, + "grad_norm": 2.0088292049886842, + "learning_rate": 4.1818539002877024e-07, + "loss": 0.2268, + "step": 11491 + }, + { + "epoch": 0.9104377104377105, + "grad_norm": 1.4150811023124612, + "learning_rate": 4.174514065259383e-07, + "loss": 0.1392, + "step": 11492 + }, + { + "epoch": 0.9105169340463458, + "grad_norm": 1.4594828016024535, + "learning_rate": 4.167180539878213e-07, + "loss": 0.1034, + "step": 11493 + }, + { + "epoch": 0.9105961576549811, + "grad_norm": 2.011774466012604, + "learning_rate": 4.1598533246270833e-07, + "loss": 0.1638, + "step": 11494 + }, + { + "epoch": 0.9106753812636166, + "grad_norm": 1.875467935125843, + "learning_rate": 4.152532419988453e-07, + "loss": 0.248, + "step": 11495 + }, + { + "epoch": 0.9107546048722519, + "grad_norm": 1.5698093927419363, + "learning_rate": 4.145217826444392e-07, + "loss": 0.1729, + "step": 11496 + }, + { + "epoch": 0.9108338284808873, + "grad_norm": 1.7009017025483715, + "learning_rate": 4.1379095444764926e-07, + "loss": 0.1228, + "step": 11497 + }, + { + "epoch": 0.9109130520895227, + "grad_norm": 1.412021408610375, + "learning_rate": 4.130607574566003e-07, + "loss": 0.1408, + "step": 11498 + }, + { + "epoch": 0.9109922756981581, + "grad_norm": 1.5105646720320853, + "learning_rate": 4.1233119171937065e-07, + "loss": 0.161, + "step": 11499 + }, + { + "epoch": 0.9110714993067934, + "grad_norm": 1.4556037366622985, + "learning_rate": 4.116022572839984e-07, + "loss": 0.1228, + "step": 11500 + }, + { + "epoch": 0.9111507229154288, + "grad_norm": 1.5244095482547964, + "learning_rate": 4.1087395419848186e-07, + "loss": 0.1308, + "step": 11501 + }, + { + "epoch": 0.9112299465240642, + "grad_norm": 1.348523017951419, + "learning_rate": 4.10146282510776e-07, + "loss": 0.1154, + "step": 11502 + }, + { + "epoch": 0.9113091701326995, + "grad_norm": 1.3128465629445263, + "learning_rate": 4.094192422687926e-07, + "loss": 0.0797, + "step": 11503 + }, + { + "epoch": 0.911388393741335, + "grad_norm": 1.550695791337716, + "learning_rate": 4.0869283352040656e-07, + "loss": 0.1498, + "step": 11504 + }, + { + "epoch": 0.9114676173499703, + "grad_norm": 1.452680901527569, + "learning_rate": 4.079670563134475e-07, + "loss": 0.1888, + "step": 11505 + }, + { + "epoch": 0.9115468409586057, + "grad_norm": 1.600964369445469, + "learning_rate": 4.072419106957026e-07, + "loss": 0.1169, + "step": 11506 + }, + { + "epoch": 0.911626064567241, + "grad_norm": 1.7918237432120638, + "learning_rate": 4.065173967149205e-07, + "loss": 0.1469, + "step": 11507 + }, + { + "epoch": 0.9117052881758764, + "grad_norm": 1.4150082529440795, + "learning_rate": 4.057935144188074e-07, + "loss": 0.0823, + "step": 11508 + }, + { + "epoch": 0.9117845117845118, + "grad_norm": 1.3364288318451276, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.1333, + "step": 11509 + }, + { + "epoch": 0.9118637353931471, + "grad_norm": 1.1833849746772958, + "learning_rate": 4.043476450712014e-07, + "loss": 0.1506, + "step": 11510 + }, + { + "epoch": 0.9119429590017826, + "grad_norm": 1.652049602662587, + "learning_rate": 4.036256581149123e-07, + "loss": 0.1657, + "step": 11511 + }, + { + "epoch": 0.9120221826104179, + "grad_norm": 1.7263545888738163, + "learning_rate": 4.0290430303369876e-07, + "loss": 0.1779, + "step": 11512 + }, + { + "epoch": 0.9121014062190532, + "grad_norm": 1.7207571872753231, + "learning_rate": 4.021835798750584e-07, + "loss": 0.1996, + "step": 11513 + }, + { + "epoch": 0.9121806298276887, + "grad_norm": 1.5695124587485665, + "learning_rate": 4.0146348868644767e-07, + "loss": 0.1797, + "step": 11514 + }, + { + "epoch": 0.912259853436324, + "grad_norm": 1.5382496131762027, + "learning_rate": 4.0074402951528204e-07, + "loss": 0.1, + "step": 11515 + }, + { + "epoch": 0.9123390770449594, + "grad_norm": 1.5840713797519137, + "learning_rate": 4.000252024089313e-07, + "loss": 0.1195, + "step": 11516 + }, + { + "epoch": 0.9124183006535947, + "grad_norm": 1.7705422429782756, + "learning_rate": 3.9930700741473093e-07, + "loss": 0.2405, + "step": 11517 + }, + { + "epoch": 0.9124975242622302, + "grad_norm": 1.5622492562438988, + "learning_rate": 3.985894445799676e-07, + "loss": 0.1498, + "step": 11518 + }, + { + "epoch": 0.9125767478708655, + "grad_norm": 1.4064816930871453, + "learning_rate": 3.978725139518891e-07, + "loss": 0.1305, + "step": 11519 + }, + { + "epoch": 0.9126559714795008, + "grad_norm": 1.8413556462738536, + "learning_rate": 3.9715621557770535e-07, + "loss": 0.1805, + "step": 11520 + }, + { + "epoch": 0.9127351950881363, + "grad_norm": 1.9057942315284813, + "learning_rate": 3.9644054950457753e-07, + "loss": 0.1519, + "step": 11521 + }, + { + "epoch": 0.9128144186967716, + "grad_norm": 1.9093191561874774, + "learning_rate": 3.9572551577963135e-07, + "loss": 0.1727, + "step": 11522 + }, + { + "epoch": 0.912893642305407, + "grad_norm": 1.4370936444534879, + "learning_rate": 3.9501111444994576e-07, + "loss": 0.1483, + "step": 11523 + }, + { + "epoch": 0.9129728659140424, + "grad_norm": 2.1219661549947526, + "learning_rate": 3.9429734556256205e-07, + "loss": 0.2495, + "step": 11524 + }, + { + "epoch": 0.9130520895226778, + "grad_norm": 1.805258930501115, + "learning_rate": 3.9358420916447927e-07, + "loss": 0.1946, + "step": 11525 + }, + { + "epoch": 0.9131313131313131, + "grad_norm": 1.7335832960592692, + "learning_rate": 3.9287170530265206e-07, + "loss": 0.1278, + "step": 11526 + }, + { + "epoch": 0.9132105367399485, + "grad_norm": 1.2158623326202613, + "learning_rate": 3.9215983402399736e-07, + "loss": 0.1075, + "step": 11527 + }, + { + "epoch": 0.9132897603485839, + "grad_norm": 1.2351151263867965, + "learning_rate": 3.914485953753888e-07, + "loss": 0.0794, + "step": 11528 + }, + { + "epoch": 0.9133689839572192, + "grad_norm": 1.4047343315874632, + "learning_rate": 3.907379894036545e-07, + "loss": 0.1047, + "step": 11529 + }, + { + "epoch": 0.9134482075658547, + "grad_norm": 2.2716094433711085, + "learning_rate": 3.9002801615558805e-07, + "loss": 0.2498, + "step": 11530 + }, + { + "epoch": 0.91352743117449, + "grad_norm": 1.7866964598227082, + "learning_rate": 3.893186756779366e-07, + "loss": 0.1751, + "step": 11531 + }, + { + "epoch": 0.9136066547831254, + "grad_norm": 1.3599024389724523, + "learning_rate": 3.886099680174049e-07, + "loss": 0.1578, + "step": 11532 + }, + { + "epoch": 0.9136858783917607, + "grad_norm": 1.4278646897137819, + "learning_rate": 3.879018932206624e-07, + "loss": 0.1609, + "step": 11533 + }, + { + "epoch": 0.9137651020003961, + "grad_norm": 1.7372751891319695, + "learning_rate": 3.871944513343284e-07, + "loss": 0.1903, + "step": 11534 + }, + { + "epoch": 0.9138443256090315, + "grad_norm": 1.4377724234209375, + "learning_rate": 3.864876424049857e-07, + "loss": 0.1413, + "step": 11535 + }, + { + "epoch": 0.9139235492176668, + "grad_norm": 1.223255049661162, + "learning_rate": 3.857814664791748e-07, + "loss": 0.0839, + "step": 11536 + }, + { + "epoch": 0.9140027728263023, + "grad_norm": 1.4495305857515581, + "learning_rate": 3.8507592360339407e-07, + "loss": 0.1394, + "step": 11537 + }, + { + "epoch": 0.9140819964349376, + "grad_norm": 1.6285867213681162, + "learning_rate": 3.843710138240997e-07, + "loss": 0.1442, + "step": 11538 + }, + { + "epoch": 0.914161220043573, + "grad_norm": 1.3411802527815546, + "learning_rate": 3.8366673718770564e-07, + "loss": 0.102, + "step": 11539 + }, + { + "epoch": 0.9142404436522084, + "grad_norm": 1.5876884342169022, + "learning_rate": 3.8296309374058704e-07, + "loss": 0.1504, + "step": 11540 + }, + { + "epoch": 0.9143196672608437, + "grad_norm": 1.7265560493276777, + "learning_rate": 3.8226008352907464e-07, + "loss": 0.1295, + "step": 11541 + }, + { + "epoch": 0.9143988908694791, + "grad_norm": 1.118984677423103, + "learning_rate": 3.815577065994569e-07, + "loss": 0.0891, + "step": 11542 + }, + { + "epoch": 0.9144781144781144, + "grad_norm": 2.5964336678687228, + "learning_rate": 3.8085596299798465e-07, + "loss": 0.1915, + "step": 11543 + }, + { + "epoch": 0.9145573380867499, + "grad_norm": 1.8021713531838894, + "learning_rate": 3.801548527708621e-07, + "loss": 0.1555, + "step": 11544 + }, + { + "epoch": 0.9146365616953852, + "grad_norm": 1.5575912487444539, + "learning_rate": 3.794543759642544e-07, + "loss": 0.1348, + "step": 11545 + }, + { + "epoch": 0.9147157853040206, + "grad_norm": 1.5610257957820701, + "learning_rate": 3.7875453262428584e-07, + "loss": 0.1445, + "step": 11546 + }, + { + "epoch": 0.914795008912656, + "grad_norm": 1.4484869564359644, + "learning_rate": 3.7805532279703625e-07, + "loss": 0.1232, + "step": 11547 + }, + { + "epoch": 0.9148742325212913, + "grad_norm": 1.4893814936190304, + "learning_rate": 3.773567465285455e-07, + "loss": 0.1386, + "step": 11548 + }, + { + "epoch": 0.9149534561299267, + "grad_norm": 1.9547831899284507, + "learning_rate": 3.7665880386481226e-07, + "loss": 0.1908, + "step": 11549 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 1.272824327408018, + "learning_rate": 3.759614948517931e-07, + "loss": 0.0918, + "step": 11550 + }, + { + "epoch": 0.9151119033471975, + "grad_norm": 1.6318645870591142, + "learning_rate": 3.7526481953539915e-07, + "loss": 0.1976, + "step": 11551 + }, + { + "epoch": 0.9151911269558328, + "grad_norm": 2.058839492962686, + "learning_rate": 3.74568777961507e-07, + "loss": 0.1649, + "step": 11552 + }, + { + "epoch": 0.9152703505644683, + "grad_norm": 1.505000230961725, + "learning_rate": 3.7387337017594674e-07, + "loss": 0.1312, + "step": 11553 + }, + { + "epoch": 0.9153495741731036, + "grad_norm": 1.3895035978217933, + "learning_rate": 3.7317859622450714e-07, + "loss": 0.1194, + "step": 11554 + }, + { + "epoch": 0.9154287977817389, + "grad_norm": 1.8297069473841547, + "learning_rate": 3.7248445615293506e-07, + "loss": 0.1553, + "step": 11555 + }, + { + "epoch": 0.9155080213903743, + "grad_norm": 1.7772629678778158, + "learning_rate": 3.7179095000693723e-07, + "loss": 0.1813, + "step": 11556 + }, + { + "epoch": 0.9155872449990097, + "grad_norm": 1.670179032618852, + "learning_rate": 3.710980778321771e-07, + "loss": 0.1478, + "step": 11557 + }, + { + "epoch": 0.9156664686076451, + "grad_norm": 1.6544428166526033, + "learning_rate": 3.70405839674276e-07, + "loss": 0.1106, + "step": 11558 + }, + { + "epoch": 0.9157456922162804, + "grad_norm": 2.075296757575573, + "learning_rate": 3.697142355788175e-07, + "loss": 0.1677, + "step": 11559 + }, + { + "epoch": 0.9158249158249159, + "grad_norm": 1.899923451301728, + "learning_rate": 3.6902326559133836e-07, + "loss": 0.2148, + "step": 11560 + }, + { + "epoch": 0.9159041394335512, + "grad_norm": 1.7522757224615029, + "learning_rate": 3.683329297573346e-07, + "loss": 0.1674, + "step": 11561 + }, + { + "epoch": 0.9159833630421865, + "grad_norm": 1.7573486374578033, + "learning_rate": 3.6764322812226416e-07, + "loss": 0.1267, + "step": 11562 + }, + { + "epoch": 0.916062586650822, + "grad_norm": 1.6285239229569703, + "learning_rate": 3.669541607315397e-07, + "loss": 0.1264, + "step": 11563 + }, + { + "epoch": 0.9161418102594573, + "grad_norm": 1.4735365135225411, + "learning_rate": 3.6626572763053034e-07, + "loss": 0.1486, + "step": 11564 + }, + { + "epoch": 0.9162210338680927, + "grad_norm": 1.9874340762709108, + "learning_rate": 3.6557792886457e-07, + "loss": 0.1427, + "step": 11565 + }, + { + "epoch": 0.916300257476728, + "grad_norm": 1.8464329164877902, + "learning_rate": 3.6489076447894456e-07, + "loss": 0.1761, + "step": 11566 + }, + { + "epoch": 0.9163794810853635, + "grad_norm": 1.4968989741471816, + "learning_rate": 3.642042345189023e-07, + "loss": 0.1648, + "step": 11567 + }, + { + "epoch": 0.9164587046939988, + "grad_norm": 1.6823491570843476, + "learning_rate": 3.6351833902964485e-07, + "loss": 0.1351, + "step": 11568 + }, + { + "epoch": 0.9165379283026341, + "grad_norm": 1.5933423637451072, + "learning_rate": 3.6283307805633714e-07, + "loss": 0.1931, + "step": 11569 + }, + { + "epoch": 0.9166171519112696, + "grad_norm": 1.383141472402163, + "learning_rate": 3.6214845164410205e-07, + "loss": 0.1039, + "step": 11570 + }, + { + "epoch": 0.9166963755199049, + "grad_norm": 1.766979450990567, + "learning_rate": 3.614644598380157e-07, + "loss": 0.187, + "step": 11571 + }, + { + "epoch": 0.9167755991285403, + "grad_norm": 1.8081345949348906, + "learning_rate": 3.607811026831176e-07, + "loss": 0.2761, + "step": 11572 + }, + { + "epoch": 0.9168548227371757, + "grad_norm": 1.617022495769536, + "learning_rate": 3.600983802244007e-07, + "loss": 0.1534, + "step": 11573 + }, + { + "epoch": 0.9169340463458111, + "grad_norm": 1.5527745764611558, + "learning_rate": 3.594162925068234e-07, + "loss": 0.1194, + "step": 11574 + }, + { + "epoch": 0.9170132699544464, + "grad_norm": 1.8497763853159939, + "learning_rate": 3.587348395752954e-07, + "loss": 0.172, + "step": 11575 + }, + { + "epoch": 0.9170924935630818, + "grad_norm": 1.7865374395055618, + "learning_rate": 3.5805402147468746e-07, + "loss": 0.2308, + "step": 11576 + }, + { + "epoch": 0.9171717171717172, + "grad_norm": 1.812559172587517, + "learning_rate": 3.573738382498271e-07, + "loss": 0.1521, + "step": 11577 + }, + { + "epoch": 0.9172509407803525, + "grad_norm": 1.5404171224819538, + "learning_rate": 3.566942899455039e-07, + "loss": 0.1433, + "step": 11578 + }, + { + "epoch": 0.917330164388988, + "grad_norm": 1.3737733339516849, + "learning_rate": 3.5601537660646e-07, + "loss": 0.0984, + "step": 11579 + }, + { + "epoch": 0.9174093879976233, + "grad_norm": 1.4178263612374546, + "learning_rate": 3.553370982773985e-07, + "loss": 0.1234, + "step": 11580 + }, + { + "epoch": 0.9174886116062587, + "grad_norm": 1.3987086617889926, + "learning_rate": 3.546594550029836e-07, + "loss": 0.1506, + "step": 11581 + }, + { + "epoch": 0.917567835214894, + "grad_norm": 1.3053407349738233, + "learning_rate": 3.53982446827833e-07, + "loss": 0.1261, + "step": 11582 + }, + { + "epoch": 0.9176470588235294, + "grad_norm": 1.2981639049950129, + "learning_rate": 3.533060737965244e-07, + "loss": 0.129, + "step": 11583 + }, + { + "epoch": 0.9177262824321648, + "grad_norm": 1.7152670182804346, + "learning_rate": 3.526303359535932e-07, + "loss": 0.1296, + "step": 11584 + }, + { + "epoch": 0.9178055060408001, + "grad_norm": 1.4413341806356037, + "learning_rate": 3.519552333435361e-07, + "loss": 0.1483, + "step": 11585 + }, + { + "epoch": 0.9178847296494356, + "grad_norm": 1.5133653871837465, + "learning_rate": 3.5128076601080087e-07, + "loss": 0.1111, + "step": 11586 + }, + { + "epoch": 0.9179639532580709, + "grad_norm": 1.6782389871616465, + "learning_rate": 3.5060693399980194e-07, + "loss": 0.1927, + "step": 11587 + }, + { + "epoch": 0.9180431768667062, + "grad_norm": 1.4103115490126616, + "learning_rate": 3.499337373549072e-07, + "loss": 0.1214, + "step": 11588 + }, + { + "epoch": 0.9181224004753417, + "grad_norm": 2.0548603485890364, + "learning_rate": 3.4926117612044117e-07, + "loss": 0.1626, + "step": 11589 + }, + { + "epoch": 0.918201624083977, + "grad_norm": 1.772403923619835, + "learning_rate": 3.485892503406907e-07, + "loss": 0.1552, + "step": 11590 + }, + { + "epoch": 0.9182808476926124, + "grad_norm": 1.5935080185895591, + "learning_rate": 3.4791796005989917e-07, + "loss": 0.1519, + "step": 11591 + }, + { + "epoch": 0.9183600713012477, + "grad_norm": 1.5556321116925378, + "learning_rate": 3.4724730532226693e-07, + "loss": 0.1573, + "step": 11592 + }, + { + "epoch": 0.9184392949098832, + "grad_norm": 1.6362103712809155, + "learning_rate": 3.4657728617195295e-07, + "loss": 0.1648, + "step": 11593 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 1.2292794872086064, + "learning_rate": 3.459079026530754e-07, + "loss": 0.1077, + "step": 11594 + }, + { + "epoch": 0.9185977421271538, + "grad_norm": 1.62138625214792, + "learning_rate": 3.4523915480971113e-07, + "loss": 0.1229, + "step": 11595 + }, + { + "epoch": 0.9186769657357893, + "grad_norm": 1.7245256202720183, + "learning_rate": 3.445710426858906e-07, + "loss": 0.1765, + "step": 11596 + }, + { + "epoch": 0.9187561893444246, + "grad_norm": 1.3099747045657828, + "learning_rate": 3.439035663256096e-07, + "loss": 0.1182, + "step": 11597 + }, + { + "epoch": 0.91883541295306, + "grad_norm": 1.7343117225883748, + "learning_rate": 3.4323672577281754e-07, + "loss": 0.1789, + "step": 11598 + }, + { + "epoch": 0.9189146365616954, + "grad_norm": 2.14604571805114, + "learning_rate": 3.425705210714192e-07, + "loss": 0.1285, + "step": 11599 + }, + { + "epoch": 0.9189938601703308, + "grad_norm": 2.048265676265873, + "learning_rate": 3.419049522652851e-07, + "loss": 0.1955, + "step": 11600 + }, + { + "epoch": 0.9190730837789661, + "grad_norm": 1.3961606142733436, + "learning_rate": 3.412400193982379e-07, + "loss": 0.1103, + "step": 11601 + }, + { + "epoch": 0.9191523073876015, + "grad_norm": 1.5634537347847361, + "learning_rate": 3.4057572251405936e-07, + "loss": 0.1258, + "step": 11602 + }, + { + "epoch": 0.9192315309962369, + "grad_norm": 1.9822298179705902, + "learning_rate": 3.3991206165649213e-07, + "loss": 0.1528, + "step": 11603 + }, + { + "epoch": 0.9193107546048722, + "grad_norm": 1.4833246757830865, + "learning_rate": 3.392490368692347e-07, + "loss": 0.1108, + "step": 11604 + }, + { + "epoch": 0.9193899782135077, + "grad_norm": 2.0044503593001153, + "learning_rate": 3.385866481959432e-07, + "loss": 0.1768, + "step": 11605 + }, + { + "epoch": 0.919469201822143, + "grad_norm": 1.2465074971948669, + "learning_rate": 3.379248956802328e-07, + "loss": 0.1063, + "step": 11606 + }, + { + "epoch": 0.9195484254307784, + "grad_norm": 1.7156708888361036, + "learning_rate": 3.3726377936567856e-07, + "loss": 0.1476, + "step": 11607 + }, + { + "epoch": 0.9196276490394137, + "grad_norm": 2.896158622026451, + "learning_rate": 3.3660329929580904e-07, + "loss": 0.1596, + "step": 11608 + }, + { + "epoch": 0.9197068726480491, + "grad_norm": 1.3472800425011235, + "learning_rate": 3.3594345551411503e-07, + "loss": 0.1053, + "step": 11609 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 1.1842206137896167, + "learning_rate": 3.352842480640439e-07, + "loss": 0.1042, + "step": 11610 + }, + { + "epoch": 0.9198653198653198, + "grad_norm": 1.1964695718924736, + "learning_rate": 3.346256769890022e-07, + "loss": 0.1037, + "step": 11611 + }, + { + "epoch": 0.9199445434739553, + "grad_norm": 1.3335666224752123, + "learning_rate": 3.3396774233235173e-07, + "loss": 0.0832, + "step": 11612 + }, + { + "epoch": 0.9200237670825906, + "grad_norm": 1.5331923588499068, + "learning_rate": 3.333104441374158e-07, + "loss": 0.2056, + "step": 11613 + }, + { + "epoch": 0.920102990691226, + "grad_norm": 1.7182370851103397, + "learning_rate": 3.32653782447474e-07, + "loss": 0.167, + "step": 11614 + }, + { + "epoch": 0.9201822142998614, + "grad_norm": 1.2212365012362176, + "learning_rate": 3.319977573057642e-07, + "loss": 0.129, + "step": 11615 + }, + { + "epoch": 0.9202614379084967, + "grad_norm": 1.8317428040916024, + "learning_rate": 3.313423687554829e-07, + "loss": 0.2041, + "step": 11616 + }, + { + "epoch": 0.9203406615171321, + "grad_norm": 1.7281923692129009, + "learning_rate": 3.3068761683978434e-07, + "loss": 0.0947, + "step": 11617 + }, + { + "epoch": 0.9204198851257674, + "grad_norm": 1.5338419037822246, + "learning_rate": 3.3003350160177974e-07, + "loss": 0.1464, + "step": 11618 + }, + { + "epoch": 0.9204991087344029, + "grad_norm": 1.3389794976151554, + "learning_rate": 3.293800230845412e-07, + "loss": 0.1056, + "step": 11619 + }, + { + "epoch": 0.9205783323430382, + "grad_norm": 1.3275207160487668, + "learning_rate": 3.287271813310955e-07, + "loss": 0.1365, + "step": 11620 + }, + { + "epoch": 0.9206575559516736, + "grad_norm": 1.3843883194498903, + "learning_rate": 3.280749763844293e-07, + "loss": 0.0964, + "step": 11621 + }, + { + "epoch": 0.920736779560309, + "grad_norm": 1.5719911462765894, + "learning_rate": 3.274234082874872e-07, + "loss": 0.152, + "step": 11622 + }, + { + "epoch": 0.9208160031689443, + "grad_norm": 1.874855980187166, + "learning_rate": 3.267724770831737e-07, + "loss": 0.1273, + "step": 11623 + }, + { + "epoch": 0.9208952267775797, + "grad_norm": 1.6198544459298125, + "learning_rate": 3.2612218281434794e-07, + "loss": 0.1575, + "step": 11624 + }, + { + "epoch": 0.9209744503862151, + "grad_norm": 1.763212027577717, + "learning_rate": 3.254725255238267e-07, + "loss": 0.2235, + "step": 11625 + }, + { + "epoch": 0.9210536739948505, + "grad_norm": 1.6911522138304946, + "learning_rate": 3.2482350525439023e-07, + "loss": 0.1513, + "step": 11626 + }, + { + "epoch": 0.9211328976034858, + "grad_norm": 1.456452226797364, + "learning_rate": 3.241751220487721e-07, + "loss": 0.1372, + "step": 11627 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 1.4405901795807932, + "learning_rate": 3.235273759496638e-07, + "loss": 0.1307, + "step": 11628 + }, + { + "epoch": 0.9212913448207566, + "grad_norm": 1.0882347880236594, + "learning_rate": 3.2288026699971884e-07, + "loss": 0.0769, + "step": 11629 + }, + { + "epoch": 0.9213705684293919, + "grad_norm": 1.3426771319618553, + "learning_rate": 3.222337952415455e-07, + "loss": 0.1046, + "step": 11630 + }, + { + "epoch": 0.9214497920380273, + "grad_norm": 1.5799093221154634, + "learning_rate": 3.215879607177086e-07, + "loss": 0.1304, + "step": 11631 + }, + { + "epoch": 0.9215290156466627, + "grad_norm": 1.968446649826686, + "learning_rate": 3.2094276347073626e-07, + "loss": 0.2047, + "step": 11632 + }, + { + "epoch": 0.9216082392552981, + "grad_norm": 2.280575118186498, + "learning_rate": 3.2029820354311014e-07, + "loss": 0.1311, + "step": 11633 + }, + { + "epoch": 0.9216874628639334, + "grad_norm": 1.4690123540354785, + "learning_rate": 3.196542809772707e-07, + "loss": 0.1572, + "step": 11634 + }, + { + "epoch": 0.9217666864725689, + "grad_norm": 1.5379390361554928, + "learning_rate": 3.1901099581561846e-07, + "loss": 0.1375, + "step": 11635 + }, + { + "epoch": 0.9218459100812042, + "grad_norm": 1.2376937629579075, + "learning_rate": 3.183683481005106e-07, + "loss": 0.1077, + "step": 11636 + }, + { + "epoch": 0.9219251336898395, + "grad_norm": 1.4936200104044857, + "learning_rate": 3.1772633787426233e-07, + "loss": 0.1323, + "step": 11637 + }, + { + "epoch": 0.922004357298475, + "grad_norm": 1.7174130093389426, + "learning_rate": 3.1708496517914523e-07, + "loss": 0.2056, + "step": 11638 + }, + { + "epoch": 0.9220835809071103, + "grad_norm": 1.4418865168568653, + "learning_rate": 3.1644423005739335e-07, + "loss": 0.1508, + "step": 11639 + }, + { + "epoch": 0.9221628045157457, + "grad_norm": 1.316743610570187, + "learning_rate": 3.15804132551194e-07, + "loss": 0.1196, + "step": 11640 + }, + { + "epoch": 0.922242028124381, + "grad_norm": 1.7073826396258642, + "learning_rate": 3.151646727026947e-07, + "loss": 0.1647, + "step": 11641 + }, + { + "epoch": 0.9223212517330165, + "grad_norm": 1.6376988051497907, + "learning_rate": 3.1452585055400167e-07, + "loss": 0.1989, + "step": 11642 + }, + { + "epoch": 0.9224004753416518, + "grad_norm": 1.4923469373310065, + "learning_rate": 3.138876661471779e-07, + "loss": 0.1281, + "step": 11643 + }, + { + "epoch": 0.9224796989502871, + "grad_norm": 1.2899564588739745, + "learning_rate": 3.1325011952424435e-07, + "loss": 0.1175, + "step": 11644 + }, + { + "epoch": 0.9225589225589226, + "grad_norm": 1.7718355652328004, + "learning_rate": 3.1261321072718063e-07, + "loss": 0.1712, + "step": 11645 + }, + { + "epoch": 0.9226381461675579, + "grad_norm": 1.6351016246175194, + "learning_rate": 3.1197693979792556e-07, + "loss": 0.1519, + "step": 11646 + }, + { + "epoch": 0.9227173697761933, + "grad_norm": 1.628244352165676, + "learning_rate": 3.1134130677837103e-07, + "loss": 0.127, + "step": 11647 + }, + { + "epoch": 0.9227965933848287, + "grad_norm": 1.5151095923441855, + "learning_rate": 3.107063117103759e-07, + "loss": 0.0792, + "step": 11648 + }, + { + "epoch": 0.9228758169934641, + "grad_norm": 2.4022307320517458, + "learning_rate": 3.100719546357467e-07, + "loss": 0.1349, + "step": 11649 + }, + { + "epoch": 0.9229550406020994, + "grad_norm": 1.6479057999461215, + "learning_rate": 3.0943823559625217e-07, + "loss": 0.1886, + "step": 11650 + }, + { + "epoch": 0.9230342642107348, + "grad_norm": 1.8424348287345138, + "learning_rate": 3.088051546336246e-07, + "loss": 0.2588, + "step": 11651 + }, + { + "epoch": 0.9231134878193702, + "grad_norm": 1.6874009228266151, + "learning_rate": 3.08172711789545e-07, + "loss": 0.1619, + "step": 11652 + }, + { + "epoch": 0.9231927114280055, + "grad_norm": 1.5086799190636198, + "learning_rate": 3.0754090710565785e-07, + "loss": 0.2206, + "step": 11653 + }, + { + "epoch": 0.923271935036641, + "grad_norm": 1.7584008865145941, + "learning_rate": 3.069097406235666e-07, + "loss": 0.1813, + "step": 11654 + }, + { + "epoch": 0.9233511586452763, + "grad_norm": 1.307879113416923, + "learning_rate": 3.0627921238482794e-07, + "loss": 0.148, + "step": 11655 + }, + { + "epoch": 0.9234303822539117, + "grad_norm": 1.791320729241961, + "learning_rate": 3.056493224309587e-07, + "loss": 0.1778, + "step": 11656 + }, + { + "epoch": 0.923509605862547, + "grad_norm": 1.9374329520498341, + "learning_rate": 3.0502007080343675e-07, + "loss": 0.167, + "step": 11657 + }, + { + "epoch": 0.9235888294711824, + "grad_norm": 1.3895261273343018, + "learning_rate": 3.043914575436946e-07, + "loss": 0.1162, + "step": 11658 + }, + { + "epoch": 0.9236680530798178, + "grad_norm": 1.3992705024143886, + "learning_rate": 3.0376348269312017e-07, + "loss": 0.1089, + "step": 11659 + }, + { + "epoch": 0.9237472766884531, + "grad_norm": 1.3016853448378236, + "learning_rate": 3.031361462930671e-07, + "loss": 0.1353, + "step": 11660 + }, + { + "epoch": 0.9238265002970886, + "grad_norm": 2.241201197216433, + "learning_rate": 3.025094483848401e-07, + "loss": 0.164, + "step": 11661 + }, + { + "epoch": 0.9239057239057239, + "grad_norm": 1.8320411760526856, + "learning_rate": 3.0188338900970505e-07, + "loss": 0.2267, + "step": 11662 + }, + { + "epoch": 0.9239849475143593, + "grad_norm": 1.4397073178874091, + "learning_rate": 3.0125796820888343e-07, + "loss": 0.1176, + "step": 11663 + }, + { + "epoch": 0.9240641711229947, + "grad_norm": 1.4271924456601242, + "learning_rate": 3.0063318602355787e-07, + "loss": 0.1182, + "step": 11664 + }, + { + "epoch": 0.92414339473163, + "grad_norm": 1.8845527000929638, + "learning_rate": 3.000090424948665e-07, + "loss": 0.1804, + "step": 11665 + }, + { + "epoch": 0.9242226183402654, + "grad_norm": 2.2425951312324215, + "learning_rate": 2.993855376639054e-07, + "loss": 0.2159, + "step": 11666 + }, + { + "epoch": 0.9243018419489007, + "grad_norm": 1.778330137799533, + "learning_rate": 2.987626715717318e-07, + "loss": 0.2485, + "step": 11667 + }, + { + "epoch": 0.9243810655575362, + "grad_norm": 1.5228936539295022, + "learning_rate": 2.9814044425935605e-07, + "loss": 0.1393, + "step": 11668 + }, + { + "epoch": 0.9244602891661715, + "grad_norm": 1.3641193094591961, + "learning_rate": 2.9751885576774887e-07, + "loss": 0.1682, + "step": 11669 + }, + { + "epoch": 0.9245395127748068, + "grad_norm": 1.4390503180778005, + "learning_rate": 2.9689790613784073e-07, + "loss": 0.1204, + "step": 11670 + }, + { + "epoch": 0.9246187363834423, + "grad_norm": 1.675419583148962, + "learning_rate": 2.962775954105179e-07, + "loss": 0.1268, + "step": 11671 + }, + { + "epoch": 0.9246979599920776, + "grad_norm": 1.2661279475292775, + "learning_rate": 2.9565792362662213e-07, + "loss": 0.0962, + "step": 11672 + }, + { + "epoch": 0.924777183600713, + "grad_norm": 1.8175015118198954, + "learning_rate": 2.9503889082695967e-07, + "loss": 0.164, + "step": 11673 + }, + { + "epoch": 0.9248564072093484, + "grad_norm": 1.6461564442567023, + "learning_rate": 2.9442049705228794e-07, + "loss": 0.0976, + "step": 11674 + }, + { + "epoch": 0.9249356308179838, + "grad_norm": 1.505026992419722, + "learning_rate": 2.938027423433254e-07, + "loss": 0.1249, + "step": 11675 + }, + { + "epoch": 0.9250148544266191, + "grad_norm": 1.5764070871938913, + "learning_rate": 2.931856267407507e-07, + "loss": 0.1244, + "step": 11676 + }, + { + "epoch": 0.9250940780352545, + "grad_norm": 1.708192777427909, + "learning_rate": 2.9256915028519575e-07, + "loss": 0.1479, + "step": 11677 + }, + { + "epoch": 0.9251733016438899, + "grad_norm": 1.904073161409538, + "learning_rate": 2.919533130172536e-07, + "loss": 0.1764, + "step": 11678 + }, + { + "epoch": 0.9252525252525252, + "grad_norm": 1.5731454496797521, + "learning_rate": 2.913381149774719e-07, + "loss": 0.1007, + "step": 11679 + }, + { + "epoch": 0.9253317488611607, + "grad_norm": 1.232005169787576, + "learning_rate": 2.907235562063615e-07, + "loss": 0.1047, + "step": 11680 + }, + { + "epoch": 0.925410972469796, + "grad_norm": 2.1321905265613723, + "learning_rate": 2.9010963674438674e-07, + "loss": 0.3028, + "step": 11681 + }, + { + "epoch": 0.9254901960784314, + "grad_norm": 1.5450402958356704, + "learning_rate": 2.8949635663197087e-07, + "loss": 0.1288, + "step": 11682 + }, + { + "epoch": 0.9255694196870667, + "grad_norm": 1.3152807694139295, + "learning_rate": 2.8888371590949703e-07, + "loss": 0.114, + "step": 11683 + }, + { + "epoch": 0.9256486432957021, + "grad_norm": 1.5908223031267135, + "learning_rate": 2.882717146173031e-07, + "loss": 0.1605, + "step": 11684 + }, + { + "epoch": 0.9257278669043375, + "grad_norm": 1.744822215313669, + "learning_rate": 2.8766035279568563e-07, + "loss": 0.1654, + "step": 11685 + }, + { + "epoch": 0.9258070905129728, + "grad_norm": 1.4112028815113598, + "learning_rate": 2.8704963048490243e-07, + "loss": 0.1136, + "step": 11686 + }, + { + "epoch": 0.9258863141216083, + "grad_norm": 1.3275109723200844, + "learning_rate": 2.864395477251658e-07, + "loss": 0.1271, + "step": 11687 + }, + { + "epoch": 0.9259655377302436, + "grad_norm": 1.9629416281676029, + "learning_rate": 2.858301045566447e-07, + "loss": 0.1316, + "step": 11688 + }, + { + "epoch": 0.926044761338879, + "grad_norm": 1.5980592306847683, + "learning_rate": 2.8522130101947045e-07, + "loss": 0.1265, + "step": 11689 + }, + { + "epoch": 0.9261239849475144, + "grad_norm": 1.4230943112183498, + "learning_rate": 2.8461313715372976e-07, + "loss": 0.0955, + "step": 11690 + }, + { + "epoch": 0.9262032085561497, + "grad_norm": 1.8304223296406066, + "learning_rate": 2.8400561299946503e-07, + "loss": 0.1766, + "step": 11691 + }, + { + "epoch": 0.9262824321647851, + "grad_norm": 1.7353593044395943, + "learning_rate": 2.8339872859668103e-07, + "loss": 0.1552, + "step": 11692 + }, + { + "epoch": 0.9263616557734204, + "grad_norm": 1.9076415849588544, + "learning_rate": 2.82792483985338e-07, + "loss": 0.1213, + "step": 11693 + }, + { + "epoch": 0.9264408793820559, + "grad_norm": 1.8558378166517273, + "learning_rate": 2.8218687920535395e-07, + "loss": 0.1345, + "step": 11694 + }, + { + "epoch": 0.9265201029906912, + "grad_norm": 1.4577450763751478, + "learning_rate": 2.8158191429660364e-07, + "loss": 0.1095, + "step": 11695 + }, + { + "epoch": 0.9265993265993266, + "grad_norm": 1.4249962356324197, + "learning_rate": 2.8097758929892196e-07, + "loss": 0.1354, + "step": 11696 + }, + { + "epoch": 0.926678550207962, + "grad_norm": 1.5769509722267274, + "learning_rate": 2.803739042521025e-07, + "loss": 0.1524, + "step": 11697 + }, + { + "epoch": 0.9267577738165973, + "grad_norm": 1.644574895200532, + "learning_rate": 2.7977085919589253e-07, + "loss": 0.1624, + "step": 11698 + }, + { + "epoch": 0.9268369974252327, + "grad_norm": 1.8638902429061044, + "learning_rate": 2.791684541700013e-07, + "loss": 0.1791, + "step": 11699 + }, + { + "epoch": 0.9269162210338681, + "grad_norm": 1.5134004418321778, + "learning_rate": 2.785666892140937e-07, + "loss": 0.1345, + "step": 11700 + }, + { + "epoch": 0.9269954446425035, + "grad_norm": 1.3544409921124534, + "learning_rate": 2.7796556436779144e-07, + "loss": 0.1125, + "step": 11701 + }, + { + "epoch": 0.9270746682511388, + "grad_norm": 1.334768128109655, + "learning_rate": 2.773650796706795e-07, + "loss": 0.1241, + "step": 11702 + }, + { + "epoch": 0.9271538918597743, + "grad_norm": 2.256049474205574, + "learning_rate": 2.7676523516229404e-07, + "loss": 0.1636, + "step": 11703 + }, + { + "epoch": 0.9272331154684096, + "grad_norm": 1.2334360930280261, + "learning_rate": 2.7616603088213126e-07, + "loss": 0.093, + "step": 11704 + }, + { + "epoch": 0.9273123390770449, + "grad_norm": 1.8538842423884008, + "learning_rate": 2.755674668696495e-07, + "loss": 0.1198, + "step": 11705 + }, + { + "epoch": 0.9273915626856803, + "grad_norm": 1.7571718732385617, + "learning_rate": 2.749695431642574e-07, + "loss": 0.1522, + "step": 11706 + }, + { + "epoch": 0.9274707862943157, + "grad_norm": 1.6408522026123453, + "learning_rate": 2.743722598053278e-07, + "loss": 0.1488, + "step": 11707 + }, + { + "epoch": 0.9275500099029511, + "grad_norm": 1.5240448412180032, + "learning_rate": 2.737756168321881e-07, + "loss": 0.1568, + "step": 11708 + }, + { + "epoch": 0.9276292335115864, + "grad_norm": 1.6592881811409121, + "learning_rate": 2.7317961428412475e-07, + "loss": 0.117, + "step": 11709 + }, + { + "epoch": 0.9277084571202219, + "grad_norm": 2.0577890108814145, + "learning_rate": 2.7258425220038077e-07, + "loss": 0.2154, + "step": 11710 + }, + { + "epoch": 0.9277876807288572, + "grad_norm": 1.522763535952322, + "learning_rate": 2.719895306201581e-07, + "loss": 0.158, + "step": 11711 + }, + { + "epoch": 0.9278669043374925, + "grad_norm": 2.1385756090334054, + "learning_rate": 2.7139544958261765e-07, + "loss": 0.1965, + "step": 11712 + }, + { + "epoch": 0.927946127946128, + "grad_norm": 1.5491272044064366, + "learning_rate": 2.7080200912687484e-07, + "loss": 0.1211, + "step": 11713 + }, + { + "epoch": 0.9280253515547633, + "grad_norm": 1.7967937233245501, + "learning_rate": 2.702092092920061e-07, + "loss": 0.1519, + "step": 11714 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 2.0472651051884556, + "learning_rate": 2.6961705011704475e-07, + "loss": 0.2119, + "step": 11715 + }, + { + "epoch": 0.928183798772034, + "grad_norm": 1.8568698084885338, + "learning_rate": 2.6902553164098065e-07, + "loss": 0.1477, + "step": 11716 + }, + { + "epoch": 0.9282630223806695, + "grad_norm": 1.8856808832454568, + "learning_rate": 2.684346539027616e-07, + "loss": 0.1985, + "step": 11717 + }, + { + "epoch": 0.9283422459893048, + "grad_norm": 1.3754590570852714, + "learning_rate": 2.6784441694129747e-07, + "loss": 0.1434, + "step": 11718 + }, + { + "epoch": 0.9284214695979401, + "grad_norm": 1.5196496708283196, + "learning_rate": 2.672548207954495e-07, + "loss": 0.1058, + "step": 11719 + }, + { + "epoch": 0.9285006932065756, + "grad_norm": 1.904537390083598, + "learning_rate": 2.6666586550403884e-07, + "loss": 0.2214, + "step": 11720 + }, + { + "epoch": 0.9285799168152109, + "grad_norm": 1.547897253552797, + "learning_rate": 2.6607755110584886e-07, + "loss": 0.2243, + "step": 11721 + }, + { + "epoch": 0.9286591404238463, + "grad_norm": 1.9861157922471282, + "learning_rate": 2.654898776396164e-07, + "loss": 0.1466, + "step": 11722 + }, + { + "epoch": 0.9287383640324817, + "grad_norm": 1.3037673283245397, + "learning_rate": 2.64902845144035e-07, + "loss": 0.1046, + "step": 11723 + }, + { + "epoch": 0.9288175876411171, + "grad_norm": 1.5569984170578262, + "learning_rate": 2.6431645365775806e-07, + "loss": 0.1834, + "step": 11724 + }, + { + "epoch": 0.9288968112497524, + "grad_norm": 1.6671195524416909, + "learning_rate": 2.637307032193992e-07, + "loss": 0.1805, + "step": 11725 + }, + { + "epoch": 0.9289760348583878, + "grad_norm": 1.8116149059344988, + "learning_rate": 2.6314559386752423e-07, + "loss": 0.1773, + "step": 11726 + }, + { + "epoch": 0.9290552584670232, + "grad_norm": 1.6679532697430413, + "learning_rate": 2.6256112564066236e-07, + "loss": 0.1572, + "step": 11727 + }, + { + "epoch": 0.9291344820756585, + "grad_norm": 1.2627075249199327, + "learning_rate": 2.6197729857729617e-07, + "loss": 0.1052, + "step": 11728 + }, + { + "epoch": 0.929213705684294, + "grad_norm": 1.873068512818658, + "learning_rate": 2.613941127158681e-07, + "loss": 0.1858, + "step": 11729 + }, + { + "epoch": 0.9292929292929293, + "grad_norm": 1.7520818616751512, + "learning_rate": 2.608115680947787e-07, + "loss": 0.0973, + "step": 11730 + }, + { + "epoch": 0.9293721529015647, + "grad_norm": 1.372344191359215, + "learning_rate": 2.602296647523861e-07, + "loss": 0.1562, + "step": 11731 + }, + { + "epoch": 0.9294513765102, + "grad_norm": 1.5803983238554975, + "learning_rate": 2.596484027270041e-07, + "loss": 0.1211, + "step": 11732 + }, + { + "epoch": 0.9295306001188354, + "grad_norm": 1.6961601693365884, + "learning_rate": 2.5906778205690876e-07, + "loss": 0.1615, + "step": 11733 + }, + { + "epoch": 0.9296098237274708, + "grad_norm": 1.6323890919766126, + "learning_rate": 2.5848780278032836e-07, + "loss": 0.1163, + "step": 11734 + }, + { + "epoch": 0.9296890473361061, + "grad_norm": 1.93391318545221, + "learning_rate": 2.579084649354546e-07, + "loss": 0.1493, + "step": 11735 + }, + { + "epoch": 0.9297682709447416, + "grad_norm": 1.23711990802364, + "learning_rate": 2.5732976856043034e-07, + "loss": 0.1228, + "step": 11736 + }, + { + "epoch": 0.9298474945533769, + "grad_norm": 1.402640321770987, + "learning_rate": 2.5675171369336284e-07, + "loss": 0.0972, + "step": 11737 + }, + { + "epoch": 0.9299267181620123, + "grad_norm": 1.874691499538644, + "learning_rate": 2.5617430037231495e-07, + "loss": 0.2169, + "step": 11738 + }, + { + "epoch": 0.9300059417706477, + "grad_norm": 1.503266271582883, + "learning_rate": 2.5559752863530295e-07, + "loss": 0.134, + "step": 11739 + }, + { + "epoch": 0.930085165379283, + "grad_norm": 1.30202069378424, + "learning_rate": 2.550213985203076e-07, + "loss": 0.1195, + "step": 11740 + }, + { + "epoch": 0.9301643889879184, + "grad_norm": 2.0467902772257935, + "learning_rate": 2.54445910065263e-07, + "loss": 0.2347, + "step": 11741 + }, + { + "epoch": 0.9302436125965537, + "grad_norm": 1.556236904715564, + "learning_rate": 2.538710633080621e-07, + "loss": 0.1348, + "step": 11742 + }, + { + "epoch": 0.9303228362051892, + "grad_norm": 1.5966395550494747, + "learning_rate": 2.5329685828655803e-07, + "loss": 0.136, + "step": 11743 + }, + { + "epoch": 0.9304020598138245, + "grad_norm": 1.5473971958105641, + "learning_rate": 2.527232950385572e-07, + "loss": 0.127, + "step": 11744 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 1.5703237930725045, + "learning_rate": 2.521503736018249e-07, + "loss": 0.1049, + "step": 11745 + }, + { + "epoch": 0.9305605070310953, + "grad_norm": 1.448853025755151, + "learning_rate": 2.5157809401408775e-07, + "loss": 0.0826, + "step": 11746 + }, + { + "epoch": 0.9306397306397306, + "grad_norm": 2.0281940952923128, + "learning_rate": 2.510064563130277e-07, + "loss": 0.2286, + "step": 11747 + }, + { + "epoch": 0.930718954248366, + "grad_norm": 1.551753253328473, + "learning_rate": 2.5043546053628245e-07, + "loss": 0.1233, + "step": 11748 + }, + { + "epoch": 0.9307981778570014, + "grad_norm": 1.441535896794083, + "learning_rate": 2.498651067214497e-07, + "loss": 0.1469, + "step": 11749 + }, + { + "epoch": 0.9308774014656368, + "grad_norm": 1.6904437906659537, + "learning_rate": 2.4929539490608614e-07, + "loss": 0.142, + "step": 11750 + }, + { + "epoch": 0.9309566250742721, + "grad_norm": 1.6465222213549568, + "learning_rate": 2.487263251277028e-07, + "loss": 0.1839, + "step": 11751 + }, + { + "epoch": 0.9310358486829075, + "grad_norm": 1.2254054174644144, + "learning_rate": 2.481578974237697e-07, + "loss": 0.1133, + "step": 11752 + }, + { + "epoch": 0.9311150722915429, + "grad_norm": 1.437412364689459, + "learning_rate": 2.475901118317181e-07, + "loss": 0.1271, + "step": 11753 + }, + { + "epoch": 0.9311942959001782, + "grad_norm": 2.473454675057739, + "learning_rate": 2.4702296838893134e-07, + "loss": 0.1931, + "step": 11754 + }, + { + "epoch": 0.9312735195088137, + "grad_norm": 1.5278442789807862, + "learning_rate": 2.464564671327529e-07, + "loss": 0.1404, + "step": 11755 + }, + { + "epoch": 0.931352743117449, + "grad_norm": 2.442156424732724, + "learning_rate": 2.4589060810048635e-07, + "loss": 0.207, + "step": 11756 + }, + { + "epoch": 0.9314319667260844, + "grad_norm": 1.7555572369017296, + "learning_rate": 2.453253913293896e-07, + "loss": 0.195, + "step": 11757 + }, + { + "epoch": 0.9315111903347197, + "grad_norm": 1.5702849739689868, + "learning_rate": 2.447608168566784e-07, + "loss": 0.1222, + "step": 11758 + }, + { + "epoch": 0.9315904139433551, + "grad_norm": 1.791227537014577, + "learning_rate": 2.441968847195286e-07, + "loss": 0.155, + "step": 11759 + }, + { + "epoch": 0.9316696375519905, + "grad_norm": 1.5921267912218913, + "learning_rate": 2.4363359495507166e-07, + "loss": 0.1542, + "step": 11760 + }, + { + "epoch": 0.9317488611606258, + "grad_norm": 1.5137004254318036, + "learning_rate": 2.430709476003978e-07, + "loss": 0.1184, + "step": 11761 + }, + { + "epoch": 0.9318280847692613, + "grad_norm": 1.537585032324018, + "learning_rate": 2.425089426925553e-07, + "loss": 0.1103, + "step": 11762 + }, + { + "epoch": 0.9319073083778966, + "grad_norm": 1.166210295983881, + "learning_rate": 2.419475802685489e-07, + "loss": 0.1257, + "step": 11763 + }, + { + "epoch": 0.931986531986532, + "grad_norm": 1.5056680088985932, + "learning_rate": 2.413868603653413e-07, + "loss": 0.1432, + "step": 11764 + }, + { + "epoch": 0.9320657555951674, + "grad_norm": 1.7107389531715969, + "learning_rate": 2.4082678301985297e-07, + "loss": 0.1635, + "step": 11765 + }, + { + "epoch": 0.9321449792038027, + "grad_norm": 1.7032793824361265, + "learning_rate": 2.402673482689633e-07, + "loss": 0.1455, + "step": 11766 + }, + { + "epoch": 0.9322242028124381, + "grad_norm": 1.7789611557520117, + "learning_rate": 2.3970855614950827e-07, + "loss": 0.1684, + "step": 11767 + }, + { + "epoch": 0.9323034264210734, + "grad_norm": 1.2485721638856666, + "learning_rate": 2.3915040669828084e-07, + "loss": 0.116, + "step": 11768 + }, + { + "epoch": 0.9323826500297089, + "grad_norm": 1.5400855527799107, + "learning_rate": 2.385928999520326e-07, + "loss": 0.1674, + "step": 11769 + }, + { + "epoch": 0.9324618736383442, + "grad_norm": 1.8110532243877115, + "learning_rate": 2.3803603594747427e-07, + "loss": 0.1637, + "step": 11770 + }, + { + "epoch": 0.9325410972469796, + "grad_norm": 1.3892586061718182, + "learning_rate": 2.374798147212698e-07, + "loss": 0.0876, + "step": 11771 + }, + { + "epoch": 0.932620320855615, + "grad_norm": 1.6998567675729621, + "learning_rate": 2.3692423631004658e-07, + "loss": 0.1526, + "step": 11772 + }, + { + "epoch": 0.9326995444642503, + "grad_norm": 1.5728413231250775, + "learning_rate": 2.3636930075038534e-07, + "loss": 0.1644, + "step": 11773 + }, + { + "epoch": 0.9327787680728857, + "grad_norm": 1.306152458781243, + "learning_rate": 2.3581500807882462e-07, + "loss": 0.1013, + "step": 11774 + }, + { + "epoch": 0.9328579916815211, + "grad_norm": 1.2367809775139602, + "learning_rate": 2.3526135833186527e-07, + "loss": 0.0999, + "step": 11775 + }, + { + "epoch": 0.9329372152901565, + "grad_norm": 1.518973710176375, + "learning_rate": 2.3470835154595918e-07, + "loss": 0.1551, + "step": 11776 + }, + { + "epoch": 0.9330164388987918, + "grad_norm": 1.916302663346596, + "learning_rate": 2.3415598775752057e-07, + "loss": 0.154, + "step": 11777 + }, + { + "epoch": 0.9330956625074273, + "grad_norm": 1.5825780600232948, + "learning_rate": 2.3360426700292038e-07, + "loss": 0.1525, + "step": 11778 + }, + { + "epoch": 0.9331748861160626, + "grad_norm": 1.7366911644653875, + "learning_rate": 2.330531893184873e-07, + "loss": 0.1327, + "step": 11779 + }, + { + "epoch": 0.9332541097246979, + "grad_norm": 1.5056015014530373, + "learning_rate": 2.3250275474050565e-07, + "loss": 0.1138, + "step": 11780 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.8118610587436466, + "learning_rate": 2.3195296330521756e-07, + "loss": 0.1623, + "step": 11781 + }, + { + "epoch": 0.9334125569419687, + "grad_norm": 1.101253846813473, + "learning_rate": 2.3140381504882736e-07, + "loss": 0.0734, + "step": 11782 + }, + { + "epoch": 0.9334917805506041, + "grad_norm": 1.2918483464448234, + "learning_rate": 2.3085531000749285e-07, + "loss": 0.1189, + "step": 11783 + }, + { + "epoch": 0.9335710041592394, + "grad_norm": 1.399526896938626, + "learning_rate": 2.3030744821732953e-07, + "loss": 0.1037, + "step": 11784 + }, + { + "epoch": 0.9336502277678749, + "grad_norm": 2.192032769487839, + "learning_rate": 2.297602297144119e-07, + "loss": 0.2054, + "step": 11785 + }, + { + "epoch": 0.9337294513765102, + "grad_norm": 1.487565667790463, + "learning_rate": 2.2921365453477229e-07, + "loss": 0.1547, + "step": 11786 + }, + { + "epoch": 0.9338086749851455, + "grad_norm": 1.9532299725016544, + "learning_rate": 2.286677227143985e-07, + "loss": 0.1657, + "step": 11787 + }, + { + "epoch": 0.933887898593781, + "grad_norm": 1.8591841442476353, + "learning_rate": 2.2812243428923964e-07, + "loss": 0.1968, + "step": 11788 + }, + { + "epoch": 0.9339671222024163, + "grad_norm": 1.3346485701087105, + "learning_rate": 2.2757778929519914e-07, + "loss": 0.1162, + "step": 11789 + }, + { + "epoch": 0.9340463458110517, + "grad_norm": 1.494221886850817, + "learning_rate": 2.2703378776813833e-07, + "loss": 0.1565, + "step": 11790 + }, + { + "epoch": 0.934125569419687, + "grad_norm": 1.690149925920761, + "learning_rate": 2.2649042974387858e-07, + "loss": 0.1263, + "step": 11791 + }, + { + "epoch": 0.9342047930283225, + "grad_norm": 1.5318479002724577, + "learning_rate": 2.259477152581979e-07, + "loss": 0.1528, + "step": 11792 + }, + { + "epoch": 0.9342840166369578, + "grad_norm": 1.6617996207191215, + "learning_rate": 2.2540564434682998e-07, + "loss": 0.1469, + "step": 11793 + }, + { + "epoch": 0.9343632402455931, + "grad_norm": 1.573186267108376, + "learning_rate": 2.2486421704546623e-07, + "loss": 0.1398, + "step": 11794 + }, + { + "epoch": 0.9344424638542286, + "grad_norm": 2.01657978511238, + "learning_rate": 2.2432343338976038e-07, + "loss": 0.1498, + "step": 11795 + }, + { + "epoch": 0.9345216874628639, + "grad_norm": 1.5210543427056888, + "learning_rate": 2.2378329341531946e-07, + "loss": 0.1344, + "step": 11796 + }, + { + "epoch": 0.9346009110714993, + "grad_norm": 1.9042795631154794, + "learning_rate": 2.2324379715770728e-07, + "loss": 0.1647, + "step": 11797 + }, + { + "epoch": 0.9346801346801347, + "grad_norm": 1.9030321722899153, + "learning_rate": 2.2270494465244874e-07, + "loss": 0.1827, + "step": 11798 + }, + { + "epoch": 0.9347593582887701, + "grad_norm": 1.4187241485207902, + "learning_rate": 2.2216673593502437e-07, + "loss": 0.1131, + "step": 11799 + }, + { + "epoch": 0.9348385818974054, + "grad_norm": 1.3954590933482958, + "learning_rate": 2.2162917104087245e-07, + "loss": 0.1304, + "step": 11800 + }, + { + "epoch": 0.9349178055060408, + "grad_norm": 1.8524393466504487, + "learning_rate": 2.2109225000538915e-07, + "loss": 0.1664, + "step": 11801 + }, + { + "epoch": 0.9349970291146762, + "grad_norm": 1.301756748047014, + "learning_rate": 2.2055597286392838e-07, + "loss": 0.1274, + "step": 11802 + }, + { + "epoch": 0.9350762527233115, + "grad_norm": 1.5516407754826604, + "learning_rate": 2.200203396517997e-07, + "loss": 0.0869, + "step": 11803 + }, + { + "epoch": 0.935155476331947, + "grad_norm": 1.4493743024754189, + "learning_rate": 2.19485350404276e-07, + "loss": 0.1283, + "step": 11804 + }, + { + "epoch": 0.9352346999405823, + "grad_norm": 1.583324656995076, + "learning_rate": 2.1895100515658019e-07, + "loss": 0.1607, + "step": 11805 + }, + { + "epoch": 0.9353139235492177, + "grad_norm": 1.7837059448578352, + "learning_rate": 2.1841730394389527e-07, + "loss": 0.1668, + "step": 11806 + }, + { + "epoch": 0.935393147157853, + "grad_norm": 1.5015022099402, + "learning_rate": 2.1788424680136756e-07, + "loss": 0.1383, + "step": 11807 + }, + { + "epoch": 0.9354723707664884, + "grad_norm": 1.4783481188874439, + "learning_rate": 2.173518337640923e-07, + "loss": 0.1345, + "step": 11808 + }, + { + "epoch": 0.9355515943751238, + "grad_norm": 1.6246832148375558, + "learning_rate": 2.1682006486712703e-07, + "loss": 0.1494, + "step": 11809 + }, + { + "epoch": 0.9356308179837591, + "grad_norm": 1.719021390524658, + "learning_rate": 2.1628894014548819e-07, + "loss": 0.1951, + "step": 11810 + }, + { + "epoch": 0.9357100415923946, + "grad_norm": 1.4800054525416697, + "learning_rate": 2.1575845963414555e-07, + "loss": 0.1459, + "step": 11811 + }, + { + "epoch": 0.9357892652010299, + "grad_norm": 1.8837283606291502, + "learning_rate": 2.1522862336803008e-07, + "loss": 0.2133, + "step": 11812 + }, + { + "epoch": 0.9358684888096653, + "grad_norm": 1.4382904518897168, + "learning_rate": 2.146994313820283e-07, + "loss": 0.1649, + "step": 11813 + }, + { + "epoch": 0.9359477124183007, + "grad_norm": 1.4455128948114109, + "learning_rate": 2.141708837109846e-07, + "loss": 0.1294, + "step": 11814 + }, + { + "epoch": 0.936026936026936, + "grad_norm": 1.5174409312633135, + "learning_rate": 2.136429803897022e-07, + "loss": 0.1672, + "step": 11815 + }, + { + "epoch": 0.9361061596355714, + "grad_norm": 1.8262995788463858, + "learning_rate": 2.1311572145294114e-07, + "loss": 0.1997, + "step": 11816 + }, + { + "epoch": 0.9361853832442067, + "grad_norm": 1.1188181527878025, + "learning_rate": 2.1258910693541802e-07, + "loss": 0.1053, + "step": 11817 + }, + { + "epoch": 0.9362646068528422, + "grad_norm": 1.50152094855707, + "learning_rate": 2.1206313687180845e-07, + "loss": 0.1481, + "step": 11818 + }, + { + "epoch": 0.9363438304614775, + "grad_norm": 1.3124352964054526, + "learning_rate": 2.1153781129674367e-07, + "loss": 0.1167, + "step": 11819 + }, + { + "epoch": 0.936423054070113, + "grad_norm": 2.4761160578646013, + "learning_rate": 2.1101313024481595e-07, + "loss": 0.1853, + "step": 11820 + }, + { + "epoch": 0.9365022776787483, + "grad_norm": 1.6695380534388198, + "learning_rate": 2.1048909375057103e-07, + "loss": 0.1309, + "step": 11821 + }, + { + "epoch": 0.9365815012873836, + "grad_norm": 1.2270510797044871, + "learning_rate": 2.0996570184851572e-07, + "loss": 0.1013, + "step": 11822 + }, + { + "epoch": 0.936660724896019, + "grad_norm": 1.6055190771815957, + "learning_rate": 2.0944295457311247e-07, + "loss": 0.231, + "step": 11823 + }, + { + "epoch": 0.9367399485046544, + "grad_norm": 1.5052949547604648, + "learning_rate": 2.0892085195878154e-07, + "loss": 0.2038, + "step": 11824 + }, + { + "epoch": 0.9368191721132898, + "grad_norm": 1.1386516468727237, + "learning_rate": 2.0839939403989984e-07, + "loss": 0.1102, + "step": 11825 + }, + { + "epoch": 0.9368983957219251, + "grad_norm": 1.519251922528252, + "learning_rate": 2.078785808508055e-07, + "loss": 0.1124, + "step": 11826 + }, + { + "epoch": 0.9369776193305605, + "grad_norm": 1.6783001312255867, + "learning_rate": 2.0735841242578992e-07, + "loss": 0.1729, + "step": 11827 + }, + { + "epoch": 0.9370568429391959, + "grad_norm": 1.6920556802597297, + "learning_rate": 2.068388887991013e-07, + "loss": 0.176, + "step": 11828 + }, + { + "epoch": 0.9371360665478312, + "grad_norm": 2.1153530231105413, + "learning_rate": 2.0632001000495228e-07, + "loss": 0.1886, + "step": 11829 + }, + { + "epoch": 0.9372152901564667, + "grad_norm": 1.309756203243174, + "learning_rate": 2.0580177607750663e-07, + "loss": 0.1186, + "step": 11830 + }, + { + "epoch": 0.937294513765102, + "grad_norm": 1.5113294616684176, + "learning_rate": 2.0528418705088592e-07, + "loss": 0.1788, + "step": 11831 + }, + { + "epoch": 0.9373737373737374, + "grad_norm": 1.8656262585228682, + "learning_rate": 2.0476724295917294e-07, + "loss": 0.1708, + "step": 11832 + }, + { + "epoch": 0.9374529609823727, + "grad_norm": 1.4272889626751728, + "learning_rate": 2.04250943836406e-07, + "loss": 0.1225, + "step": 11833 + }, + { + "epoch": 0.9375321845910081, + "grad_norm": 1.6975446229970488, + "learning_rate": 2.0373528971658009e-07, + "loss": 0.177, + "step": 11834 + }, + { + "epoch": 0.9376114081996435, + "grad_norm": 1.502992718576467, + "learning_rate": 2.0322028063364806e-07, + "loss": 0.1683, + "step": 11835 + }, + { + "epoch": 0.9376906318082788, + "grad_norm": 1.724092480014759, + "learning_rate": 2.0270591662152173e-07, + "loss": 0.1898, + "step": 11836 + }, + { + "epoch": 0.9377698554169143, + "grad_norm": 2.1788719188978125, + "learning_rate": 2.0219219771406952e-07, + "loss": 0.2069, + "step": 11837 + }, + { + "epoch": 0.9378490790255496, + "grad_norm": 1.595599678095054, + "learning_rate": 2.0167912394511657e-07, + "loss": 0.1423, + "step": 11838 + }, + { + "epoch": 0.937928302634185, + "grad_norm": 1.422738462376261, + "learning_rate": 2.01166695348447e-07, + "loss": 0.1514, + "step": 11839 + }, + { + "epoch": 0.9380075262428204, + "grad_norm": 1.5142781979094804, + "learning_rate": 2.0065491195780163e-07, + "loss": 0.1418, + "step": 11840 + }, + { + "epoch": 0.9380867498514557, + "grad_norm": 1.502200509695498, + "learning_rate": 2.00143773806879e-07, + "loss": 0.1476, + "step": 11841 + }, + { + "epoch": 0.9381659734600911, + "grad_norm": 1.697517799507699, + "learning_rate": 1.9963328092933444e-07, + "loss": 0.1726, + "step": 11842 + }, + { + "epoch": 0.9382451970687264, + "grad_norm": 2.2267506741690566, + "learning_rate": 1.9912343335878326e-07, + "loss": 0.2278, + "step": 11843 + }, + { + "epoch": 0.9383244206773619, + "grad_norm": 1.4106274960545038, + "learning_rate": 1.9861423112879308e-07, + "loss": 0.1273, + "step": 11844 + }, + { + "epoch": 0.9384036442859972, + "grad_norm": 1.9308164959449465, + "learning_rate": 1.9810567427289596e-07, + "loss": 0.1615, + "step": 11845 + }, + { + "epoch": 0.9384828678946326, + "grad_norm": 1.4111916275200567, + "learning_rate": 1.9759776282457731e-07, + "loss": 0.0799, + "step": 11846 + }, + { + "epoch": 0.938562091503268, + "grad_norm": 1.3648174089371212, + "learning_rate": 1.970904968172771e-07, + "loss": 0.1071, + "step": 11847 + }, + { + "epoch": 0.9386413151119033, + "grad_norm": 1.5094485254495393, + "learning_rate": 1.965838762844019e-07, + "loss": 0.1778, + "step": 11848 + }, + { + "epoch": 0.9387205387205387, + "grad_norm": 1.6990181391425632, + "learning_rate": 1.9607790125930614e-07, + "loss": 0.1431, + "step": 11849 + }, + { + "epoch": 0.9387997623291741, + "grad_norm": 2.148605854081362, + "learning_rate": 1.9557257177530763e-07, + "loss": 0.2264, + "step": 11850 + }, + { + "epoch": 0.9388789859378095, + "grad_norm": 1.2558382377717134, + "learning_rate": 1.9506788786567865e-07, + "loss": 0.0883, + "step": 11851 + }, + { + "epoch": 0.9389582095464448, + "grad_norm": 1.8049545793541213, + "learning_rate": 1.9456384956365149e-07, + "loss": 0.1885, + "step": 11852 + }, + { + "epoch": 0.9390374331550803, + "grad_norm": 1.5641479904797764, + "learning_rate": 1.9406045690241404e-07, + "loss": 0.1504, + "step": 11853 + }, + { + "epoch": 0.9391166567637156, + "grad_norm": 1.262705490604946, + "learning_rate": 1.935577099151109e-07, + "loss": 0.1484, + "step": 11854 + }, + { + "epoch": 0.9391958803723509, + "grad_norm": 1.6160988203637365, + "learning_rate": 1.9305560863484896e-07, + "loss": 0.1419, + "step": 11855 + }, + { + "epoch": 0.9392751039809863, + "grad_norm": 1.8135757520694857, + "learning_rate": 1.9255415309468618e-07, + "loss": 0.1141, + "step": 11856 + }, + { + "epoch": 0.9393543275896217, + "grad_norm": 1.8564696353131633, + "learning_rate": 1.920533433276417e-07, + "loss": 0.2042, + "step": 11857 + }, + { + "epoch": 0.9394335511982571, + "grad_norm": 1.6158178817323292, + "learning_rate": 1.9155317936669248e-07, + "loss": 0.1262, + "step": 11858 + }, + { + "epoch": 0.9395127748068924, + "grad_norm": 1.7371704020105014, + "learning_rate": 1.910536612447711e-07, + "loss": 0.1897, + "step": 11859 + }, + { + "epoch": 0.9395919984155279, + "grad_norm": 1.4258654479404713, + "learning_rate": 1.9055478899476788e-07, + "loss": 0.0854, + "step": 11860 + }, + { + "epoch": 0.9396712220241632, + "grad_norm": 1.801823519288332, + "learning_rate": 1.900565626495332e-07, + "loss": 0.193, + "step": 11861 + }, + { + "epoch": 0.9397504456327985, + "grad_norm": 1.7534619658607473, + "learning_rate": 1.8955898224187086e-07, + "loss": 0.1689, + "step": 11862 + }, + { + "epoch": 0.939829669241434, + "grad_norm": 1.8847822580272615, + "learning_rate": 1.890620478045435e-07, + "loss": 0.2257, + "step": 11863 + }, + { + "epoch": 0.9399088928500693, + "grad_norm": 1.2565745974518263, + "learning_rate": 1.8856575937027388e-07, + "loss": 0.1232, + "step": 11864 + }, + { + "epoch": 0.9399881164587047, + "grad_norm": 1.6528358842259696, + "learning_rate": 1.8807011697174027e-07, + "loss": 0.1856, + "step": 11865 + }, + { + "epoch": 0.94006734006734, + "grad_norm": 1.7884068169795333, + "learning_rate": 1.8757512064157658e-07, + "loss": 0.2185, + "step": 11866 + }, + { + "epoch": 0.9401465636759755, + "grad_norm": 1.958864352190528, + "learning_rate": 1.870807704123756e-07, + "loss": 0.1852, + "step": 11867 + }, + { + "epoch": 0.9402257872846108, + "grad_norm": 1.819078019215524, + "learning_rate": 1.8658706631669133e-07, + "loss": 0.1337, + "step": 11868 + }, + { + "epoch": 0.9403050108932461, + "grad_norm": 2.8134074669061926, + "learning_rate": 1.8609400838702884e-07, + "loss": 0.1194, + "step": 11869 + }, + { + "epoch": 0.9403842345018816, + "grad_norm": 1.0202566566551032, + "learning_rate": 1.856015966558533e-07, + "loss": 0.0732, + "step": 11870 + }, + { + "epoch": 0.9404634581105169, + "grad_norm": 1.1491139965322026, + "learning_rate": 1.8510983115558988e-07, + "loss": 0.08, + "step": 11871 + }, + { + "epoch": 0.9405426817191523, + "grad_norm": 1.8716048443873254, + "learning_rate": 1.8461871191861825e-07, + "loss": 0.1165, + "step": 11872 + }, + { + "epoch": 0.9406219053277877, + "grad_norm": 1.6399236291466963, + "learning_rate": 1.8412823897727473e-07, + "loss": 0.1834, + "step": 11873 + }, + { + "epoch": 0.9407011289364231, + "grad_norm": 1.3377943631831999, + "learning_rate": 1.8363841236385571e-07, + "loss": 0.1089, + "step": 11874 + }, + { + "epoch": 0.9407803525450584, + "grad_norm": 1.4589650557439726, + "learning_rate": 1.8314923211061542e-07, + "loss": 0.1115, + "step": 11875 + }, + { + "epoch": 0.9408595761536938, + "grad_norm": 1.7220854065481161, + "learning_rate": 1.826606982497603e-07, + "loss": 0.1429, + "step": 11876 + }, + { + "epoch": 0.9409387997623292, + "grad_norm": 1.744555537826347, + "learning_rate": 1.8217281081346238e-07, + "loss": 0.1952, + "step": 11877 + }, + { + "epoch": 0.9410180233709645, + "grad_norm": 1.4567505532885319, + "learning_rate": 1.8168556983384377e-07, + "loss": 0.1167, + "step": 11878 + }, + { + "epoch": 0.9410972469796, + "grad_norm": 1.417973923946339, + "learning_rate": 1.811989753429877e-07, + "loss": 0.1393, + "step": 11879 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.8771413000173662, + "learning_rate": 1.8071302737293294e-07, + "loss": 0.1733, + "step": 11880 + }, + { + "epoch": 0.9412556941968707, + "grad_norm": 1.1588549996085826, + "learning_rate": 1.802277259556784e-07, + "loss": 0.0958, + "step": 11881 + }, + { + "epoch": 0.941334917805506, + "grad_norm": 1.3770262535771376, + "learning_rate": 1.7974307112317957e-07, + "loss": 0.1264, + "step": 11882 + }, + { + "epoch": 0.9414141414141414, + "grad_norm": 1.3065675512850603, + "learning_rate": 1.7925906290734653e-07, + "loss": 0.1041, + "step": 11883 + }, + { + "epoch": 0.9414933650227768, + "grad_norm": 1.2029892945540868, + "learning_rate": 1.787757013400504e-07, + "loss": 0.0954, + "step": 11884 + }, + { + "epoch": 0.9415725886314121, + "grad_norm": 1.6001102408512853, + "learning_rate": 1.7829298645311688e-07, + "loss": 0.1583, + "step": 11885 + }, + { + "epoch": 0.9416518122400476, + "grad_norm": 1.6902794289418774, + "learning_rate": 1.7781091827833164e-07, + "loss": 0.1414, + "step": 11886 + }, + { + "epoch": 0.9417310358486829, + "grad_norm": 1.8141361973656667, + "learning_rate": 1.7732949684743593e-07, + "loss": 0.1798, + "step": 11887 + }, + { + "epoch": 0.9418102594573183, + "grad_norm": 1.2303315489034052, + "learning_rate": 1.768487221921278e-07, + "loss": 0.0871, + "step": 11888 + }, + { + "epoch": 0.9418894830659537, + "grad_norm": 2.4208613987094796, + "learning_rate": 1.763685943440674e-07, + "loss": 0.2057, + "step": 11889 + }, + { + "epoch": 0.941968706674589, + "grad_norm": 1.5992007215536663, + "learning_rate": 1.7588911333486614e-07, + "loss": 0.1568, + "step": 11890 + }, + { + "epoch": 0.9420479302832244, + "grad_norm": 1.9025729802431248, + "learning_rate": 1.7541027919609545e-07, + "loss": 0.1484, + "step": 11891 + }, + { + "epoch": 0.9421271538918597, + "grad_norm": 1.544020209843473, + "learning_rate": 1.7493209195928562e-07, + "loss": 0.1179, + "step": 11892 + }, + { + "epoch": 0.9422063775004952, + "grad_norm": 1.2881880574726279, + "learning_rate": 1.7445455165592262e-07, + "loss": 0.0872, + "step": 11893 + }, + { + "epoch": 0.9422856011091305, + "grad_norm": 2.14072433958395, + "learning_rate": 1.7397765831744905e-07, + "loss": 0.2236, + "step": 11894 + }, + { + "epoch": 0.942364824717766, + "grad_norm": 1.973501900863922, + "learning_rate": 1.7350141197526648e-07, + "loss": 0.2124, + "step": 11895 + }, + { + "epoch": 0.9424440483264013, + "grad_norm": 1.4984592138655644, + "learning_rate": 1.7302581266073537e-07, + "loss": 0.1107, + "step": 11896 + }, + { + "epoch": 0.9425232719350366, + "grad_norm": 1.084127345416034, + "learning_rate": 1.7255086040516954e-07, + "loss": 0.0624, + "step": 11897 + }, + { + "epoch": 0.942602495543672, + "grad_norm": 1.712665399028556, + "learning_rate": 1.7207655523984179e-07, + "loss": 0.1397, + "step": 11898 + }, + { + "epoch": 0.9426817191523074, + "grad_norm": 1.9042278255961511, + "learning_rate": 1.71602897195986e-07, + "loss": 0.162, + "step": 11899 + }, + { + "epoch": 0.9427609427609428, + "grad_norm": 1.8445177330445153, + "learning_rate": 1.711298863047872e-07, + "loss": 0.1124, + "step": 11900 + }, + { + "epoch": 0.9428401663695781, + "grad_norm": 2.2523130067303696, + "learning_rate": 1.7065752259739056e-07, + "loss": 0.2643, + "step": 11901 + }, + { + "epoch": 0.9429193899782136, + "grad_norm": 1.7144277609924206, + "learning_rate": 1.701858061049022e-07, + "loss": 0.1761, + "step": 11902 + }, + { + "epoch": 0.9429986135868489, + "grad_norm": 2.149629817809522, + "learning_rate": 1.697147368583796e-07, + "loss": 0.2157, + "step": 11903 + }, + { + "epoch": 0.9430778371954842, + "grad_norm": 1.424607724576598, + "learning_rate": 1.692443148888412e-07, + "loss": 0.1797, + "step": 11904 + }, + { + "epoch": 0.9431570608041197, + "grad_norm": 1.8353657077533234, + "learning_rate": 1.6877454022726225e-07, + "loss": 0.1166, + "step": 11905 + }, + { + "epoch": 0.943236284412755, + "grad_norm": 1.3460872683562073, + "learning_rate": 1.6830541290457468e-07, + "loss": 0.1264, + "step": 11906 + }, + { + "epoch": 0.9433155080213904, + "grad_norm": 1.5012394850707114, + "learning_rate": 1.6783693295166935e-07, + "loss": 0.1287, + "step": 11907 + }, + { + "epoch": 0.9433947316300257, + "grad_norm": 1.706287936584026, + "learning_rate": 1.6736910039939159e-07, + "loss": 0.1817, + "step": 11908 + }, + { + "epoch": 0.9434739552386611, + "grad_norm": 1.6575383833199284, + "learning_rate": 1.6690191527854782e-07, + "loss": 0.1342, + "step": 11909 + }, + { + "epoch": 0.9435531788472965, + "grad_norm": 1.5724106277205052, + "learning_rate": 1.6643537761989904e-07, + "loss": 0.1605, + "step": 11910 + }, + { + "epoch": 0.9436324024559318, + "grad_norm": 1.4620817229547778, + "learning_rate": 1.6596948745416397e-07, + "loss": 0.1435, + "step": 11911 + }, + { + "epoch": 0.9437116260645673, + "grad_norm": 1.4580899994611065, + "learning_rate": 1.6550424481202032e-07, + "loss": 0.1249, + "step": 11912 + }, + { + "epoch": 0.9437908496732026, + "grad_norm": 2.2336838851586958, + "learning_rate": 1.6503964972410136e-07, + "loss": 0.2336, + "step": 11913 + }, + { + "epoch": 0.943870073281838, + "grad_norm": 1.4682091262727397, + "learning_rate": 1.6457570222099816e-07, + "loss": 0.1109, + "step": 11914 + }, + { + "epoch": 0.9439492968904734, + "grad_norm": 1.4460866124241982, + "learning_rate": 1.6411240233326076e-07, + "loss": 0.1327, + "step": 11915 + }, + { + "epoch": 0.9440285204991087, + "grad_norm": 2.1642810716238055, + "learning_rate": 1.6364975009139473e-07, + "loss": 0.1285, + "step": 11916 + }, + { + "epoch": 0.9441077441077441, + "grad_norm": 1.4216987919559858, + "learning_rate": 1.6318774552586237e-07, + "loss": 0.097, + "step": 11917 + }, + { + "epoch": 0.9441869677163794, + "grad_norm": 1.481563450957902, + "learning_rate": 1.627263886670849e-07, + "loss": 0.2165, + "step": 11918 + }, + { + "epoch": 0.9442661913250149, + "grad_norm": 1.6766837177581069, + "learning_rate": 1.6226567954544248e-07, + "loss": 0.1179, + "step": 11919 + }, + { + "epoch": 0.9443454149336502, + "grad_norm": 1.5105550202839497, + "learning_rate": 1.618056181912675e-07, + "loss": 0.1751, + "step": 11920 + }, + { + "epoch": 0.9444246385422856, + "grad_norm": 1.0786000641288729, + "learning_rate": 1.6134620463485352e-07, + "loss": 0.0666, + "step": 11921 + }, + { + "epoch": 0.944503862150921, + "grad_norm": 1.6202304222779838, + "learning_rate": 1.6088743890645297e-07, + "loss": 0.1503, + "step": 11922 + }, + { + "epoch": 0.9445830857595563, + "grad_norm": 1.9078972540576251, + "learning_rate": 1.6042932103627174e-07, + "loss": 0.15, + "step": 11923 + }, + { + "epoch": 0.9446623093681917, + "grad_norm": 1.3460728220788978, + "learning_rate": 1.5997185105447344e-07, + "loss": 0.1405, + "step": 11924 + }, + { + "epoch": 0.9447415329768271, + "grad_norm": 1.7681649985736054, + "learning_rate": 1.5951502899118176e-07, + "loss": 0.1324, + "step": 11925 + }, + { + "epoch": 0.9448207565854625, + "grad_norm": 1.3148303965560717, + "learning_rate": 1.590588548764771e-07, + "loss": 0.0946, + "step": 11926 + }, + { + "epoch": 0.9448999801940978, + "grad_norm": 1.4187422289031857, + "learning_rate": 1.586033287403943e-07, + "loss": 0.1462, + "step": 11927 + }, + { + "epoch": 0.9449792038027333, + "grad_norm": 1.1150388978288979, + "learning_rate": 1.5814845061292938e-07, + "loss": 0.1129, + "step": 11928 + }, + { + "epoch": 0.9450584274113686, + "grad_norm": 1.3459520433959042, + "learning_rate": 1.5769422052403172e-07, + "loss": 0.1088, + "step": 11929 + }, + { + "epoch": 0.9451376510200039, + "grad_norm": 1.641433006390426, + "learning_rate": 1.572406385036118e-07, + "loss": 0.1466, + "step": 11930 + }, + { + "epoch": 0.9452168746286393, + "grad_norm": 2.0384269998773203, + "learning_rate": 1.5678770458153693e-07, + "loss": 0.1916, + "step": 11931 + }, + { + "epoch": 0.9452960982372747, + "grad_norm": 1.4361041621176975, + "learning_rate": 1.563354187876287e-07, + "loss": 0.1125, + "step": 11932 + }, + { + "epoch": 0.9453753218459101, + "grad_norm": 1.5793929846571342, + "learning_rate": 1.558837811516667e-07, + "loss": 0.1291, + "step": 11933 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 1.4849138907097514, + "learning_rate": 1.5543279170339265e-07, + "loss": 0.0997, + "step": 11934 + }, + { + "epoch": 0.9455337690631809, + "grad_norm": 1.6494464893231329, + "learning_rate": 1.5498245047249948e-07, + "loss": 0.1687, + "step": 11935 + }, + { + "epoch": 0.9456129926718162, + "grad_norm": 1.3112012404275095, + "learning_rate": 1.5453275748864128e-07, + "loss": 0.1378, + "step": 11936 + }, + { + "epoch": 0.9456922162804515, + "grad_norm": 1.8406951283931834, + "learning_rate": 1.5408371278142652e-07, + "loss": 0.1609, + "step": 11937 + }, + { + "epoch": 0.945771439889087, + "grad_norm": 1.2447757624739588, + "learning_rate": 1.5363531638042494e-07, + "loss": 0.0939, + "step": 11938 + }, + { + "epoch": 0.9458506634977223, + "grad_norm": 0.9840578335210307, + "learning_rate": 1.5318756831516069e-07, + "loss": 0.0767, + "step": 11939 + }, + { + "epoch": 0.9459298871063577, + "grad_norm": 1.6467282259061267, + "learning_rate": 1.5274046861511348e-07, + "loss": 0.0972, + "step": 11940 + }, + { + "epoch": 0.946009110714993, + "grad_norm": 2.363151800716459, + "learning_rate": 1.5229401730972536e-07, + "loss": 0.1558, + "step": 11941 + }, + { + "epoch": 0.9460883343236285, + "grad_norm": 1.9614867022253675, + "learning_rate": 1.518482144283917e-07, + "loss": 0.2179, + "step": 11942 + }, + { + "epoch": 0.9461675579322638, + "grad_norm": 1.700805055586085, + "learning_rate": 1.514030600004668e-07, + "loss": 0.1772, + "step": 11943 + }, + { + "epoch": 0.9462467815408991, + "grad_norm": 1.7050464671088847, + "learning_rate": 1.5095855405526272e-07, + "loss": 0.2091, + "step": 11944 + }, + { + "epoch": 0.9463260051495346, + "grad_norm": 1.4897690622708966, + "learning_rate": 1.505146966220461e-07, + "loss": 0.1309, + "step": 11945 + }, + { + "epoch": 0.9464052287581699, + "grad_norm": 1.5862026258795623, + "learning_rate": 1.5007148773004466e-07, + "loss": 0.1306, + "step": 11946 + }, + { + "epoch": 0.9464844523668053, + "grad_norm": 1.3636104079234208, + "learning_rate": 1.496289274084417e-07, + "loss": 0.1303, + "step": 11947 + }, + { + "epoch": 0.9465636759754407, + "grad_norm": 1.9887388251704603, + "learning_rate": 1.4918701568637618e-07, + "loss": 0.2155, + "step": 11948 + }, + { + "epoch": 0.9466428995840761, + "grad_norm": 1.6575947433379106, + "learning_rate": 1.4874575259294588e-07, + "loss": 0.146, + "step": 11949 + }, + { + "epoch": 0.9467221231927114, + "grad_norm": 1.687923479965547, + "learning_rate": 1.483051381572076e-07, + "loss": 0.1392, + "step": 11950 + }, + { + "epoch": 0.9468013468013468, + "grad_norm": 1.2161660130109961, + "learning_rate": 1.4786517240817255e-07, + "loss": 0.1162, + "step": 11951 + }, + { + "epoch": 0.9468805704099822, + "grad_norm": 1.5624870403432545, + "learning_rate": 1.474258553748098e-07, + "loss": 0.148, + "step": 11952 + }, + { + "epoch": 0.9469597940186175, + "grad_norm": 1.8689356110888757, + "learning_rate": 1.469871870860473e-07, + "loss": 0.155, + "step": 11953 + }, + { + "epoch": 0.947039017627253, + "grad_norm": 1.5350277594702901, + "learning_rate": 1.4654916757076865e-07, + "loss": 0.1671, + "step": 11954 + }, + { + "epoch": 0.9471182412358883, + "grad_norm": 1.7356327114580496, + "learning_rate": 1.461117968578163e-07, + "loss": 0.137, + "step": 11955 + }, + { + "epoch": 0.9471974648445237, + "grad_norm": 1.6171128745584276, + "learning_rate": 1.4567507497598722e-07, + "loss": 0.1287, + "step": 11956 + }, + { + "epoch": 0.947276688453159, + "grad_norm": 1.9759607064110165, + "learning_rate": 1.452390019540384e-07, + "loss": 0.1774, + "step": 11957 + }, + { + "epoch": 0.9473559120617944, + "grad_norm": 1.6152995081730877, + "learning_rate": 1.4480357782068467e-07, + "loss": 0.1109, + "step": 11958 + }, + { + "epoch": 0.9474351356704298, + "grad_norm": 1.5319687616532214, + "learning_rate": 1.4436880260459307e-07, + "loss": 0.1574, + "step": 11959 + }, + { + "epoch": 0.9475143592790651, + "grad_norm": 1.8299763193149086, + "learning_rate": 1.4393467633439629e-07, + "loss": 0.1667, + "step": 11960 + }, + { + "epoch": 0.9475935828877006, + "grad_norm": 1.449959841952018, + "learning_rate": 1.4350119903867477e-07, + "loss": 0.1461, + "step": 11961 + }, + { + "epoch": 0.9476728064963359, + "grad_norm": 2.0000677480423144, + "learning_rate": 1.4306837074597235e-07, + "loss": 0.1272, + "step": 11962 + }, + { + "epoch": 0.9477520301049713, + "grad_norm": 1.693370238789433, + "learning_rate": 1.426361914847907e-07, + "loss": 0.1638, + "step": 11963 + }, + { + "epoch": 0.9478312537136067, + "grad_norm": 1.6362350847715208, + "learning_rate": 1.422046612835848e-07, + "loss": 0.1694, + "step": 11964 + }, + { + "epoch": 0.947910477322242, + "grad_norm": 1.8513043844379247, + "learning_rate": 1.417737801707686e-07, + "loss": 0.2078, + "step": 11965 + }, + { + "epoch": 0.9479897009308774, + "grad_norm": 1.5496872318447052, + "learning_rate": 1.4134354817471497e-07, + "loss": 0.1095, + "step": 11966 + }, + { + "epoch": 0.9480689245395127, + "grad_norm": 1.9026858120044008, + "learning_rate": 1.4091396532375123e-07, + "loss": 0.1985, + "step": 11967 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 1.5429302334196537, + "learning_rate": 1.4048503164616367e-07, + "loss": 0.1184, + "step": 11968 + }, + { + "epoch": 0.9482273717567835, + "grad_norm": 2.0347941339117073, + "learning_rate": 1.4005674717019746e-07, + "loss": 0.2167, + "step": 11969 + }, + { + "epoch": 0.948306595365419, + "grad_norm": 1.7773007394502391, + "learning_rate": 1.3962911192405004e-07, + "loss": 0.179, + "step": 11970 + }, + { + "epoch": 0.9483858189740543, + "grad_norm": 1.7088106095254136, + "learning_rate": 1.3920212593588113e-07, + "loss": 0.1642, + "step": 11971 + }, + { + "epoch": 0.9484650425826896, + "grad_norm": 1.5023799075502868, + "learning_rate": 1.3877578923380486e-07, + "loss": 0.1183, + "step": 11972 + }, + { + "epoch": 0.948544266191325, + "grad_norm": 2.3937551198981746, + "learning_rate": 1.3835010184589325e-07, + "loss": 0.1605, + "step": 11973 + }, + { + "epoch": 0.9486234897999604, + "grad_norm": 1.766024357180978, + "learning_rate": 1.3792506380017612e-07, + "loss": 0.1768, + "step": 11974 + }, + { + "epoch": 0.9487027134085958, + "grad_norm": 1.719660452276552, + "learning_rate": 1.3750067512464105e-07, + "loss": 0.1482, + "step": 11975 + }, + { + "epoch": 0.9487819370172311, + "grad_norm": 1.5262864672716732, + "learning_rate": 1.3707693584723124e-07, + "loss": 0.1562, + "step": 11976 + }, + { + "epoch": 0.9488611606258666, + "grad_norm": 1.7261102797847192, + "learning_rate": 1.3665384599584774e-07, + "loss": 0.1718, + "step": 11977 + }, + { + "epoch": 0.9489403842345019, + "grad_norm": 1.5864637180113903, + "learning_rate": 1.3623140559834824e-07, + "loss": 0.193, + "step": 11978 + }, + { + "epoch": 0.9490196078431372, + "grad_norm": 1.4792759024368576, + "learning_rate": 1.358096146825505e-07, + "loss": 0.1667, + "step": 11979 + }, + { + "epoch": 0.9490988314517727, + "grad_norm": 1.362874396659318, + "learning_rate": 1.353884732762256e-07, + "loss": 0.0475, + "step": 11980 + }, + { + "epoch": 0.949178055060408, + "grad_norm": 1.728558506787126, + "learning_rate": 1.3496798140710365e-07, + "loss": 0.0831, + "step": 11981 + }, + { + "epoch": 0.9492572786690434, + "grad_norm": 1.7417228751369813, + "learning_rate": 1.3454813910287358e-07, + "loss": 0.1827, + "step": 11982 + }, + { + "epoch": 0.9493365022776787, + "grad_norm": 2.389794919147696, + "learning_rate": 1.341289463911788e-07, + "loss": 0.1876, + "step": 11983 + }, + { + "epoch": 0.9494157258863142, + "grad_norm": 1.2601374217472465, + "learning_rate": 1.337104032996206e-07, + "loss": 0.0726, + "step": 11984 + }, + { + "epoch": 0.9494949494949495, + "grad_norm": 1.9089770087538627, + "learning_rate": 1.3329250985575915e-07, + "loss": 0.1202, + "step": 11985 + }, + { + "epoch": 0.9495741731035848, + "grad_norm": 1.303908150559167, + "learning_rate": 1.3287526608711132e-07, + "loss": 0.1052, + "step": 11986 + }, + { + "epoch": 0.9496533967122203, + "grad_norm": 2.0089282180301415, + "learning_rate": 1.324586720211485e-07, + "loss": 0.1987, + "step": 11987 + }, + { + "epoch": 0.9497326203208556, + "grad_norm": 1.5389078576242174, + "learning_rate": 1.3204272768530313e-07, + "loss": 0.1591, + "step": 11988 + }, + { + "epoch": 0.949811843929491, + "grad_norm": 1.7282731944243857, + "learning_rate": 1.3162743310696224e-07, + "loss": 0.1554, + "step": 11989 + }, + { + "epoch": 0.9498910675381264, + "grad_norm": 1.5207433211225512, + "learning_rate": 1.3121278831347172e-07, + "loss": 0.1618, + "step": 11990 + }, + { + "epoch": 0.9499702911467617, + "grad_norm": 1.4786580595344196, + "learning_rate": 1.3079879333213308e-07, + "loss": 0.1308, + "step": 11991 + }, + { + "epoch": 0.9500495147553971, + "grad_norm": 1.4060282658150705, + "learning_rate": 1.303854481902067e-07, + "loss": 0.1405, + "step": 11992 + }, + { + "epoch": 0.9501287383640324, + "grad_norm": 1.4353414308806975, + "learning_rate": 1.2997275291490863e-07, + "loss": 0.1549, + "step": 11993 + }, + { + "epoch": 0.9502079619726679, + "grad_norm": 1.3168202052568103, + "learning_rate": 1.2956070753341265e-07, + "loss": 0.1424, + "step": 11994 + }, + { + "epoch": 0.9502871855813032, + "grad_norm": 1.180036403468606, + "learning_rate": 1.2914931207285154e-07, + "loss": 0.1101, + "step": 11995 + }, + { + "epoch": 0.9503664091899386, + "grad_norm": 1.8683831369353276, + "learning_rate": 1.2873856656031358e-07, + "loss": 0.1697, + "step": 11996 + }, + { + "epoch": 0.950445632798574, + "grad_norm": 1.4114591680532926, + "learning_rate": 1.2832847102284162e-07, + "loss": 0.1585, + "step": 11997 + }, + { + "epoch": 0.9505248564072093, + "grad_norm": 1.8468183058619627, + "learning_rate": 1.2791902548744185e-07, + "loss": 0.1951, + "step": 11998 + }, + { + "epoch": 0.9506040800158447, + "grad_norm": 1.9933481557191437, + "learning_rate": 1.2751022998107154e-07, + "loss": 0.2607, + "step": 11999 + }, + { + "epoch": 0.9506833036244801, + "grad_norm": 1.9963279242713887, + "learning_rate": 1.271020845306492e-07, + "loss": 0.1575, + "step": 12000 + }, + { + "epoch": 0.9507625272331155, + "grad_norm": 1.7104243492040376, + "learning_rate": 1.2669458916305112e-07, + "loss": 0.1479, + "step": 12001 + }, + { + "epoch": 0.9508417508417508, + "grad_norm": 1.23456583492303, + "learning_rate": 1.2628774390510578e-07, + "loss": 0.0921, + "step": 12002 + }, + { + "epoch": 0.9509209744503863, + "grad_norm": 1.1440957922195196, + "learning_rate": 1.2588154878360293e-07, + "loss": 0.0585, + "step": 12003 + }, + { + "epoch": 0.9510001980590216, + "grad_norm": 1.4943588956271991, + "learning_rate": 1.254760038252889e-07, + "loss": 0.1309, + "step": 12004 + }, + { + "epoch": 0.9510794216676569, + "grad_norm": 2.005179153398323, + "learning_rate": 1.2507110905686793e-07, + "loss": 0.1799, + "step": 12005 + }, + { + "epoch": 0.9511586452762923, + "grad_norm": 1.5338207207701673, + "learning_rate": 1.2466686450499866e-07, + "loss": 0.1416, + "step": 12006 + }, + { + "epoch": 0.9512378688849277, + "grad_norm": 1.506157165613995, + "learning_rate": 1.242632701962987e-07, + "loss": 0.1239, + "step": 12007 + }, + { + "epoch": 0.9513170924935631, + "grad_norm": 1.792253698059906, + "learning_rate": 1.2386032615734345e-07, + "loss": 0.1802, + "step": 12008 + }, + { + "epoch": 0.9513963161021984, + "grad_norm": 1.3138492666234793, + "learning_rate": 1.2345803241466504e-07, + "loss": 0.1184, + "step": 12009 + }, + { + "epoch": 0.9514755397108339, + "grad_norm": 1.6784904215248027, + "learning_rate": 1.2305638899475226e-07, + "loss": 0.1614, + "step": 12010 + }, + { + "epoch": 0.9515547633194692, + "grad_norm": 1.378568569924922, + "learning_rate": 1.2265539592405173e-07, + "loss": 0.1513, + "step": 12011 + }, + { + "epoch": 0.9516339869281045, + "grad_norm": 1.7673684678605055, + "learning_rate": 1.222550532289668e-07, + "loss": 0.1788, + "step": 12012 + }, + { + "epoch": 0.95171321053674, + "grad_norm": 1.7261021485108556, + "learning_rate": 1.218553609358575e-07, + "loss": 0.1458, + "step": 12013 + }, + { + "epoch": 0.9517924341453753, + "grad_norm": 2.0395886870501414, + "learning_rate": 1.214563190710416e-07, + "loss": 0.2136, + "step": 12014 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 1.7761442070779414, + "learning_rate": 1.2105792766079594e-07, + "loss": 0.2062, + "step": 12015 + }, + { + "epoch": 0.951950881362646, + "grad_norm": 1.5915709930985433, + "learning_rate": 1.2066018673134948e-07, + "loss": 0.0977, + "step": 12016 + }, + { + "epoch": 0.9520301049712815, + "grad_norm": 1.5366812070122335, + "learning_rate": 1.2026309630889465e-07, + "loss": 0.1472, + "step": 12017 + }, + { + "epoch": 0.9521093285799168, + "grad_norm": 1.695453467314009, + "learning_rate": 1.1986665641957718e-07, + "loss": 0.1684, + "step": 12018 + }, + { + "epoch": 0.9521885521885521, + "grad_norm": 1.956582060748221, + "learning_rate": 1.194708670894984e-07, + "loss": 0.2115, + "step": 12019 + }, + { + "epoch": 0.9522677757971876, + "grad_norm": 1.8331217653148717, + "learning_rate": 1.1907572834472303e-07, + "loss": 0.156, + "step": 12020 + }, + { + "epoch": 0.9523469994058229, + "grad_norm": 1.4616866307873881, + "learning_rate": 1.1868124021126582e-07, + "loss": 0.1698, + "step": 12021 + }, + { + "epoch": 0.9524262230144583, + "grad_norm": 1.785338298014463, + "learning_rate": 1.1828740271510375e-07, + "loss": 0.1986, + "step": 12022 + }, + { + "epoch": 0.9525054466230937, + "grad_norm": 1.4376578658409527, + "learning_rate": 1.1789421588216721e-07, + "loss": 0.117, + "step": 12023 + }, + { + "epoch": 0.9525846702317291, + "grad_norm": 1.4557152152029316, + "learning_rate": 1.1750167973834769e-07, + "loss": 0.1286, + "step": 12024 + }, + { + "epoch": 0.9526638938403644, + "grad_norm": 1.3652174097322807, + "learning_rate": 1.171097943094912e-07, + "loss": 0.1066, + "step": 12025 + }, + { + "epoch": 0.9527431174489998, + "grad_norm": 1.8076583144588017, + "learning_rate": 1.1671855962140045e-07, + "loss": 0.1343, + "step": 12026 + }, + { + "epoch": 0.9528223410576352, + "grad_norm": 1.723962110125695, + "learning_rate": 1.1632797569983811e-07, + "loss": 0.1292, + "step": 12027 + }, + { + "epoch": 0.9529015646662705, + "grad_norm": 1.8335463439964539, + "learning_rate": 1.1593804257052143e-07, + "loss": 0.1604, + "step": 12028 + }, + { + "epoch": 0.952980788274906, + "grad_norm": 1.5248365587438832, + "learning_rate": 1.1554876025912432e-07, + "loss": 0.118, + "step": 12029 + }, + { + "epoch": 0.9530600118835413, + "grad_norm": 1.5335885639969342, + "learning_rate": 1.151601287912818e-07, + "loss": 0.1052, + "step": 12030 + }, + { + "epoch": 0.9531392354921767, + "grad_norm": 1.670575917637804, + "learning_rate": 1.147721481925812e-07, + "loss": 0.1895, + "step": 12031 + }, + { + "epoch": 0.953218459100812, + "grad_norm": 2.2687123214352867, + "learning_rate": 1.1438481848856986e-07, + "loss": 0.2066, + "step": 12032 + }, + { + "epoch": 0.9532976827094474, + "grad_norm": 2.1372727810964953, + "learning_rate": 1.1399813970475293e-07, + "loss": 0.1923, + "step": 12033 + }, + { + "epoch": 0.9533769063180828, + "grad_norm": 1.4975834870785496, + "learning_rate": 1.1361211186658893e-07, + "loss": 0.127, + "step": 12034 + }, + { + "epoch": 0.9534561299267181, + "grad_norm": 1.8447286433253713, + "learning_rate": 1.1322673499949754e-07, + "loss": 0.1323, + "step": 12035 + }, + { + "epoch": 0.9535353535353536, + "grad_norm": 1.683692159063323, + "learning_rate": 1.1284200912885291e-07, + "loss": 0.1554, + "step": 12036 + }, + { + "epoch": 0.9536145771439889, + "grad_norm": 1.6286660098898258, + "learning_rate": 1.1245793427998919e-07, + "loss": 0.1341, + "step": 12037 + }, + { + "epoch": 0.9536938007526243, + "grad_norm": 1.457421158329244, + "learning_rate": 1.1207451047819396e-07, + "loss": 0.1556, + "step": 12038 + }, + { + "epoch": 0.9537730243612597, + "grad_norm": 1.27266027303811, + "learning_rate": 1.1169173774871478e-07, + "loss": 0.0897, + "step": 12039 + }, + { + "epoch": 0.953852247969895, + "grad_norm": 1.8659504097124968, + "learning_rate": 1.1130961611675484e-07, + "loss": 0.1935, + "step": 12040 + }, + { + "epoch": 0.9539314715785304, + "grad_norm": 1.667601274749327, + "learning_rate": 1.1092814560747511e-07, + "loss": 0.1641, + "step": 12041 + }, + { + "epoch": 0.9540106951871657, + "grad_norm": 1.5366170062860218, + "learning_rate": 1.105473262459944e-07, + "loss": 0.1564, + "step": 12042 + }, + { + "epoch": 0.9540899187958012, + "grad_norm": 1.9129907140412452, + "learning_rate": 1.1016715805738709e-07, + "loss": 0.1761, + "step": 12043 + }, + { + "epoch": 0.9541691424044365, + "grad_norm": 1.762676037732583, + "learning_rate": 1.0978764106668538e-07, + "loss": 0.205, + "step": 12044 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 2.0368552232851513, + "learning_rate": 1.0940877529887928e-07, + "loss": 0.2391, + "step": 12045 + }, + { + "epoch": 0.9543275896217073, + "grad_norm": 2.086444359412718, + "learning_rate": 1.0903056077891438e-07, + "loss": 0.1787, + "step": 12046 + }, + { + "epoch": 0.9544068132303426, + "grad_norm": 1.7700579531488503, + "learning_rate": 1.0865299753169522e-07, + "loss": 0.2692, + "step": 12047 + }, + { + "epoch": 0.954486036838978, + "grad_norm": 1.3133828256695508, + "learning_rate": 1.0827608558208192e-07, + "loss": 0.1096, + "step": 12048 + }, + { + "epoch": 0.9545652604476134, + "grad_norm": 2.208530437178883, + "learning_rate": 1.0789982495489238e-07, + "loss": 0.1527, + "step": 12049 + }, + { + "epoch": 0.9546444840562488, + "grad_norm": 1.4546182918366382, + "learning_rate": 1.0752421567490123e-07, + "loss": 0.1054, + "step": 12050 + }, + { + "epoch": 0.9547237076648841, + "grad_norm": 1.5806288253935212, + "learning_rate": 1.0714925776684093e-07, + "loss": 0.1379, + "step": 12051 + }, + { + "epoch": 0.9548029312735196, + "grad_norm": 2.2037909835431764, + "learning_rate": 1.067749512554006e-07, + "loss": 0.1519, + "step": 12052 + }, + { + "epoch": 0.9548821548821549, + "grad_norm": 1.4432888169332159, + "learning_rate": 1.0640129616522721e-07, + "loss": 0.0798, + "step": 12053 + }, + { + "epoch": 0.9549613784907902, + "grad_norm": 1.475526789033575, + "learning_rate": 1.0602829252092328e-07, + "loss": 0.1209, + "step": 12054 + }, + { + "epoch": 0.9550406020994257, + "grad_norm": 1.8891381390407387, + "learning_rate": 1.0565594034704918e-07, + "loss": 0.23, + "step": 12055 + }, + { + "epoch": 0.955119825708061, + "grad_norm": 1.5051609161624158, + "learning_rate": 1.0528423966812307e-07, + "loss": 0.1321, + "step": 12056 + }, + { + "epoch": 0.9551990493166964, + "grad_norm": 1.698008438317734, + "learning_rate": 1.0491319050861981e-07, + "loss": 0.2419, + "step": 12057 + }, + { + "epoch": 0.9552782729253317, + "grad_norm": 1.2470871141957662, + "learning_rate": 1.0454279289296987e-07, + "loss": 0.0995, + "step": 12058 + }, + { + "epoch": 0.9553574965339672, + "grad_norm": 1.7326133783796482, + "learning_rate": 1.0417304684556373e-07, + "loss": 0.1336, + "step": 12059 + }, + { + "epoch": 0.9554367201426025, + "grad_norm": 2.1848232285938676, + "learning_rate": 1.0380395239074747e-07, + "loss": 0.2114, + "step": 12060 + }, + { + "epoch": 0.9555159437512378, + "grad_norm": 2.047669789129029, + "learning_rate": 1.0343550955282278e-07, + "loss": 0.1694, + "step": 12061 + }, + { + "epoch": 0.9555951673598733, + "grad_norm": 1.8249573262195395, + "learning_rate": 1.0306771835605022e-07, + "loss": 0.2031, + "step": 12062 + }, + { + "epoch": 0.9556743909685086, + "grad_norm": 1.5757765949747649, + "learning_rate": 1.0270057882464823e-07, + "loss": 0.1412, + "step": 12063 + }, + { + "epoch": 0.955753614577144, + "grad_norm": 1.5730205319358732, + "learning_rate": 1.0233409098278967e-07, + "loss": 0.1858, + "step": 12064 + }, + { + "epoch": 0.9558328381857794, + "grad_norm": 1.2878382038072824, + "learning_rate": 1.0196825485460637e-07, + "loss": 0.112, + "step": 12065 + }, + { + "epoch": 0.9559120617944147, + "grad_norm": 1.4095404333114911, + "learning_rate": 1.0160307046418794e-07, + "loss": 0.0951, + "step": 12066 + }, + { + "epoch": 0.9559912854030501, + "grad_norm": 1.0658902420587906, + "learning_rate": 1.0123853783557847e-07, + "loss": 0.0778, + "step": 12067 + }, + { + "epoch": 0.9560705090116854, + "grad_norm": 1.8862117022948965, + "learning_rate": 1.0087465699278321e-07, + "loss": 0.1907, + "step": 12068 + }, + { + "epoch": 0.9561497326203209, + "grad_norm": 1.5368700250623735, + "learning_rate": 1.0051142795975855e-07, + "loss": 0.1589, + "step": 12069 + }, + { + "epoch": 0.9562289562289562, + "grad_norm": 1.5215946392950483, + "learning_rate": 1.0014885076042313e-07, + "loss": 0.1036, + "step": 12070 + }, + { + "epoch": 0.9563081798375916, + "grad_norm": 1.7367286662883004, + "learning_rate": 9.978692541865121e-08, + "loss": 0.1522, + "step": 12071 + }, + { + "epoch": 0.956387403446227, + "grad_norm": 1.426243673494032, + "learning_rate": 9.94256519582748e-08, + "loss": 0.1265, + "step": 12072 + }, + { + "epoch": 0.9564666270548623, + "grad_norm": 1.6340634294843608, + "learning_rate": 9.906503040307824e-08, + "loss": 0.1731, + "step": 12073 + }, + { + "epoch": 0.9565458506634977, + "grad_norm": 1.9078597306467755, + "learning_rate": 9.87050607768103e-08, + "loss": 0.1778, + "step": 12074 + }, + { + "epoch": 0.9566250742721331, + "grad_norm": 2.120718233222186, + "learning_rate": 9.834574310317313e-08, + "loss": 0.1349, + "step": 12075 + }, + { + "epoch": 0.9567042978807685, + "grad_norm": 1.233042675438917, + "learning_rate": 9.798707740582447e-08, + "loss": 0.1029, + "step": 12076 + }, + { + "epoch": 0.9567835214894038, + "grad_norm": 1.7527488840278964, + "learning_rate": 9.762906370837988e-08, + "loss": 0.123, + "step": 12077 + }, + { + "epoch": 0.9568627450980393, + "grad_norm": 1.2302812798749247, + "learning_rate": 9.727170203441605e-08, + "loss": 0.1082, + "step": 12078 + }, + { + "epoch": 0.9569419687066746, + "grad_norm": 1.4618757138846394, + "learning_rate": 9.691499240746083e-08, + "loss": 0.1166, + "step": 12079 + }, + { + "epoch": 0.9570211923153099, + "grad_norm": 1.4825068340556253, + "learning_rate": 9.65589348510032e-08, + "loss": 0.1397, + "step": 12080 + }, + { + "epoch": 0.9571004159239453, + "grad_norm": 2.2389272262292237, + "learning_rate": 9.620352938848665e-08, + "loss": 0.1726, + "step": 12081 + }, + { + "epoch": 0.9571796395325807, + "grad_norm": 1.2667102678562707, + "learning_rate": 9.584877604331467e-08, + "loss": 0.0798, + "step": 12082 + }, + { + "epoch": 0.9572588631412161, + "grad_norm": 1.7395745703920986, + "learning_rate": 9.549467483884412e-08, + "loss": 0.1517, + "step": 12083 + }, + { + "epoch": 0.9573380867498514, + "grad_norm": 1.6505099377510957, + "learning_rate": 9.514122579839302e-08, + "loss": 0.1038, + "step": 12084 + }, + { + "epoch": 0.9574173103584869, + "grad_norm": 1.1480141201529224, + "learning_rate": 9.478842894523165e-08, + "loss": 0.1307, + "step": 12085 + }, + { + "epoch": 0.9574965339671222, + "grad_norm": 2.1904626018609905, + "learning_rate": 9.443628430259144e-08, + "loss": 0.2708, + "step": 12086 + }, + { + "epoch": 0.9575757575757575, + "grad_norm": 1.914669211959344, + "learning_rate": 9.408479189366049e-08, + "loss": 0.1658, + "step": 12087 + }, + { + "epoch": 0.957654981184393, + "grad_norm": 1.4187114823357372, + "learning_rate": 9.37339517415814e-08, + "loss": 0.1128, + "step": 12088 + }, + { + "epoch": 0.9577342047930283, + "grad_norm": 1.5353651587950317, + "learning_rate": 9.33837638694557e-08, + "loss": 0.096, + "step": 12089 + }, + { + "epoch": 0.9578134284016637, + "grad_norm": 1.0547737643939115, + "learning_rate": 9.30342283003416e-08, + "loss": 0.1184, + "step": 12090 + }, + { + "epoch": 0.957892652010299, + "grad_norm": 1.7886008155212836, + "learning_rate": 9.268534505725402e-08, + "loss": 0.1751, + "step": 12091 + }, + { + "epoch": 0.9579718756189345, + "grad_norm": 1.2160077694727411, + "learning_rate": 9.233711416316571e-08, + "loss": 0.0697, + "step": 12092 + }, + { + "epoch": 0.9580510992275698, + "grad_norm": 1.586486962071358, + "learning_rate": 9.1989535641005e-08, + "loss": 0.1733, + "step": 12093 + }, + { + "epoch": 0.9581303228362051, + "grad_norm": 2.117994009454468, + "learning_rate": 9.164260951366021e-08, + "loss": 0.22, + "step": 12094 + }, + { + "epoch": 0.9582095464448406, + "grad_norm": 1.611869808407201, + "learning_rate": 9.129633580397312e-08, + "loss": 0.1255, + "step": 12095 + }, + { + "epoch": 0.9582887700534759, + "grad_norm": 1.8391116847771045, + "learning_rate": 9.095071453474435e-08, + "loss": 0.1725, + "step": 12096 + }, + { + "epoch": 0.9583679936621113, + "grad_norm": 1.6769304174578155, + "learning_rate": 9.060574572873238e-08, + "loss": 0.1585, + "step": 12097 + }, + { + "epoch": 0.9584472172707467, + "grad_norm": 2.0762247540312755, + "learning_rate": 9.026142940865013e-08, + "loss": 0.1069, + "step": 12098 + }, + { + "epoch": 0.9585264408793821, + "grad_norm": 1.6375306988207565, + "learning_rate": 8.991776559717058e-08, + "loss": 0.1567, + "step": 12099 + }, + { + "epoch": 0.9586056644880174, + "grad_norm": 1.8206376668955544, + "learning_rate": 8.95747543169223e-08, + "loss": 0.2215, + "step": 12100 + }, + { + "epoch": 0.9586848880966528, + "grad_norm": 1.1741291418552378, + "learning_rate": 8.923239559049057e-08, + "loss": 0.1113, + "step": 12101 + }, + { + "epoch": 0.9587641117052882, + "grad_norm": 1.4176499894958317, + "learning_rate": 8.889068944041734e-08, + "loss": 0.1724, + "step": 12102 + }, + { + "epoch": 0.9588433353139235, + "grad_norm": 2.547430860660644, + "learning_rate": 8.854963588920351e-08, + "loss": 0.202, + "step": 12103 + }, + { + "epoch": 0.958922558922559, + "grad_norm": 1.609332245853523, + "learning_rate": 8.820923495930556e-08, + "loss": 0.1812, + "step": 12104 + }, + { + "epoch": 0.9590017825311943, + "grad_norm": 1.6627823016899432, + "learning_rate": 8.786948667313667e-08, + "loss": 0.1692, + "step": 12105 + }, + { + "epoch": 0.9590810061398297, + "grad_norm": 1.6909104047198948, + "learning_rate": 8.753039105306782e-08, + "loss": 0.1281, + "step": 12106 + }, + { + "epoch": 0.959160229748465, + "grad_norm": 1.4483914647110465, + "learning_rate": 8.719194812142673e-08, + "loss": 0.1302, + "step": 12107 + }, + { + "epoch": 0.9592394533571004, + "grad_norm": 1.2991725781184642, + "learning_rate": 8.685415790049889e-08, + "loss": 0.1101, + "step": 12108 + }, + { + "epoch": 0.9593186769657358, + "grad_norm": 1.6322710688229296, + "learning_rate": 8.651702041252541e-08, + "loss": 0.1706, + "step": 12109 + }, + { + "epoch": 0.9593979005743711, + "grad_norm": 1.6691305386991055, + "learning_rate": 8.61805356797063e-08, + "loss": 0.1761, + "step": 12110 + }, + { + "epoch": 0.9594771241830066, + "grad_norm": 1.494303436196463, + "learning_rate": 8.584470372419606e-08, + "loss": 0.1723, + "step": 12111 + }, + { + "epoch": 0.9595563477916419, + "grad_norm": 1.5875946677400037, + "learning_rate": 8.550952456810813e-08, + "loss": 0.1462, + "step": 12112 + }, + { + "epoch": 0.9596355714002773, + "grad_norm": 1.5350487547338978, + "learning_rate": 8.517499823351261e-08, + "loss": 0.1541, + "step": 12113 + }, + { + "epoch": 0.9597147950089127, + "grad_norm": 1.5421713670763921, + "learning_rate": 8.484112474243633e-08, + "loss": 0.153, + "step": 12114 + }, + { + "epoch": 0.959794018617548, + "grad_norm": 1.9755182692330675, + "learning_rate": 8.450790411686282e-08, + "loss": 0.1709, + "step": 12115 + }, + { + "epoch": 0.9598732422261834, + "grad_norm": 1.4154566799834094, + "learning_rate": 8.417533637873454e-08, + "loss": 0.126, + "step": 12116 + }, + { + "epoch": 0.9599524658348187, + "grad_norm": 1.8474593560933468, + "learning_rate": 8.384342154994841e-08, + "loss": 0.1724, + "step": 12117 + }, + { + "epoch": 0.9600316894434542, + "grad_norm": 1.7195199759996236, + "learning_rate": 8.351215965235915e-08, + "loss": 0.1401, + "step": 12118 + }, + { + "epoch": 0.9601109130520895, + "grad_norm": 1.7528145359642984, + "learning_rate": 8.318155070777822e-08, + "loss": 0.153, + "step": 12119 + }, + { + "epoch": 0.960190136660725, + "grad_norm": 1.616263862220609, + "learning_rate": 8.28515947379771e-08, + "loss": 0.1292, + "step": 12120 + }, + { + "epoch": 0.9602693602693603, + "grad_norm": 1.9343860887939304, + "learning_rate": 8.252229176467841e-08, + "loss": 0.1594, + "step": 12121 + }, + { + "epoch": 0.9603485838779956, + "grad_norm": 1.5607249082531798, + "learning_rate": 8.219364180956812e-08, + "loss": 0.1105, + "step": 12122 + }, + { + "epoch": 0.960427807486631, + "grad_norm": 1.8509707889624776, + "learning_rate": 8.186564489428561e-08, + "loss": 0.1749, + "step": 12123 + }, + { + "epoch": 0.9605070310952664, + "grad_norm": 1.5640763931704413, + "learning_rate": 8.153830104042582e-08, + "loss": 0.133, + "step": 12124 + }, + { + "epoch": 0.9605862547039018, + "grad_norm": 1.3064404636144864, + "learning_rate": 8.121161026954482e-08, + "loss": 0.1123, + "step": 12125 + }, + { + "epoch": 0.9606654783125371, + "grad_norm": 1.4646992759527617, + "learning_rate": 8.088557260315322e-08, + "loss": 0.1334, + "step": 12126 + }, + { + "epoch": 0.9607447019211726, + "grad_norm": 1.8387419792084765, + "learning_rate": 8.056018806271937e-08, + "loss": 0.1965, + "step": 12127 + }, + { + "epoch": 0.9608239255298079, + "grad_norm": 1.392154643132027, + "learning_rate": 8.023545666966726e-08, + "loss": 0.1189, + "step": 12128 + }, + { + "epoch": 0.9609031491384432, + "grad_norm": 1.8625017470286498, + "learning_rate": 7.991137844537977e-08, + "loss": 0.1744, + "step": 12129 + }, + { + "epoch": 0.9609823727470787, + "grad_norm": 1.4608593105457481, + "learning_rate": 7.958795341119541e-08, + "loss": 0.0691, + "step": 12130 + }, + { + "epoch": 0.961061596355714, + "grad_norm": 1.6072790194870434, + "learning_rate": 7.926518158841045e-08, + "loss": 0.1343, + "step": 12131 + }, + { + "epoch": 0.9611408199643494, + "grad_norm": 1.7507664759807247, + "learning_rate": 7.894306299827791e-08, + "loss": 0.1621, + "step": 12132 + }, + { + "epoch": 0.9612200435729847, + "grad_norm": 1.8596896128581366, + "learning_rate": 7.86215976620075e-08, + "loss": 0.132, + "step": 12133 + }, + { + "epoch": 0.9612992671816202, + "grad_norm": 1.7348715749835064, + "learning_rate": 7.83007856007667e-08, + "loss": 0.2028, + "step": 12134 + }, + { + "epoch": 0.9613784907902555, + "grad_norm": 1.774924733682001, + "learning_rate": 7.798062683567864e-08, + "loss": 0.1264, + "step": 12135 + }, + { + "epoch": 0.9614577143988908, + "grad_norm": 1.7624686262664964, + "learning_rate": 7.766112138782422e-08, + "loss": 0.1119, + "step": 12136 + }, + { + "epoch": 0.9615369380075263, + "grad_norm": 1.8687604332498013, + "learning_rate": 7.734226927824106e-08, + "loss": 0.1226, + "step": 12137 + }, + { + "epoch": 0.9616161616161616, + "grad_norm": 1.5491955344841446, + "learning_rate": 7.70240705279257e-08, + "loss": 0.1341, + "step": 12138 + }, + { + "epoch": 0.961695385224797, + "grad_norm": 1.9313415441824746, + "learning_rate": 7.670652515782917e-08, + "loss": 0.1488, + "step": 12139 + }, + { + "epoch": 0.9617746088334324, + "grad_norm": 1.1323492233667762, + "learning_rate": 7.638963318886028e-08, + "loss": 0.0936, + "step": 12140 + }, + { + "epoch": 0.9618538324420678, + "grad_norm": 1.6864150245433278, + "learning_rate": 7.607339464188346e-08, + "loss": 0.1498, + "step": 12141 + }, + { + "epoch": 0.9619330560507031, + "grad_norm": 2.1217611676178927, + "learning_rate": 7.575780953772427e-08, + "loss": 0.2073, + "step": 12142 + }, + { + "epoch": 0.9620122796593384, + "grad_norm": 1.621069799261994, + "learning_rate": 7.544287789715943e-08, + "loss": 0.1631, + "step": 12143 + }, + { + "epoch": 0.9620915032679739, + "grad_norm": 1.761778432246736, + "learning_rate": 7.51285997409279e-08, + "loss": 0.1343, + "step": 12144 + }, + { + "epoch": 0.9621707268766092, + "grad_norm": 2.0516542711916643, + "learning_rate": 7.481497508972313e-08, + "loss": 0.176, + "step": 12145 + }, + { + "epoch": 0.9622499504852446, + "grad_norm": 1.7309353700905166, + "learning_rate": 7.450200396419416e-08, + "loss": 0.1813, + "step": 12146 + }, + { + "epoch": 0.96232917409388, + "grad_norm": 2.4858564225928594, + "learning_rate": 7.418968638495006e-08, + "loss": 0.2158, + "step": 12147 + }, + { + "epoch": 0.9624083977025153, + "grad_norm": 1.090230145992707, + "learning_rate": 7.387802237255658e-08, + "loss": 0.0991, + "step": 12148 + }, + { + "epoch": 0.9624876213111507, + "grad_norm": 1.8472206510931541, + "learning_rate": 7.35670119475329e-08, + "loss": 0.2088, + "step": 12149 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 1.425841344524634, + "learning_rate": 7.325665513035707e-08, + "loss": 0.1284, + "step": 12150 + }, + { + "epoch": 0.9626460685284215, + "grad_norm": 1.6105599148447856, + "learning_rate": 7.294695194146829e-08, + "loss": 0.1495, + "step": 12151 + }, + { + "epoch": 0.9627252921370568, + "grad_norm": 1.75081021673955, + "learning_rate": 7.263790240125579e-08, + "loss": 0.1864, + "step": 12152 + }, + { + "epoch": 0.9628045157456923, + "grad_norm": 2.2093693410274056, + "learning_rate": 7.232950653006998e-08, + "loss": 0.2369, + "step": 12153 + }, + { + "epoch": 0.9628837393543276, + "grad_norm": 1.8158126265179304, + "learning_rate": 7.202176434821683e-08, + "loss": 0.1718, + "step": 12154 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 1.6057954680335182, + "learning_rate": 7.171467587596126e-08, + "loss": 0.1475, + "step": 12155 + }, + { + "epoch": 0.9630421865715983, + "grad_norm": 1.8303159105234104, + "learning_rate": 7.140824113352151e-08, + "loss": 0.1737, + "step": 12156 + }, + { + "epoch": 0.9631214101802337, + "grad_norm": 1.5569510451554864, + "learning_rate": 7.110246014107592e-08, + "loss": 0.204, + "step": 12157 + }, + { + "epoch": 0.9632006337888691, + "grad_norm": 1.8077520298408258, + "learning_rate": 7.079733291875945e-08, + "loss": 0.1592, + "step": 12158 + }, + { + "epoch": 0.9632798573975044, + "grad_norm": 1.5956800822433268, + "learning_rate": 7.049285948666052e-08, + "loss": 0.1561, + "step": 12159 + }, + { + "epoch": 0.9633590810061399, + "grad_norm": 1.9266238742232196, + "learning_rate": 7.018903986483083e-08, + "loss": 0.1822, + "step": 12160 + }, + { + "epoch": 0.9634383046147752, + "grad_norm": 1.5554891946202893, + "learning_rate": 6.988587407327219e-08, + "loss": 0.1846, + "step": 12161 + }, + { + "epoch": 0.9635175282234105, + "grad_norm": 1.3742648690595887, + "learning_rate": 6.958336213194972e-08, + "loss": 0.1232, + "step": 12162 + }, + { + "epoch": 0.963596751832046, + "grad_norm": 1.4240397140485792, + "learning_rate": 6.928150406077861e-08, + "loss": 0.1356, + "step": 12163 + }, + { + "epoch": 0.9636759754406813, + "grad_norm": 2.009020871528144, + "learning_rate": 6.89802998796385e-08, + "loss": 0.1603, + "step": 12164 + }, + { + "epoch": 0.9637551990493167, + "grad_norm": 2.343740539239381, + "learning_rate": 6.867974960836022e-08, + "loss": 0.1993, + "step": 12165 + }, + { + "epoch": 0.963834422657952, + "grad_norm": 1.372622452307348, + "learning_rate": 6.837985326673457e-08, + "loss": 0.1606, + "step": 12166 + }, + { + "epoch": 0.9639136462665875, + "grad_norm": 2.2409000862998547, + "learning_rate": 6.80806108745069e-08, + "loss": 0.2391, + "step": 12167 + }, + { + "epoch": 0.9639928698752228, + "grad_norm": 1.4134459308277265, + "learning_rate": 6.778202245138144e-08, + "loss": 0.1267, + "step": 12168 + }, + { + "epoch": 0.9640720934838581, + "grad_norm": 1.1118892698380483, + "learning_rate": 6.748408801701911e-08, + "loss": 0.073, + "step": 12169 + }, + { + "epoch": 0.9641513170924936, + "grad_norm": 1.759762943514985, + "learning_rate": 6.718680759103757e-08, + "loss": 0.1845, + "step": 12170 + }, + { + "epoch": 0.9642305407011289, + "grad_norm": 1.934575108634639, + "learning_rate": 6.689018119301227e-08, + "loss": 0.1702, + "step": 12171 + }, + { + "epoch": 0.9643097643097643, + "grad_norm": 1.5966213941066827, + "learning_rate": 6.659420884247203e-08, + "loss": 0.1662, + "step": 12172 + }, + { + "epoch": 0.9643889879183997, + "grad_norm": 1.4215974566947838, + "learning_rate": 6.629889055890682e-08, + "loss": 0.0979, + "step": 12173 + }, + { + "epoch": 0.9644682115270351, + "grad_norm": 1.7061554365021858, + "learning_rate": 6.600422636176219e-08, + "loss": 0.1798, + "step": 12174 + }, + { + "epoch": 0.9645474351356704, + "grad_norm": 1.5023981290093678, + "learning_rate": 6.571021627043928e-08, + "loss": 0.1095, + "step": 12175 + }, + { + "epoch": 0.9646266587443058, + "grad_norm": 1.9367645147158803, + "learning_rate": 6.541686030429817e-08, + "loss": 0.1778, + "step": 12176 + }, + { + "epoch": 0.9647058823529412, + "grad_norm": 1.3596421973220458, + "learning_rate": 6.512415848265453e-08, + "loss": 0.1268, + "step": 12177 + }, + { + "epoch": 0.9647851059615765, + "grad_norm": 1.3793876622542929, + "learning_rate": 6.48321108247818e-08, + "loss": 0.1279, + "step": 12178 + }, + { + "epoch": 0.964864329570212, + "grad_norm": 1.638438377080994, + "learning_rate": 6.454071734990907e-08, + "loss": 0.195, + "step": 12179 + }, + { + "epoch": 0.9649435531788473, + "grad_norm": 1.7042653753711203, + "learning_rate": 6.424997807722433e-08, + "loss": 0.232, + "step": 12180 + }, + { + "epoch": 0.9650227767874827, + "grad_norm": 1.7291737471909923, + "learning_rate": 6.395989302587113e-08, + "loss": 0.1246, + "step": 12181 + }, + { + "epoch": 0.965102000396118, + "grad_norm": 2.2099748067056986, + "learning_rate": 6.367046221494866e-08, + "loss": 0.2676, + "step": 12182 + }, + { + "epoch": 0.9651812240047534, + "grad_norm": 1.3611406874045902, + "learning_rate": 6.33816856635161e-08, + "loss": 0.1151, + "step": 12183 + }, + { + "epoch": 0.9652604476133888, + "grad_norm": 1.401636083475635, + "learning_rate": 6.309356339058825e-08, + "loss": 0.1121, + "step": 12184 + }, + { + "epoch": 0.9653396712220241, + "grad_norm": 1.056085526994235, + "learning_rate": 6.28060954151355e-08, + "loss": 0.095, + "step": 12185 + }, + { + "epoch": 0.9654188948306596, + "grad_norm": 1.5174559327140367, + "learning_rate": 6.251928175608602e-08, + "loss": 0.1461, + "step": 12186 + }, + { + "epoch": 0.9654981184392949, + "grad_norm": 1.9305071683652986, + "learning_rate": 6.223312243232693e-08, + "loss": 0.2208, + "step": 12187 + }, + { + "epoch": 0.9655773420479303, + "grad_norm": 1.9536493315406718, + "learning_rate": 6.194761746269762e-08, + "loss": 0.2035, + "step": 12188 + }, + { + "epoch": 0.9656565656565657, + "grad_norm": 1.8463156244153012, + "learning_rate": 6.16627668659997e-08, + "loss": 0.1494, + "step": 12189 + }, + { + "epoch": 0.965735789265201, + "grad_norm": 1.4768600429782186, + "learning_rate": 6.137857066098929e-08, + "loss": 0.1305, + "step": 12190 + }, + { + "epoch": 0.9658150128738364, + "grad_norm": 1.5299778278325868, + "learning_rate": 6.109502886637697e-08, + "loss": 0.1461, + "step": 12191 + }, + { + "epoch": 0.9658942364824717, + "grad_norm": 1.1428062978459437, + "learning_rate": 6.081214150083447e-08, + "loss": 0.0699, + "step": 12192 + }, + { + "epoch": 0.9659734600911072, + "grad_norm": 2.508774274921111, + "learning_rate": 6.052990858298801e-08, + "loss": 0.1779, + "step": 12193 + }, + { + "epoch": 0.9660526836997425, + "grad_norm": 1.6106357260987798, + "learning_rate": 6.024833013142272e-08, + "loss": 0.1547, + "step": 12194 + }, + { + "epoch": 0.966131907308378, + "grad_norm": 1.5543591071303986, + "learning_rate": 5.9967406164676e-08, + "loss": 0.1606, + "step": 12195 + }, + { + "epoch": 0.9662111309170133, + "grad_norm": 1.319013454416618, + "learning_rate": 5.96871367012486e-08, + "loss": 0.1042, + "step": 12196 + }, + { + "epoch": 0.9662903545256486, + "grad_norm": 1.4304249431689604, + "learning_rate": 5.9407521759592414e-08, + "loss": 0.1433, + "step": 12197 + }, + { + "epoch": 0.966369578134284, + "grad_norm": 1.346567722446685, + "learning_rate": 5.912856135812051e-08, + "loss": 0.1191, + "step": 12198 + }, + { + "epoch": 0.9664488017429194, + "grad_norm": 1.8214917433769506, + "learning_rate": 5.8850255515200405e-08, + "loss": 0.1925, + "step": 12199 + }, + { + "epoch": 0.9665280253515548, + "grad_norm": 1.6687924048278213, + "learning_rate": 5.857260424915634e-08, + "loss": 0.1715, + "step": 12200 + }, + { + "epoch": 0.9666072489601901, + "grad_norm": 1.9601689205495383, + "learning_rate": 5.8295607578272575e-08, + "loss": 0.1881, + "step": 12201 + }, + { + "epoch": 0.9666864725688256, + "grad_norm": 1.7156119412555264, + "learning_rate": 5.801926552078563e-08, + "loss": 0.1296, + "step": 12202 + }, + { + "epoch": 0.9667656961774609, + "grad_norm": 1.492349812715188, + "learning_rate": 5.774357809489317e-08, + "loss": 0.1194, + "step": 12203 + }, + { + "epoch": 0.9668449197860962, + "grad_norm": 1.5796337554619697, + "learning_rate": 5.746854531874624e-08, + "loss": 0.1766, + "step": 12204 + }, + { + "epoch": 0.9669241433947316, + "grad_norm": 1.3449913608313366, + "learning_rate": 5.7194167210454785e-08, + "loss": 0.1081, + "step": 12205 + }, + { + "epoch": 0.967003367003367, + "grad_norm": 1.7063251682300757, + "learning_rate": 5.692044378808659e-08, + "loss": 0.1278, + "step": 12206 + }, + { + "epoch": 0.9670825906120024, + "grad_norm": 1.435370766426812, + "learning_rate": 5.664737506966389e-08, + "loss": 0.1511, + "step": 12207 + }, + { + "epoch": 0.9671618142206377, + "grad_norm": 1.742852558468855, + "learning_rate": 5.6374961073166757e-08, + "loss": 0.1953, + "step": 12208 + }, + { + "epoch": 0.9672410378292732, + "grad_norm": 1.4578707634976182, + "learning_rate": 5.610320181653306e-08, + "loss": 0.153, + "step": 12209 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 1.5743535480838153, + "learning_rate": 5.583209731765626e-08, + "loss": 0.1118, + "step": 12210 + }, + { + "epoch": 0.9673994850465438, + "grad_norm": 1.6567126193519128, + "learning_rate": 5.5561647594388756e-08, + "loss": 0.1464, + "step": 12211 + }, + { + "epoch": 0.9674787086551793, + "grad_norm": 1.6756614832940175, + "learning_rate": 5.529185266453629e-08, + "loss": 0.1592, + "step": 12212 + }, + { + "epoch": 0.9675579322638146, + "grad_norm": 1.4557702155772512, + "learning_rate": 5.502271254586356e-08, + "loss": 0.101, + "step": 12213 + }, + { + "epoch": 0.96763715587245, + "grad_norm": 1.5122756097834296, + "learning_rate": 5.4754227256094136e-08, + "loss": 0.129, + "step": 12214 + }, + { + "epoch": 0.9677163794810854, + "grad_norm": 1.1905356724898708, + "learning_rate": 5.4486396812906125e-08, + "loss": 0.0793, + "step": 12215 + }, + { + "epoch": 0.9677956030897208, + "grad_norm": 1.5804237242200136, + "learning_rate": 5.421922123393208e-08, + "loss": 0.1969, + "step": 12216 + }, + { + "epoch": 0.9678748266983561, + "grad_norm": 1.5749341215106953, + "learning_rate": 5.395270053676793e-08, + "loss": 0.1146, + "step": 12217 + }, + { + "epoch": 0.9679540503069914, + "grad_norm": 1.744983224373391, + "learning_rate": 5.3686834738960744e-08, + "loss": 0.1874, + "step": 12218 + }, + { + "epoch": 0.9680332739156269, + "grad_norm": 1.4598429596910647, + "learning_rate": 5.3421623858016525e-08, + "loss": 0.1597, + "step": 12219 + }, + { + "epoch": 0.9681124975242622, + "grad_norm": 1.3191276517370558, + "learning_rate": 5.3157067911399076e-08, + "loss": 0.1136, + "step": 12220 + }, + { + "epoch": 0.9681917211328976, + "grad_norm": 1.6665480109621238, + "learning_rate": 5.289316691652668e-08, + "loss": 0.16, + "step": 12221 + }, + { + "epoch": 0.968270944741533, + "grad_norm": 1.6830878361407333, + "learning_rate": 5.2629920890777676e-08, + "loss": 0.1563, + "step": 12222 + }, + { + "epoch": 0.9683501683501684, + "grad_norm": 1.4552761614082275, + "learning_rate": 5.236732985148374e-08, + "loss": 0.1392, + "step": 12223 + }, + { + "epoch": 0.9684293919588037, + "grad_norm": 1.6979146159028426, + "learning_rate": 5.21053938159366e-08, + "loss": 0.1195, + "step": 12224 + }, + { + "epoch": 0.9685086155674391, + "grad_norm": 1.6490534324196082, + "learning_rate": 5.1844112801383576e-08, + "loss": 0.1387, + "step": 12225 + }, + { + "epoch": 0.9685878391760745, + "grad_norm": 1.6727387044155833, + "learning_rate": 5.158348682502756e-08, + "loss": 0.1632, + "step": 12226 + }, + { + "epoch": 0.9686670627847098, + "grad_norm": 1.9233115750074272, + "learning_rate": 5.1323515904031506e-08, + "loss": 0.218, + "step": 12227 + }, + { + "epoch": 0.9687462863933453, + "grad_norm": 2.0701705368404735, + "learning_rate": 5.1064200055510606e-08, + "loss": 0.1902, + "step": 12228 + }, + { + "epoch": 0.9688255100019806, + "grad_norm": 1.7573631797742955, + "learning_rate": 5.080553929654119e-08, + "loss": 0.1573, + "step": 12229 + }, + { + "epoch": 0.9689047336106159, + "grad_norm": 1.4248217123443074, + "learning_rate": 5.05475336441541e-08, + "loss": 0.0989, + "step": 12230 + }, + { + "epoch": 0.9689839572192513, + "grad_norm": 1.7188010926404635, + "learning_rate": 5.0290183115339065e-08, + "loss": 0.1603, + "step": 12231 + }, + { + "epoch": 0.9690631808278867, + "grad_norm": 2.0484794131566724, + "learning_rate": 5.003348772704031e-08, + "loss": 0.128, + "step": 12232 + }, + { + "epoch": 0.9691424044365221, + "grad_norm": 1.8423067705510663, + "learning_rate": 4.977744749615987e-08, + "loss": 0.1826, + "step": 12233 + }, + { + "epoch": 0.9692216280451574, + "grad_norm": 1.3726862045873143, + "learning_rate": 4.9522062439557595e-08, + "loss": 0.1162, + "step": 12234 + }, + { + "epoch": 0.9693008516537929, + "grad_norm": 1.2156007068105075, + "learning_rate": 4.926733257404892e-08, + "loss": 0.0933, + "step": 12235 + }, + { + "epoch": 0.9693800752624282, + "grad_norm": 2.118271358406553, + "learning_rate": 4.901325791640599e-08, + "loss": 0.2133, + "step": 12236 + }, + { + "epoch": 0.9694592988710635, + "grad_norm": 1.4316443331628415, + "learning_rate": 4.8759838483358745e-08, + "loss": 0.1363, + "step": 12237 + }, + { + "epoch": 0.969538522479699, + "grad_norm": 2.231133858446585, + "learning_rate": 4.850707429159496e-08, + "loss": 0.1442, + "step": 12238 + }, + { + "epoch": 0.9696177460883343, + "grad_norm": 1.858483934633542, + "learning_rate": 4.825496535775576e-08, + "loss": 0.2662, + "step": 12239 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 1.6228771538490545, + "learning_rate": 4.800351169844231e-08, + "loss": 0.276, + "step": 12240 + }, + { + "epoch": 0.969776193305605, + "grad_norm": 1.9197277747048371, + "learning_rate": 4.7752713330212475e-08, + "loss": 0.1544, + "step": 12241 + }, + { + "epoch": 0.9698554169142405, + "grad_norm": 1.5942272946424842, + "learning_rate": 4.7502570269578605e-08, + "loss": 0.179, + "step": 12242 + }, + { + "epoch": 0.9699346405228758, + "grad_norm": 1.176154300957126, + "learning_rate": 4.725308253301197e-08, + "loss": 0.1173, + "step": 12243 + }, + { + "epoch": 0.9700138641315111, + "grad_norm": 1.473245821437402, + "learning_rate": 4.7004250136940547e-08, + "loss": 0.1358, + "step": 12244 + }, + { + "epoch": 0.9700930877401466, + "grad_norm": 1.6579333250109767, + "learning_rate": 4.675607309774899e-08, + "loss": 0.1719, + "step": 12245 + }, + { + "epoch": 0.9701723113487819, + "grad_norm": 1.3872893085772509, + "learning_rate": 4.650855143177757e-08, + "loss": 0.1332, + "step": 12246 + }, + { + "epoch": 0.9702515349574173, + "grad_norm": 1.6763678585168633, + "learning_rate": 4.626168515532548e-08, + "loss": 0.164, + "step": 12247 + }, + { + "epoch": 0.9703307585660527, + "grad_norm": 1.3758037686984572, + "learning_rate": 4.6015474284646366e-08, + "loss": 0.1185, + "step": 12248 + }, + { + "epoch": 0.9704099821746881, + "grad_norm": 1.4678518549071355, + "learning_rate": 4.576991883595283e-08, + "loss": 0.1258, + "step": 12249 + }, + { + "epoch": 0.9704892057833234, + "grad_norm": 1.6874343154693545, + "learning_rate": 4.5525018825414157e-08, + "loss": 0.1613, + "step": 12250 + }, + { + "epoch": 0.9705684293919588, + "grad_norm": 1.923420199133331, + "learning_rate": 4.528077426915412e-08, + "loss": 0.1577, + "step": 12251 + }, + { + "epoch": 0.9706476530005942, + "grad_norm": 1.9357821043678036, + "learning_rate": 4.50371851832565e-08, + "loss": 0.1686, + "step": 12252 + }, + { + "epoch": 0.9707268766092295, + "grad_norm": 1.6241608989308536, + "learning_rate": 4.4794251583759604e-08, + "loss": 0.1544, + "step": 12253 + }, + { + "epoch": 0.970806100217865, + "grad_norm": 1.8138886908027003, + "learning_rate": 4.4551973486660625e-08, + "loss": 0.1341, + "step": 12254 + }, + { + "epoch": 0.9708853238265003, + "grad_norm": 1.6765462657123031, + "learning_rate": 4.431035090791125e-08, + "loss": 0.2141, + "step": 12255 + }, + { + "epoch": 0.9709645474351357, + "grad_norm": 1.2930391348615318, + "learning_rate": 4.4069383863420966e-08, + "loss": 0.1158, + "step": 12256 + }, + { + "epoch": 0.971043771043771, + "grad_norm": 2.317494027905004, + "learning_rate": 4.38290723690582e-08, + "loss": 0.2502, + "step": 12257 + }, + { + "epoch": 0.9711229946524064, + "grad_norm": 1.2881238163666477, + "learning_rate": 4.3589416440643626e-08, + "loss": 0.0993, + "step": 12258 + }, + { + "epoch": 0.9712022182610418, + "grad_norm": 1.2386644075052187, + "learning_rate": 4.335041609396018e-08, + "loss": 0.0794, + "step": 12259 + }, + { + "epoch": 0.9712814418696771, + "grad_norm": 1.4977607827020032, + "learning_rate": 4.3112071344741935e-08, + "loss": 0.122, + "step": 12260 + }, + { + "epoch": 0.9713606654783126, + "grad_norm": 1.7435984247342418, + "learning_rate": 4.287438220868523e-08, + "loss": 0.1473, + "step": 12261 + }, + { + "epoch": 0.9714398890869479, + "grad_norm": 1.672742217943731, + "learning_rate": 4.263734870143976e-08, + "loss": 0.1694, + "step": 12262 + }, + { + "epoch": 0.9715191126955833, + "grad_norm": 1.5515555579461853, + "learning_rate": 4.2400970838613057e-08, + "loss": 0.1667, + "step": 12263 + }, + { + "epoch": 0.9715983363042187, + "grad_norm": 1.4113169346013799, + "learning_rate": 4.216524863576932e-08, + "loss": 0.123, + "step": 12264 + }, + { + "epoch": 0.971677559912854, + "grad_norm": 1.4795753906771039, + "learning_rate": 4.1930182108430584e-08, + "loss": 0.1157, + "step": 12265 + }, + { + "epoch": 0.9717567835214894, + "grad_norm": 1.4842462040286233, + "learning_rate": 4.1695771272073357e-08, + "loss": 0.0996, + "step": 12266 + }, + { + "epoch": 0.9718360071301247, + "grad_norm": 1.5847326353624451, + "learning_rate": 4.146201614213419e-08, + "loss": 0.1538, + "step": 12267 + }, + { + "epoch": 0.9719152307387602, + "grad_norm": 1.7115599138195556, + "learning_rate": 4.1228916734002976e-08, + "loss": 0.1423, + "step": 12268 + }, + { + "epoch": 0.9719944543473955, + "grad_norm": 1.922178548584178, + "learning_rate": 4.099647306302856e-08, + "loss": 0.255, + "step": 12269 + }, + { + "epoch": 0.972073677956031, + "grad_norm": 1.664771845151745, + "learning_rate": 4.076468514451759e-08, + "loss": 0.1712, + "step": 12270 + }, + { + "epoch": 0.9721529015646663, + "grad_norm": 1.4457877086651802, + "learning_rate": 4.0533552993731186e-08, + "loss": 0.1139, + "step": 12271 + }, + { + "epoch": 0.9722321251733016, + "grad_norm": 1.5502760742015664, + "learning_rate": 4.030307662588939e-08, + "loss": 0.1346, + "step": 12272 + }, + { + "epoch": 0.972311348781937, + "grad_norm": 1.2962776593766618, + "learning_rate": 4.007325605616563e-08, + "loss": 0.1058, + "step": 12273 + }, + { + "epoch": 0.9723905723905724, + "grad_norm": 1.9403016994656344, + "learning_rate": 3.9844091299694466e-08, + "loss": 0.1974, + "step": 12274 + }, + { + "epoch": 0.9724697959992078, + "grad_norm": 1.9776788389513635, + "learning_rate": 3.961558237156493e-08, + "loss": 0.1763, + "step": 12275 + }, + { + "epoch": 0.9725490196078431, + "grad_norm": 1.4671565254664445, + "learning_rate": 3.9387729286821666e-08, + "loss": 0.1046, + "step": 12276 + }, + { + "epoch": 0.9726282432164786, + "grad_norm": 1.485582103943266, + "learning_rate": 3.9160532060470435e-08, + "loss": 0.1522, + "step": 12277 + }, + { + "epoch": 0.9727074668251139, + "grad_norm": 1.4942999489691178, + "learning_rate": 3.893399070746928e-08, + "loss": 0.145, + "step": 12278 + }, + { + "epoch": 0.9727866904337492, + "grad_norm": 1.80356809190626, + "learning_rate": 3.870810524273516e-08, + "loss": 0.1752, + "step": 12279 + }, + { + "epoch": 0.9728659140423846, + "grad_norm": 2.113860172918961, + "learning_rate": 3.8482875681140616e-08, + "loss": 0.1395, + "step": 12280 + }, + { + "epoch": 0.97294513765102, + "grad_norm": 1.8215794032153978, + "learning_rate": 3.8258302037518234e-08, + "loss": 0.1625, + "step": 12281 + }, + { + "epoch": 0.9730243612596554, + "grad_norm": 1.4758744946163298, + "learning_rate": 3.803438432665396e-08, + "loss": 0.146, + "step": 12282 + }, + { + "epoch": 0.9731035848682907, + "grad_norm": 1.2525440344160492, + "learning_rate": 3.781112256329045e-08, + "loss": 0.1088, + "step": 12283 + }, + { + "epoch": 0.9731828084769262, + "grad_norm": 2.0848807064143045, + "learning_rate": 3.758851676213038e-08, + "loss": 0.1802, + "step": 12284 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 1.8031336795280852, + "learning_rate": 3.7366566937829804e-08, + "loss": 0.1865, + "step": 12285 + }, + { + "epoch": 0.9733412556941968, + "grad_norm": 1.558698712066729, + "learning_rate": 3.714527310500371e-08, + "loss": 0.1664, + "step": 12286 + }, + { + "epoch": 0.9734204793028323, + "grad_norm": 1.6940523142881616, + "learning_rate": 3.692463527822376e-08, + "loss": 0.1948, + "step": 12287 + }, + { + "epoch": 0.9734997029114676, + "grad_norm": 1.4824452484540096, + "learning_rate": 3.670465347201724e-08, + "loss": 0.2134, + "step": 12288 + }, + { + "epoch": 0.973578926520103, + "grad_norm": 1.8051065737107204, + "learning_rate": 3.6485327700869214e-08, + "loss": 0.1553, + "step": 12289 + }, + { + "epoch": 0.9736581501287384, + "grad_norm": 1.5249407529640326, + "learning_rate": 3.6266657979220356e-08, + "loss": 0.167, + "step": 12290 + }, + { + "epoch": 0.9737373737373738, + "grad_norm": 1.652902783417573, + "learning_rate": 3.604864432147026e-08, + "loss": 0.1402, + "step": 12291 + }, + { + "epoch": 0.9738165973460091, + "grad_norm": 1.4538661599255542, + "learning_rate": 3.5831286741973006e-08, + "loss": 0.1032, + "step": 12292 + }, + { + "epoch": 0.9738958209546444, + "grad_norm": 1.5529578926696304, + "learning_rate": 3.561458525504047e-08, + "loss": 0.143, + "step": 12293 + }, + { + "epoch": 0.9739750445632799, + "grad_norm": 1.6498829464772267, + "learning_rate": 3.539853987494235e-08, + "loss": 0.1721, + "step": 12294 + }, + { + "epoch": 0.9740542681719152, + "grad_norm": 1.0127699652748523, + "learning_rate": 3.518315061590394e-08, + "loss": 0.0764, + "step": 12295 + }, + { + "epoch": 0.9741334917805506, + "grad_norm": 1.467772557606543, + "learning_rate": 3.496841749210722e-08, + "loss": 0.0901, + "step": 12296 + }, + { + "epoch": 0.974212715389186, + "grad_norm": 1.38345115040309, + "learning_rate": 3.4754340517691996e-08, + "loss": 0.1245, + "step": 12297 + }, + { + "epoch": 0.9742919389978214, + "grad_norm": 1.6873371470936867, + "learning_rate": 3.454091970675366e-08, + "loss": 0.1666, + "step": 12298 + }, + { + "epoch": 0.9743711626064567, + "grad_norm": 1.7562563910055584, + "learning_rate": 3.4328155073344306e-08, + "loss": 0.1563, + "step": 12299 + }, + { + "epoch": 0.9744503862150921, + "grad_norm": 1.7035885288201764, + "learning_rate": 3.411604663147494e-08, + "loss": 0.1796, + "step": 12300 + }, + { + "epoch": 0.9745296098237275, + "grad_norm": 1.6751344057612154, + "learning_rate": 3.3904594395111066e-08, + "loss": 0.1512, + "step": 12301 + }, + { + "epoch": 0.9746088334323628, + "grad_norm": 1.729286171717593, + "learning_rate": 3.369379837817599e-08, + "loss": 0.1698, + "step": 12302 + }, + { + "epoch": 0.9746880570409983, + "grad_norm": 1.8535654306516638, + "learning_rate": 3.3483658594548606e-08, + "loss": 0.1227, + "step": 12303 + }, + { + "epoch": 0.9747672806496336, + "grad_norm": 1.5419905694767184, + "learning_rate": 3.327417505806785e-08, + "loss": 0.1009, + "step": 12304 + }, + { + "epoch": 0.9748465042582689, + "grad_norm": 1.3946092062800934, + "learning_rate": 3.30653477825249e-08, + "loss": 0.1402, + "step": 12305 + }, + { + "epoch": 0.9749257278669043, + "grad_norm": 1.7567383203217113, + "learning_rate": 3.2857176781671e-08, + "loss": 0.156, + "step": 12306 + }, + { + "epoch": 0.9750049514755397, + "grad_norm": 1.3206105291682744, + "learning_rate": 3.264966206921294e-08, + "loss": 0.1254, + "step": 12307 + }, + { + "epoch": 0.9750841750841751, + "grad_norm": 1.1594276730411217, + "learning_rate": 3.244280365881536e-08, + "loss": 0.1018, + "step": 12308 + }, + { + "epoch": 0.9751633986928104, + "grad_norm": 2.0649529144558434, + "learning_rate": 3.223660156409847e-08, + "loss": 0.2217, + "step": 12309 + }, + { + "epoch": 0.9752426223014459, + "grad_norm": 1.5251856628053613, + "learning_rate": 3.203105579863919e-08, + "loss": 0.133, + "step": 12310 + }, + { + "epoch": 0.9753218459100812, + "grad_norm": 1.7406985643029933, + "learning_rate": 3.1826166375972246e-08, + "loss": 0.1759, + "step": 12311 + }, + { + "epoch": 0.9754010695187165, + "grad_norm": 2.054767277732287, + "learning_rate": 3.162193330958796e-08, + "loss": 0.165, + "step": 12312 + }, + { + "epoch": 0.975480293127352, + "grad_norm": 1.5064493767195608, + "learning_rate": 3.141835661293557e-08, + "loss": 0.1337, + "step": 12313 + }, + { + "epoch": 0.9755595167359873, + "grad_norm": 1.1727129082290657, + "learning_rate": 3.12154362994177e-08, + "loss": 0.0985, + "step": 12314 + }, + { + "epoch": 0.9756387403446227, + "grad_norm": 1.3346813257715266, + "learning_rate": 3.1013172382396984e-08, + "loss": 0.079, + "step": 12315 + }, + { + "epoch": 0.975717963953258, + "grad_norm": 1.537474562921926, + "learning_rate": 3.0811564875190544e-08, + "loss": 0.1364, + "step": 12316 + }, + { + "epoch": 0.9757971875618935, + "grad_norm": 1.5185982233065618, + "learning_rate": 3.061061379107555e-08, + "loss": 0.1302, + "step": 12317 + }, + { + "epoch": 0.9758764111705288, + "grad_norm": 1.649497239936244, + "learning_rate": 3.04103191432803e-08, + "loss": 0.1393, + "step": 12318 + }, + { + "epoch": 0.9759556347791641, + "grad_norm": 1.7964027018384028, + "learning_rate": 3.0210680944995354e-08, + "loss": 0.1821, + "step": 12319 + }, + { + "epoch": 0.9760348583877996, + "grad_norm": 1.3749182469481005, + "learning_rate": 3.001169920936575e-08, + "loss": 0.117, + "step": 12320 + }, + { + "epoch": 0.9761140819964349, + "grad_norm": 1.7813499193635711, + "learning_rate": 2.981337394949324e-08, + "loss": 0.1781, + "step": 12321 + }, + { + "epoch": 0.9761933056050703, + "grad_norm": 1.8254876787685672, + "learning_rate": 2.961570517843626e-08, + "loss": 0.1719, + "step": 12322 + }, + { + "epoch": 0.9762725292137057, + "grad_norm": 1.5373118917235846, + "learning_rate": 2.9418692909211066e-08, + "loss": 0.0946, + "step": 12323 + }, + { + "epoch": 0.9763517528223411, + "grad_norm": 2.294324286088914, + "learning_rate": 2.9222337154789504e-08, + "loss": 0.1841, + "step": 12324 + }, + { + "epoch": 0.9764309764309764, + "grad_norm": 1.7009510254794615, + "learning_rate": 2.902663792810012e-08, + "loss": 0.1744, + "step": 12325 + }, + { + "epoch": 0.9765102000396118, + "grad_norm": 1.3767262510544032, + "learning_rate": 2.8831595242030387e-08, + "loss": 0.1186, + "step": 12326 + }, + { + "epoch": 0.9765894236482472, + "grad_norm": 1.7998842911251063, + "learning_rate": 2.863720910942114e-08, + "loss": 0.1188, + "step": 12327 + }, + { + "epoch": 0.9766686472568825, + "grad_norm": 1.2628103947452436, + "learning_rate": 2.8443479543073248e-08, + "loss": 0.1157, + "step": 12328 + }, + { + "epoch": 0.976747870865518, + "grad_norm": 1.6706260728083613, + "learning_rate": 2.825040655574207e-08, + "loss": 0.1096, + "step": 12329 + }, + { + "epoch": 0.9768270944741533, + "grad_norm": 1.9990557068158898, + "learning_rate": 2.8057990160139658e-08, + "loss": 0.2288, + "step": 12330 + }, + { + "epoch": 0.9769063180827887, + "grad_norm": 1.7579280682421616, + "learning_rate": 2.7866230368936986e-08, + "loss": 0.1146, + "step": 12331 + }, + { + "epoch": 0.976985541691424, + "grad_norm": 1.7279991057868305, + "learning_rate": 2.767512719476062e-08, + "loss": 0.1875, + "step": 12332 + }, + { + "epoch": 0.9770647653000594, + "grad_norm": 1.3687024420762037, + "learning_rate": 2.7484680650193827e-08, + "loss": 0.1241, + "step": 12333 + }, + { + "epoch": 0.9771439889086948, + "grad_norm": 1.4081058281509866, + "learning_rate": 2.729489074777547e-08, + "loss": 0.1209, + "step": 12334 + }, + { + "epoch": 0.9772232125173301, + "grad_norm": 1.7023883912598565, + "learning_rate": 2.7105757500002215e-08, + "loss": 0.1282, + "step": 12335 + }, + { + "epoch": 0.9773024361259656, + "grad_norm": 1.7665546038957183, + "learning_rate": 2.6917280919329656e-08, + "loss": 0.2054, + "step": 12336 + }, + { + "epoch": 0.9773816597346009, + "grad_norm": 1.386720657581816, + "learning_rate": 2.6729461018166758e-08, + "loss": 0.096, + "step": 12337 + }, + { + "epoch": 0.9774608833432363, + "grad_norm": 1.2272890528464564, + "learning_rate": 2.654229780887918e-08, + "loss": 0.0873, + "step": 12338 + }, + { + "epoch": 0.9775401069518717, + "grad_norm": 2.0200800657909475, + "learning_rate": 2.6355791303792622e-08, + "loss": 0.1253, + "step": 12339 + }, + { + "epoch": 0.977619330560507, + "grad_norm": 2.11011497374221, + "learning_rate": 2.6169941515188368e-08, + "loss": 0.2692, + "step": 12340 + }, + { + "epoch": 0.9776985541691424, + "grad_norm": 1.686480819313864, + "learning_rate": 2.5984748455301077e-08, + "loss": 0.1388, + "step": 12341 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 1.7632877333272383, + "learning_rate": 2.5800212136326552e-08, + "loss": 0.1498, + "step": 12342 + }, + { + "epoch": 0.9778570013864132, + "grad_norm": 1.4215797813800939, + "learning_rate": 2.561633257041507e-08, + "loss": 0.1692, + "step": 12343 + }, + { + "epoch": 0.9779362249950485, + "grad_norm": 1.8101549045097034, + "learning_rate": 2.5433109769674724e-08, + "loss": 0.224, + "step": 12344 + }, + { + "epoch": 0.978015448603684, + "grad_norm": 1.5023008720450823, + "learning_rate": 2.52505437461692e-08, + "loss": 0.1343, + "step": 12345 + }, + { + "epoch": 0.9780946722123193, + "grad_norm": 1.465131783218658, + "learning_rate": 2.5068634511919986e-08, + "loss": 0.1606, + "step": 12346 + }, + { + "epoch": 0.9781738958209546, + "grad_norm": 1.5005017965152394, + "learning_rate": 2.4887382078905287e-08, + "loss": 0.1081, + "step": 12347 + }, + { + "epoch": 0.97825311942959, + "grad_norm": 2.3065767553961813, + "learning_rate": 2.4706786459058885e-08, + "loss": 0.228, + "step": 12348 + }, + { + "epoch": 0.9783323430382254, + "grad_norm": 1.5593496311131996, + "learning_rate": 2.4526847664273488e-08, + "loss": 0.1612, + "step": 12349 + }, + { + "epoch": 0.9784115666468608, + "grad_norm": 1.4836981297066107, + "learning_rate": 2.434756570639518e-08, + "loss": 0.109, + "step": 12350 + }, + { + "epoch": 0.9784907902554961, + "grad_norm": 1.7853589878744967, + "learning_rate": 2.4168940597230074e-08, + "loss": 0.2265, + "step": 12351 + }, + { + "epoch": 0.9785700138641316, + "grad_norm": 1.335425438605114, + "learning_rate": 2.3990972348539864e-08, + "loss": 0.1426, + "step": 12352 + }, + { + "epoch": 0.9786492374727669, + "grad_norm": 1.6431303660377838, + "learning_rate": 2.381366097204296e-08, + "loss": 0.1716, + "step": 12353 + }, + { + "epoch": 0.9787284610814022, + "grad_norm": 1.7708768598609295, + "learning_rate": 2.363700647941336e-08, + "loss": 0.1647, + "step": 12354 + }, + { + "epoch": 0.9788076846900376, + "grad_norm": 1.2547347260398576, + "learning_rate": 2.3461008882283977e-08, + "loss": 0.095, + "step": 12355 + }, + { + "epoch": 0.978886908298673, + "grad_norm": 1.517382104842785, + "learning_rate": 2.3285668192243317e-08, + "loss": 0.1025, + "step": 12356 + }, + { + "epoch": 0.9789661319073084, + "grad_norm": 1.506392849719763, + "learning_rate": 2.311098442083659e-08, + "loss": 0.1622, + "step": 12357 + }, + { + "epoch": 0.9790453555159437, + "grad_norm": 1.5891836980254153, + "learning_rate": 2.293695757956571e-08, + "loss": 0.116, + "step": 12358 + }, + { + "epoch": 0.9791245791245792, + "grad_norm": 1.3760585934136587, + "learning_rate": 2.2763587679889288e-08, + "loss": 0.1604, + "step": 12359 + }, + { + "epoch": 0.9792038027332145, + "grad_norm": 1.713441982280558, + "learning_rate": 2.2590874733223744e-08, + "loss": 0.1372, + "step": 12360 + }, + { + "epoch": 0.9792830263418498, + "grad_norm": 1.407217576682397, + "learning_rate": 2.2418818750939986e-08, + "loss": 0.1398, + "step": 12361 + }, + { + "epoch": 0.9793622499504853, + "grad_norm": 2.05142013732917, + "learning_rate": 2.2247419744368946e-08, + "loss": 0.2117, + "step": 12362 + }, + { + "epoch": 0.9794414735591206, + "grad_norm": 1.7065494669617909, + "learning_rate": 2.207667772479494e-08, + "loss": 0.1743, + "step": 12363 + }, + { + "epoch": 0.979520697167756, + "grad_norm": 1.5762757522980348, + "learning_rate": 2.190659270346118e-08, + "loss": 0.1265, + "step": 12364 + }, + { + "epoch": 0.9795999207763914, + "grad_norm": 1.7896956344573591, + "learning_rate": 2.1737164691566502e-08, + "loss": 0.1998, + "step": 12365 + }, + { + "epoch": 0.9796791443850268, + "grad_norm": 1.5755686454952322, + "learning_rate": 2.156839370026753e-08, + "loss": 0.1253, + "step": 12366 + }, + { + "epoch": 0.9797583679936621, + "grad_norm": 1.7397258737336976, + "learning_rate": 2.140027974067649e-08, + "loss": 0.182, + "step": 12367 + }, + { + "epoch": 0.9798375916022974, + "grad_norm": 1.5180263662006157, + "learning_rate": 2.1232822823862297e-08, + "loss": 0.1158, + "step": 12368 + }, + { + "epoch": 0.9799168152109329, + "grad_norm": 1.8696905032132345, + "learning_rate": 2.1066022960852806e-08, + "loss": 0.1805, + "step": 12369 + }, + { + "epoch": 0.9799960388195682, + "grad_norm": 1.5758674838547122, + "learning_rate": 2.0899880162630336e-08, + "loss": 0.1713, + "step": 12370 + }, + { + "epoch": 0.9800752624282036, + "grad_norm": 1.6070949633931175, + "learning_rate": 2.073439444013392e-08, + "loss": 0.1055, + "step": 12371 + }, + { + "epoch": 0.980154486036839, + "grad_norm": 1.2616297100127591, + "learning_rate": 2.0569565804260393e-08, + "loss": 0.1423, + "step": 12372 + }, + { + "epoch": 0.9802337096454744, + "grad_norm": 1.7696098281385033, + "learning_rate": 2.04053942658633e-08, + "loss": 0.1422, + "step": 12373 + }, + { + "epoch": 0.9803129332541097, + "grad_norm": 2.3166446060251564, + "learning_rate": 2.0241879835752875e-08, + "loss": 0.2003, + "step": 12374 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.973956074444193, + "learning_rate": 2.0079022524694957e-08, + "loss": 0.0906, + "step": 12375 + }, + { + "epoch": 0.9804713804713805, + "grad_norm": 1.833521156966805, + "learning_rate": 1.991682234341208e-08, + "loss": 0.1436, + "step": 12376 + }, + { + "epoch": 0.9805506040800158, + "grad_norm": 1.7486733911926768, + "learning_rate": 1.9755279302585696e-08, + "loss": 0.1899, + "step": 12377 + }, + { + "epoch": 0.9806298276886513, + "grad_norm": 1.3368269980677352, + "learning_rate": 1.959439341285285e-08, + "loss": 0.1195, + "step": 12378 + }, + { + "epoch": 0.9807090512972866, + "grad_norm": 2.0364620521665926, + "learning_rate": 1.943416468480619e-08, + "loss": 0.2122, + "step": 12379 + }, + { + "epoch": 0.980788274905922, + "grad_norm": 1.7251592677506418, + "learning_rate": 1.9274593128996155e-08, + "loss": 0.1831, + "step": 12380 + }, + { + "epoch": 0.9808674985145573, + "grad_norm": 1.063597819311851, + "learning_rate": 1.9115678755929902e-08, + "loss": 0.103, + "step": 12381 + }, + { + "epoch": 0.9809467221231927, + "grad_norm": 1.4831017040020886, + "learning_rate": 1.8957421576071277e-08, + "loss": 0.1577, + "step": 12382 + }, + { + "epoch": 0.9810259457318281, + "grad_norm": 1.85113426373805, + "learning_rate": 1.879982159984084e-08, + "loss": 0.1607, + "step": 12383 + }, + { + "epoch": 0.9811051693404634, + "grad_norm": 1.6443047632513856, + "learning_rate": 1.864287883761695e-08, + "loss": 0.1806, + "step": 12384 + }, + { + "epoch": 0.9811843929490989, + "grad_norm": 1.1664488402340851, + "learning_rate": 1.8486593299730236e-08, + "loss": 0.0816, + "step": 12385 + }, + { + "epoch": 0.9812636165577342, + "grad_norm": 1.595847226597165, + "learning_rate": 1.8330964996474688e-08, + "loss": 0.1469, + "step": 12386 + }, + { + "epoch": 0.9813428401663695, + "grad_norm": 1.4409879254381188, + "learning_rate": 1.817599393809544e-08, + "loss": 0.1208, + "step": 12387 + }, + { + "epoch": 0.981422063775005, + "grad_norm": 1.387103115696771, + "learning_rate": 1.802168013479877e-08, + "loss": 0.122, + "step": 12388 + }, + { + "epoch": 0.9815012873836403, + "grad_norm": 1.647408554712536, + "learning_rate": 1.7868023596743224e-08, + "loss": 0.1859, + "step": 12389 + }, + { + "epoch": 0.9815805109922757, + "grad_norm": 1.8336037062497803, + "learning_rate": 1.771502433404737e-08, + "loss": 0.1536, + "step": 12390 + }, + { + "epoch": 0.981659734600911, + "grad_norm": 1.3492332601414139, + "learning_rate": 1.7562682356786488e-08, + "loss": 0.1375, + "step": 12391 + }, + { + "epoch": 0.9817389582095465, + "grad_norm": 1.3102297016720257, + "learning_rate": 1.7410997674989215e-08, + "loss": 0.0954, + "step": 12392 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 1.5943298437790678, + "learning_rate": 1.7259970298645345e-08, + "loss": 0.1417, + "step": 12393 + }, + { + "epoch": 0.9818974054268171, + "grad_norm": 1.4361320944307887, + "learning_rate": 1.7109600237698032e-08, + "loss": 0.1344, + "step": 12394 + }, + { + "epoch": 0.9819766290354526, + "grad_norm": 1.538976429551498, + "learning_rate": 1.6959887502049356e-08, + "loss": 0.1213, + "step": 12395 + }, + { + "epoch": 0.9820558526440879, + "grad_norm": 1.7975829501764093, + "learning_rate": 1.6810832101556984e-08, + "loss": 0.1758, + "step": 12396 + }, + { + "epoch": 0.9821350762527233, + "grad_norm": 1.5534587008111254, + "learning_rate": 1.666243404603529e-08, + "loss": 0.1174, + "step": 12397 + }, + { + "epoch": 0.9822142998613587, + "grad_norm": 1.3660759641984646, + "learning_rate": 1.651469334525424e-08, + "loss": 0.1243, + "step": 12398 + }, + { + "epoch": 0.9822935234699941, + "grad_norm": 1.268354649410221, + "learning_rate": 1.6367610008944935e-08, + "loss": 0.1036, + "step": 12399 + }, + { + "epoch": 0.9823727470786294, + "grad_norm": 1.6621867918107378, + "learning_rate": 1.622118404678963e-08, + "loss": 0.1546, + "step": 12400 + }, + { + "epoch": 0.9824519706872648, + "grad_norm": 1.7822762481260703, + "learning_rate": 1.607541546843061e-08, + "loss": 0.1702, + "step": 12401 + }, + { + "epoch": 0.9825311942959002, + "grad_norm": 1.5019228339204276, + "learning_rate": 1.593030428346576e-08, + "loss": 0.0975, + "step": 12402 + }, + { + "epoch": 0.9826104179045355, + "grad_norm": 1.5644568229964495, + "learning_rate": 1.578585050144965e-08, + "loss": 0.1381, + "step": 12403 + }, + { + "epoch": 0.982689641513171, + "grad_norm": 1.5760466205170158, + "learning_rate": 1.564205413189468e-08, + "loss": 0.1388, + "step": 12404 + }, + { + "epoch": 0.9827688651218063, + "grad_norm": 1.291388015400666, + "learning_rate": 1.5498915184268826e-08, + "loss": 0.1539, + "step": 12405 + }, + { + "epoch": 0.9828480887304417, + "grad_norm": 1.8307683536200092, + "learning_rate": 1.5356433667996772e-08, + "loss": 0.133, + "step": 12406 + }, + { + "epoch": 0.982927312339077, + "grad_norm": 1.5639722882261975, + "learning_rate": 1.5214609592461015e-08, + "loss": 0.1261, + "step": 12407 + }, + { + "epoch": 0.9830065359477124, + "grad_norm": 1.6015810161272572, + "learning_rate": 1.507344296699964e-08, + "loss": 0.1531, + "step": 12408 + }, + { + "epoch": 0.9830857595563478, + "grad_norm": 1.7458563521434858, + "learning_rate": 1.4932933800907435e-08, + "loss": 0.1959, + "step": 12409 + }, + { + "epoch": 0.9831649831649831, + "grad_norm": 1.8379263732882256, + "learning_rate": 1.4793082103435885e-08, + "loss": 0.2069, + "step": 12410 + }, + { + "epoch": 0.9832442067736186, + "grad_norm": 1.1615904573796025, + "learning_rate": 1.4653887883794293e-08, + "loss": 0.1017, + "step": 12411 + }, + { + "epoch": 0.9833234303822539, + "grad_norm": 1.5293867056843282, + "learning_rate": 1.451535115114866e-08, + "loss": 0.1311, + "step": 12412 + }, + { + "epoch": 0.9834026539908893, + "grad_norm": 1.7724390307381672, + "learning_rate": 1.4377471914619468e-08, + "loss": 0.163, + "step": 12413 + }, + { + "epoch": 0.9834818775995247, + "grad_norm": 1.5914676445095526, + "learning_rate": 1.424025018328612e-08, + "loss": 0.1193, + "step": 12414 + }, + { + "epoch": 0.98356110120816, + "grad_norm": 2.0343428409157074, + "learning_rate": 1.4103685966183612e-08, + "loss": 0.193, + "step": 12415 + }, + { + "epoch": 0.9836403248167954, + "grad_norm": 1.3555722208666934, + "learning_rate": 1.396777927230475e-08, + "loss": 0.1288, + "step": 12416 + }, + { + "epoch": 0.9837195484254307, + "grad_norm": 1.570240237481506, + "learning_rate": 1.383253011059682e-08, + "loss": 0.1382, + "step": 12417 + }, + { + "epoch": 0.9837987720340662, + "grad_norm": 1.4183520822735458, + "learning_rate": 1.3697938489967144e-08, + "loss": 0.1064, + "step": 12418 + }, + { + "epoch": 0.9838779956427015, + "grad_norm": 1.5676707529226253, + "learning_rate": 1.3564004419277522e-08, + "loss": 0.1482, + "step": 12419 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 1.627178275901783, + "learning_rate": 1.3430727907346453e-08, + "loss": 0.2793, + "step": 12420 + }, + { + "epoch": 0.9840364428599723, + "grad_norm": 2.039031538972755, + "learning_rate": 1.329810896294914e-08, + "loss": 0.1992, + "step": 12421 + }, + { + "epoch": 0.9841156664686076, + "grad_norm": 1.7038378864158576, + "learning_rate": 1.3166147594818601e-08, + "loss": 0.1794, + "step": 12422 + }, + { + "epoch": 0.984194890077243, + "grad_norm": 1.7026495195518299, + "learning_rate": 1.3034843811644548e-08, + "loss": 0.1396, + "step": 12423 + }, + { + "epoch": 0.9842741136858784, + "grad_norm": 1.9302224215795944, + "learning_rate": 1.290419762207007e-08, + "loss": 0.1891, + "step": 12424 + }, + { + "epoch": 0.9843533372945138, + "grad_norm": 1.3202966029866599, + "learning_rate": 1.2774209034700503e-08, + "loss": 0.129, + "step": 12425 + }, + { + "epoch": 0.9844325609031491, + "grad_norm": 1.455624966054213, + "learning_rate": 1.2644878058093446e-08, + "loss": 0.1179, + "step": 12426 + }, + { + "epoch": 0.9845117845117846, + "grad_norm": 1.7698781678304694, + "learning_rate": 1.2516204700765422e-08, + "loss": 0.1345, + "step": 12427 + }, + { + "epoch": 0.9845910081204199, + "grad_norm": 1.7969947142856464, + "learning_rate": 1.2388188971188542e-08, + "loss": 0.1764, + "step": 12428 + }, + { + "epoch": 0.9846702317290552, + "grad_norm": 2.1238329988796507, + "learning_rate": 1.2260830877792729e-08, + "loss": 0.1694, + "step": 12429 + }, + { + "epoch": 0.9847494553376906, + "grad_norm": 1.7992510263293502, + "learning_rate": 1.2134130428962387e-08, + "loss": 0.1812, + "step": 12430 + }, + { + "epoch": 0.984828678946326, + "grad_norm": 1.548033712911175, + "learning_rate": 1.2008087633040843e-08, + "loss": 0.1212, + "step": 12431 + }, + { + "epoch": 0.9849079025549614, + "grad_norm": 1.9677071585439214, + "learning_rate": 1.1882702498328125e-08, + "loss": 0.2039, + "step": 12432 + }, + { + "epoch": 0.9849871261635967, + "grad_norm": 1.4092562399212292, + "learning_rate": 1.175797503307874e-08, + "loss": 0.0864, + "step": 12433 + }, + { + "epoch": 0.9850663497722322, + "grad_norm": 2.134091849388255, + "learning_rate": 1.1633905245507227e-08, + "loss": 0.1488, + "step": 12434 + }, + { + "epoch": 0.9851455733808675, + "grad_norm": 1.894407133560039, + "learning_rate": 1.1510493143782609e-08, + "loss": 0.1336, + "step": 12435 + }, + { + "epoch": 0.9852247969895028, + "grad_norm": 2.08084771972134, + "learning_rate": 1.1387738736029496e-08, + "loss": 0.1832, + "step": 12436 + }, + { + "epoch": 0.9853040205981383, + "grad_norm": 1.7229281735764992, + "learning_rate": 1.1265642030331426e-08, + "loss": 0.1279, + "step": 12437 + }, + { + "epoch": 0.9853832442067736, + "grad_norm": 1.5428103897919618, + "learning_rate": 1.114420303472974e-08, + "loss": 0.1416, + "step": 12438 + }, + { + "epoch": 0.985462467815409, + "grad_norm": 1.041621391030963, + "learning_rate": 1.1023421757216934e-08, + "loss": 0.0667, + "step": 12439 + }, + { + "epoch": 0.9855416914240444, + "grad_norm": 1.1926698127427373, + "learning_rate": 1.090329820574887e-08, + "loss": 0.0907, + "step": 12440 + }, + { + "epoch": 0.9856209150326798, + "grad_norm": 1.7650891754571278, + "learning_rate": 1.0783832388234772e-08, + "loss": 0.2197, + "step": 12441 + }, + { + "epoch": 0.9857001386413151, + "grad_norm": 1.8156022016626312, + "learning_rate": 1.0665024312539462e-08, + "loss": 0.2318, + "step": 12442 + }, + { + "epoch": 0.9857793622499504, + "grad_norm": 1.4554797638441235, + "learning_rate": 1.0546873986486682e-08, + "loss": 0.1042, + "step": 12443 + }, + { + "epoch": 0.9858585858585859, + "grad_norm": 1.495279435464479, + "learning_rate": 1.0429381417856877e-08, + "loss": 0.164, + "step": 12444 + }, + { + "epoch": 0.9859378094672212, + "grad_norm": 1.5965361214574643, + "learning_rate": 1.0312546614384966e-08, + "loss": 0.1046, + "step": 12445 + }, + { + "epoch": 0.9860170330758566, + "grad_norm": 1.8227203231951694, + "learning_rate": 1.0196369583763688e-08, + "loss": 0.1967, + "step": 12446 + }, + { + "epoch": 0.986096256684492, + "grad_norm": 1.5944055995617108, + "learning_rate": 1.0080850333644698e-08, + "loss": 0.1369, + "step": 12447 + }, + { + "epoch": 0.9861754802931274, + "grad_norm": 1.5407824429045571, + "learning_rate": 9.965988871633025e-09, + "loss": 0.1143, + "step": 12448 + }, + { + "epoch": 0.9862547039017627, + "grad_norm": 1.5726821932994055, + "learning_rate": 9.851785205291508e-09, + "loss": 0.1395, + "step": 12449 + }, + { + "epoch": 0.9863339275103981, + "grad_norm": 1.5981491445323333, + "learning_rate": 9.738239342141909e-09, + "loss": 0.1665, + "step": 12450 + }, + { + "epoch": 0.9864131511190335, + "grad_norm": 1.8835536194709517, + "learning_rate": 9.625351289658247e-09, + "loss": 0.135, + "step": 12451 + }, + { + "epoch": 0.9864923747276688, + "grad_norm": 2.0981991982530874, + "learning_rate": 9.513121055273467e-09, + "loss": 0.1847, + "step": 12452 + }, + { + "epoch": 0.9865715983363043, + "grad_norm": 1.2161602171531127, + "learning_rate": 9.401548646380543e-09, + "loss": 0.0756, + "step": 12453 + }, + { + "epoch": 0.9866508219449396, + "grad_norm": 1.4432815565664276, + "learning_rate": 9.290634070322491e-09, + "loss": 0.1613, + "step": 12454 + }, + { + "epoch": 0.986730045553575, + "grad_norm": 1.4339986456789633, + "learning_rate": 9.180377334404577e-09, + "loss": 0.0987, + "step": 12455 + }, + { + "epoch": 0.9868092691622103, + "grad_norm": 1.5112757708050688, + "learning_rate": 9.070778445885442e-09, + "loss": 0.1678, + "step": 12456 + }, + { + "epoch": 0.9868884927708457, + "grad_norm": 1.5930466202038724, + "learning_rate": 8.961837411982643e-09, + "loss": 0.1492, + "step": 12457 + }, + { + "epoch": 0.9869677163794811, + "grad_norm": 1.5440770106530788, + "learning_rate": 8.853554239869333e-09, + "loss": 0.155, + "step": 12458 + }, + { + "epoch": 0.9870469399881164, + "grad_norm": 2.109187059235162, + "learning_rate": 8.745928936675363e-09, + "loss": 0.1759, + "step": 12459 + }, + { + "epoch": 0.9871261635967519, + "grad_norm": 1.5907348927131344, + "learning_rate": 8.638961509486177e-09, + "loss": 0.126, + "step": 12460 + }, + { + "epoch": 0.9872053872053872, + "grad_norm": 1.2728650579515424, + "learning_rate": 8.53265196534725e-09, + "loss": 0.1232, + "step": 12461 + }, + { + "epoch": 0.9872846108140226, + "grad_norm": 1.669839858049219, + "learning_rate": 8.427000311256317e-09, + "loss": 0.1334, + "step": 12462 + }, + { + "epoch": 0.987363834422658, + "grad_norm": 1.5214898269807555, + "learning_rate": 8.322006554171147e-09, + "loss": 0.1106, + "step": 12463 + }, + { + "epoch": 0.9874430580312933, + "grad_norm": 1.72843950399946, + "learning_rate": 8.217670701005098e-09, + "loss": 0.1556, + "step": 12464 + }, + { + "epoch": 0.9875222816399287, + "grad_norm": 1.873778461971483, + "learning_rate": 8.113992758628231e-09, + "loss": 0.1603, + "step": 12465 + }, + { + "epoch": 0.987601505248564, + "grad_norm": 1.8446210558315352, + "learning_rate": 8.010972733867306e-09, + "loss": 0.1642, + "step": 12466 + }, + { + "epoch": 0.9876807288571995, + "grad_norm": 1.745284742975174, + "learning_rate": 7.908610633504676e-09, + "loss": 0.1584, + "step": 12467 + }, + { + "epoch": 0.9877599524658348, + "grad_norm": 1.4580676008636861, + "learning_rate": 7.806906464281617e-09, + "loss": 0.1217, + "step": 12468 + }, + { + "epoch": 0.9878391760744701, + "grad_norm": 1.4265942620387675, + "learning_rate": 7.70586023289388e-09, + "loss": 0.0978, + "step": 12469 + }, + { + "epoch": 0.9879183996831056, + "grad_norm": 1.9317009627320343, + "learning_rate": 7.605471945996146e-09, + "loss": 0.1696, + "step": 12470 + }, + { + "epoch": 0.9879976232917409, + "grad_norm": 1.845378819060387, + "learning_rate": 7.50574161019757e-09, + "loss": 0.1224, + "step": 12471 + }, + { + "epoch": 0.9880768469003763, + "grad_norm": 1.3799040928303554, + "learning_rate": 7.406669232065122e-09, + "loss": 0.1563, + "step": 12472 + }, + { + "epoch": 0.9881560705090117, + "grad_norm": 1.7504875401876603, + "learning_rate": 7.3082548181213635e-09, + "loss": 0.1694, + "step": 12473 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 1.4143016346263686, + "learning_rate": 7.210498374848884e-09, + "loss": 0.1938, + "step": 12474 + }, + { + "epoch": 0.9883145177262824, + "grad_norm": 1.3466417506863884, + "learning_rate": 7.113399908681429e-09, + "loss": 0.1083, + "step": 12475 + }, + { + "epoch": 0.9883937413349178, + "grad_norm": 1.253325418021243, + "learning_rate": 7.016959426013881e-09, + "loss": 0.0888, + "step": 12476 + }, + { + "epoch": 0.9884729649435532, + "grad_norm": 1.5729106238270223, + "learning_rate": 6.9211769331978265e-09, + "loss": 0.1459, + "step": 12477 + }, + { + "epoch": 0.9885521885521885, + "grad_norm": 1.6735713565307373, + "learning_rate": 6.8260524365371115e-09, + "loss": 0.1461, + "step": 12478 + }, + { + "epoch": 0.988631412160824, + "grad_norm": 2.2715362557723915, + "learning_rate": 6.731585942297836e-09, + "loss": 0.216, + "step": 12479 + }, + { + "epoch": 0.9887106357694593, + "grad_norm": 1.5830046317645625, + "learning_rate": 6.637777456698358e-09, + "loss": 0.1153, + "step": 12480 + }, + { + "epoch": 0.9887898593780947, + "grad_norm": 1.908092672299486, + "learning_rate": 6.544626985915958e-09, + "loss": 0.1592, + "step": 12481 + }, + { + "epoch": 0.98886908298673, + "grad_norm": 1.569610161505841, + "learning_rate": 6.45213453608573e-09, + "loss": 0.1117, + "step": 12482 + }, + { + "epoch": 0.9889483065953654, + "grad_norm": 1.5408769875627841, + "learning_rate": 6.360300113295026e-09, + "loss": 0.1383, + "step": 12483 + }, + { + "epoch": 0.9890275302040008, + "grad_norm": 1.5454651306386455, + "learning_rate": 6.269123723593451e-09, + "loss": 0.132, + "step": 12484 + }, + { + "epoch": 0.9891067538126361, + "grad_norm": 1.5499991004134308, + "learning_rate": 6.178605372982871e-09, + "loss": 0.1151, + "step": 12485 + }, + { + "epoch": 0.9891859774212716, + "grad_norm": 2.1185039543375632, + "learning_rate": 6.088745067424073e-09, + "loss": 0.2297, + "step": 12486 + }, + { + "epoch": 0.9892652010299069, + "grad_norm": 1.695955702178954, + "learning_rate": 5.9995428128334365e-09, + "loss": 0.1199, + "step": 12487 + }, + { + "epoch": 0.9893444246385423, + "grad_norm": 1.5292472163103024, + "learning_rate": 5.910998615085151e-09, + "loss": 0.1172, + "step": 12488 + }, + { + "epoch": 0.9894236482471777, + "grad_norm": 1.697023058224961, + "learning_rate": 5.8231124800089965e-09, + "loss": 0.1455, + "step": 12489 + }, + { + "epoch": 0.989502871855813, + "grad_norm": 1.734252971230233, + "learning_rate": 5.735884413391457e-09, + "loss": 0.1408, + "step": 12490 + }, + { + "epoch": 0.9895820954644484, + "grad_norm": 1.6950596643042386, + "learning_rate": 5.6493144209768255e-09, + "loss": 0.1405, + "step": 12491 + }, + { + "epoch": 0.9896613190730837, + "grad_norm": 1.679907328225051, + "learning_rate": 5.5634025084660985e-09, + "loss": 0.1427, + "step": 12492 + }, + { + "epoch": 0.9897405426817192, + "grad_norm": 1.0661739850630583, + "learning_rate": 5.47814868151364e-09, + "loss": 0.0819, + "step": 12493 + }, + { + "epoch": 0.9898197662903545, + "grad_norm": 1.2144825197657225, + "learning_rate": 5.393552945736069e-09, + "loss": 0.0875, + "step": 12494 + }, + { + "epoch": 0.98989898989899, + "grad_norm": 1.6319387686024347, + "learning_rate": 5.309615306701155e-09, + "loss": 0.1605, + "step": 12495 + }, + { + "epoch": 0.9899782135076253, + "grad_norm": 1.7583045477991277, + "learning_rate": 5.226335769936697e-09, + "loss": 0.1448, + "step": 12496 + }, + { + "epoch": 0.9900574371162606, + "grad_norm": 2.2495382702872284, + "learning_rate": 5.143714340926087e-09, + "loss": 0.2549, + "step": 12497 + }, + { + "epoch": 0.990136660724896, + "grad_norm": 1.5806241613422494, + "learning_rate": 5.0617510251105284e-09, + "loss": 0.2028, + "step": 12498 + }, + { + "epoch": 0.9902158843335314, + "grad_norm": 1.5622916836858032, + "learning_rate": 4.980445827885705e-09, + "loss": 0.1557, + "step": 12499 + }, + { + "epoch": 0.9902951079421668, + "grad_norm": 1.4487181852219657, + "learning_rate": 4.899798754605112e-09, + "loss": 0.1301, + "step": 12500 + }, + { + "epoch": 0.9903743315508021, + "grad_norm": 1.5954246758334834, + "learning_rate": 4.819809810578946e-09, + "loss": 0.1808, + "step": 12501 + }, + { + "epoch": 0.9904535551594376, + "grad_norm": 1.8207106551207466, + "learning_rate": 4.740479001076326e-09, + "loss": 0.1737, + "step": 12502 + }, + { + "epoch": 0.9905327787680729, + "grad_norm": 1.6710375648548415, + "learning_rate": 4.66180633131752e-09, + "loss": 0.1613, + "step": 12503 + }, + { + "epoch": 0.9906120023767082, + "grad_norm": 1.955536958794469, + "learning_rate": 4.583791806485049e-09, + "loss": 0.1925, + "step": 12504 + }, + { + "epoch": 0.9906912259853436, + "grad_norm": 1.9307270194253165, + "learning_rate": 4.506435431714806e-09, + "loss": 0.1935, + "step": 12505 + }, + { + "epoch": 0.990770449593979, + "grad_norm": 2.031935580157722, + "learning_rate": 4.429737212100493e-09, + "loss": 0.186, + "step": 12506 + }, + { + "epoch": 0.9908496732026144, + "grad_norm": 1.4441024988029327, + "learning_rate": 4.353697152692515e-09, + "loss": 0.1164, + "step": 12507 + }, + { + "epoch": 0.9909288968112497, + "grad_norm": 1.4596490498243184, + "learning_rate": 4.278315258496868e-09, + "loss": 0.1011, + "step": 12508 + }, + { + "epoch": 0.9910081204198852, + "grad_norm": 1.9955289863048005, + "learning_rate": 4.203591534478468e-09, + "loss": 0.1808, + "step": 12509 + }, + { + "epoch": 0.9910873440285205, + "grad_norm": 1.879734776827009, + "learning_rate": 4.129525985556715e-09, + "loss": 0.1169, + "step": 12510 + }, + { + "epoch": 0.9911665676371558, + "grad_norm": 1.7022893611376544, + "learning_rate": 4.056118616608817e-09, + "loss": 0.1102, + "step": 12511 + }, + { + "epoch": 0.9912457912457913, + "grad_norm": 1.8708370037859146, + "learning_rate": 3.9833694324686864e-09, + "loss": 0.1633, + "step": 12512 + }, + { + "epoch": 0.9913250148544266, + "grad_norm": 1.6312338238760893, + "learning_rate": 3.9112784379247145e-09, + "loss": 0.2489, + "step": 12513 + }, + { + "epoch": 0.991404238463062, + "grad_norm": 1.1383080596143327, + "learning_rate": 3.839845637725326e-09, + "loss": 0.0918, + "step": 12514 + }, + { + "epoch": 0.9914834620716974, + "grad_norm": 1.1507404576028977, + "learning_rate": 3.769071036573424e-09, + "loss": 0.0815, + "step": 12515 + }, + { + "epoch": 0.9915626856803328, + "grad_norm": 1.43518976677598, + "learning_rate": 3.698954639129726e-09, + "loss": 0.1713, + "step": 12516 + }, + { + "epoch": 0.9916419092889681, + "grad_norm": 1.7404432219927013, + "learning_rate": 3.6294964500116492e-09, + "loss": 0.1717, + "step": 12517 + }, + { + "epoch": 0.9917211328976034, + "grad_norm": 1.4518134615651712, + "learning_rate": 3.560696473789982e-09, + "loss": 0.1663, + "step": 12518 + }, + { + "epoch": 0.9918003565062389, + "grad_norm": 1.6742078275419632, + "learning_rate": 3.4925547149977645e-09, + "loss": 0.1444, + "step": 12519 + }, + { + "epoch": 0.9918795801148742, + "grad_norm": 1.7398603438386264, + "learning_rate": 3.425071178120298e-09, + "loss": 0.1877, + "step": 12520 + }, + { + "epoch": 0.9919588037235096, + "grad_norm": 1.1020223644771245, + "learning_rate": 3.3582458676018058e-09, + "loss": 0.1069, + "step": 12521 + }, + { + "epoch": 0.992038027332145, + "grad_norm": 1.6777717194486836, + "learning_rate": 3.292078787842101e-09, + "loss": 0.1614, + "step": 12522 + }, + { + "epoch": 0.9921172509407804, + "grad_norm": 1.51816524195961, + "learning_rate": 3.226569943197699e-09, + "loss": 0.1083, + "step": 12523 + }, + { + "epoch": 0.9921964745494157, + "grad_norm": 1.421678240607462, + "learning_rate": 3.1617193379818167e-09, + "loss": 0.1169, + "step": 12524 + }, + { + "epoch": 0.9922756981580511, + "grad_norm": 1.557199992140855, + "learning_rate": 3.0975269764654816e-09, + "loss": 0.1036, + "step": 12525 + }, + { + "epoch": 0.9923549217666865, + "grad_norm": 1.983853466937719, + "learning_rate": 3.033992862875312e-09, + "loss": 0.1805, + "step": 12526 + }, + { + "epoch": 0.9924341453753218, + "grad_norm": 1.5473320986508203, + "learning_rate": 2.9711170013935196e-09, + "loss": 0.1455, + "step": 12527 + }, + { + "epoch": 0.9925133689839573, + "grad_norm": 1.853921675187228, + "learning_rate": 2.9088993961612355e-09, + "loss": 0.1819, + "step": 12528 + }, + { + "epoch": 0.9925925925925926, + "grad_norm": 1.2826764830920114, + "learning_rate": 2.8473400512762928e-09, + "loss": 0.0827, + "step": 12529 + }, + { + "epoch": 0.992671816201228, + "grad_norm": 1.5361954606160375, + "learning_rate": 2.7864389707887853e-09, + "loss": 0.1199, + "step": 12530 + }, + { + "epoch": 0.9927510398098633, + "grad_norm": 1.1710480172134994, + "learning_rate": 2.726196158712169e-09, + "loss": 0.0775, + "step": 12531 + }, + { + "epoch": 0.9928302634184987, + "grad_norm": 1.4398788573484873, + "learning_rate": 2.66661161901105e-09, + "loss": 0.1347, + "step": 12532 + }, + { + "epoch": 0.9929094870271341, + "grad_norm": 1.5783453334716187, + "learning_rate": 2.607685355610068e-09, + "loss": 0.208, + "step": 12533 + }, + { + "epoch": 0.9929887106357694, + "grad_norm": 1.6650130877913982, + "learning_rate": 2.549417372388341e-09, + "loss": 0.1571, + "step": 12534 + }, + { + "epoch": 0.9930679342444049, + "grad_norm": 1.7842176243273924, + "learning_rate": 2.4918076731828e-09, + "loss": 0.1509, + "step": 12535 + }, + { + "epoch": 0.9931471578530402, + "grad_norm": 1.2611477176380135, + "learning_rate": 2.434856261785967e-09, + "loss": 0.101, + "step": 12536 + }, + { + "epoch": 0.9932263814616756, + "grad_norm": 1.5007191209089834, + "learning_rate": 2.378563141949286e-09, + "loss": 0.1141, + "step": 12537 + }, + { + "epoch": 0.993305605070311, + "grad_norm": 1.4105135631721666, + "learning_rate": 2.322928317378681e-09, + "loss": 0.1244, + "step": 12538 + }, + { + "epoch": 0.9933848286789463, + "grad_norm": 1.680041564849387, + "learning_rate": 2.267951791737888e-09, + "loss": 0.1411, + "step": 12539 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 2.1454745653516962, + "learning_rate": 2.213633568646234e-09, + "loss": 0.2092, + "step": 12540 + }, + { + "epoch": 0.993543275896217, + "grad_norm": 1.3865966803231098, + "learning_rate": 2.1599736516808577e-09, + "loss": 0.1322, + "step": 12541 + }, + { + "epoch": 0.9936224995048525, + "grad_norm": 1.0853765039685164, + "learning_rate": 2.106972044373379e-09, + "loss": 0.084, + "step": 12542 + }, + { + "epoch": 0.9937017231134878, + "grad_norm": 2.5082567214771276, + "learning_rate": 2.0546287502165583e-09, + "loss": 0.1972, + "step": 12543 + }, + { + "epoch": 0.9937809467221231, + "grad_norm": 1.515542335321475, + "learning_rate": 2.002943772654309e-09, + "loss": 0.1388, + "step": 12544 + }, + { + "epoch": 0.9938601703307586, + "grad_norm": 1.642570156987105, + "learning_rate": 1.951917115091684e-09, + "loss": 0.1968, + "step": 12545 + }, + { + "epoch": 0.9939393939393939, + "grad_norm": 1.886974594798946, + "learning_rate": 1.901548780887108e-09, + "loss": 0.1578, + "step": 12546 + }, + { + "epoch": 0.9940186175480293, + "grad_norm": 1.537882308490948, + "learning_rate": 1.851838773357928e-09, + "loss": 0.1622, + "step": 12547 + }, + { + "epoch": 0.9940978411566647, + "grad_norm": 1.4636521504651094, + "learning_rate": 1.8027870957781912e-09, + "loss": 0.1073, + "step": 12548 + }, + { + "epoch": 0.9941770647653001, + "grad_norm": 1.7547111066154795, + "learning_rate": 1.7543937513753161e-09, + "loss": 0.1672, + "step": 12549 + }, + { + "epoch": 0.9942562883739354, + "grad_norm": 1.5913788771883455, + "learning_rate": 1.7066587433378634e-09, + "loss": 0.1306, + "step": 12550 + }, + { + "epoch": 0.9943355119825708, + "grad_norm": 2.220406713309917, + "learning_rate": 1.659582074807764e-09, + "loss": 0.1297, + "step": 12551 + }, + { + "epoch": 0.9944147355912062, + "grad_norm": 1.7678326596030107, + "learning_rate": 1.6131637488858708e-09, + "loss": 0.1244, + "step": 12552 + }, + { + "epoch": 0.9944939591998415, + "grad_norm": 1.707166138068885, + "learning_rate": 1.5674037686275178e-09, + "loss": 0.1464, + "step": 12553 + }, + { + "epoch": 0.994573182808477, + "grad_norm": 1.4235220696247661, + "learning_rate": 1.5223021370458502e-09, + "loss": 0.1388, + "step": 12554 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 1.2816741157993223, + "learning_rate": 1.4778588571107144e-09, + "loss": 0.0997, + "step": 12555 + }, + { + "epoch": 0.9947316300257477, + "grad_norm": 1.8855796186399163, + "learning_rate": 1.4340739317497688e-09, + "loss": 0.2045, + "step": 12556 + }, + { + "epoch": 0.994810853634383, + "grad_norm": 2.1874284339007777, + "learning_rate": 1.390947363845152e-09, + "loss": 0.2436, + "step": 12557 + }, + { + "epoch": 0.9948900772430184, + "grad_norm": 1.2451765098993242, + "learning_rate": 1.3484791562357048e-09, + "loss": 0.0917, + "step": 12558 + }, + { + "epoch": 0.9949693008516538, + "grad_norm": 2.0896669793671085, + "learning_rate": 1.3066693117191886e-09, + "loss": 0.2073, + "step": 12559 + }, + { + "epoch": 0.9950485244602891, + "grad_norm": 1.3917184927427793, + "learning_rate": 1.2655178330467366e-09, + "loss": 0.1704, + "step": 12560 + }, + { + "epoch": 0.9951277480689246, + "grad_norm": 1.8642197015980146, + "learning_rate": 1.2250247229295132e-09, + "loss": 0.25, + "step": 12561 + }, + { + "epoch": 0.9952069716775599, + "grad_norm": 1.1924418422126835, + "learning_rate": 1.185189984034274e-09, + "loss": 0.1052, + "step": 12562 + }, + { + "epoch": 0.9952861952861953, + "grad_norm": 1.4368852330984734, + "learning_rate": 1.1460136189822556e-09, + "loss": 0.0934, + "step": 12563 + }, + { + "epoch": 0.9953654188948307, + "grad_norm": 1.5812181761570436, + "learning_rate": 1.1074956303536165e-09, + "loss": 0.156, + "step": 12564 + }, + { + "epoch": 0.995444642503466, + "grad_norm": 1.2348527043417064, + "learning_rate": 1.0696360206852162e-09, + "loss": 0.141, + "step": 12565 + }, + { + "epoch": 0.9955238661121014, + "grad_norm": 1.7238606719674299, + "learning_rate": 1.0324347924695055e-09, + "loss": 0.1963, + "step": 12566 + }, + { + "epoch": 0.9956030897207367, + "grad_norm": 1.4707733794748916, + "learning_rate": 9.958919481556362e-10, + "loss": 0.1342, + "step": 12567 + }, + { + "epoch": 0.9956823133293722, + "grad_norm": 1.980208072614021, + "learning_rate": 9.600074901505718e-10, + "loss": 0.1752, + "step": 12568 + }, + { + "epoch": 0.9957615369380075, + "grad_norm": 2.0886227072375516, + "learning_rate": 9.24781420816867e-10, + "loss": 0.2134, + "step": 12569 + }, + { + "epoch": 0.995840760546643, + "grad_norm": 2.049776224692578, + "learning_rate": 8.902137424726675e-10, + "loss": 0.1645, + "step": 12570 + }, + { + "epoch": 0.9959199841552783, + "grad_norm": 1.9966983008614256, + "learning_rate": 8.56304457396151e-10, + "loss": 0.1786, + "step": 12571 + }, + { + "epoch": 0.9959992077639136, + "grad_norm": 1.9810016644480402, + "learning_rate": 8.230535678188656e-10, + "loss": 0.1591, + "step": 12572 + }, + { + "epoch": 0.996078431372549, + "grad_norm": 1.3003271327558548, + "learning_rate": 7.904610759312814e-10, + "loss": 0.0927, + "step": 12573 + }, + { + "epoch": 0.9961576549811844, + "grad_norm": 1.7514914233199208, + "learning_rate": 7.585269838783494e-10, + "loss": 0.2282, + "step": 12574 + }, + { + "epoch": 0.9962368785898198, + "grad_norm": 1.2762052213663577, + "learning_rate": 7.272512937628318e-10, + "loss": 0.1158, + "step": 12575 + }, + { + "epoch": 0.9963161021984551, + "grad_norm": 1.3055602544838694, + "learning_rate": 6.966340076441924e-10, + "loss": 0.1292, + "step": 12576 + }, + { + "epoch": 0.9963953258070906, + "grad_norm": 1.486082995247119, + "learning_rate": 6.666751275385963e-10, + "loss": 0.1282, + "step": 12577 + }, + { + "epoch": 0.9964745494157259, + "grad_norm": 1.9000829152584406, + "learning_rate": 6.3737465542002e-10, + "loss": 0.1893, + "step": 12578 + }, + { + "epoch": 0.9965537730243612, + "grad_norm": 1.6425556885186559, + "learning_rate": 6.087325932147003e-10, + "loss": 0.1668, + "step": 12579 + }, + { + "epoch": 0.9966329966329966, + "grad_norm": 1.7921914899086733, + "learning_rate": 5.807489428111268e-10, + "loss": 0.197, + "step": 12580 + }, + { + "epoch": 0.996712220241632, + "grad_norm": 1.4539885655148488, + "learning_rate": 5.534237060511594e-10, + "loss": 0.0958, + "step": 12581 + }, + { + "epoch": 0.9967914438502674, + "grad_norm": 2.395950895005591, + "learning_rate": 5.267568847344695e-10, + "loss": 0.1577, + "step": 12582 + }, + { + "epoch": 0.9968706674589027, + "grad_norm": 1.7147772142600528, + "learning_rate": 5.007484806152097e-10, + "loss": 0.2076, + "step": 12583 + }, + { + "epoch": 0.9969498910675382, + "grad_norm": 1.5324147197995575, + "learning_rate": 4.753984954086743e-10, + "loss": 0.1557, + "step": 12584 + }, + { + "epoch": 0.9970291146761735, + "grad_norm": 1.5273650241913026, + "learning_rate": 4.5070693078130834e-10, + "loss": 0.0963, + "step": 12585 + }, + { + "epoch": 0.9971083382848088, + "grad_norm": 1.5436915232982433, + "learning_rate": 4.266737883606986e-10, + "loss": 0.1747, + "step": 12586 + }, + { + "epoch": 0.9971875618934443, + "grad_norm": 1.383404640238586, + "learning_rate": 4.0329906972780276e-10, + "loss": 0.1302, + "step": 12587 + }, + { + "epoch": 0.9972667855020796, + "grad_norm": 1.5651438717352404, + "learning_rate": 3.805827764236103e-10, + "loss": 0.1389, + "step": 12588 + }, + { + "epoch": 0.997346009110715, + "grad_norm": 1.400690444792526, + "learning_rate": 3.585249099435917e-10, + "loss": 0.1944, + "step": 12589 + }, + { + "epoch": 0.9974252327193504, + "grad_norm": 1.6045688483513236, + "learning_rate": 3.3712547173769816e-10, + "loss": 0.1523, + "step": 12590 + }, + { + "epoch": 0.9975044563279858, + "grad_norm": 2.0571902335363594, + "learning_rate": 3.163844632181334e-10, + "loss": 0.1549, + "step": 12591 + }, + { + "epoch": 0.9975836799366211, + "grad_norm": 1.648080725395552, + "learning_rate": 2.963018857493616e-10, + "loss": 0.1357, + "step": 12592 + }, + { + "epoch": 0.9976629035452564, + "grad_norm": 1.3643007072673048, + "learning_rate": 2.7687774065254804e-10, + "loss": 0.1056, + "step": 12593 + }, + { + "epoch": 0.9977421271538919, + "grad_norm": 1.4160793206480224, + "learning_rate": 2.581120292077799e-10, + "loss": 0.1221, + "step": 12594 + }, + { + "epoch": 0.9978213507625272, + "grad_norm": 1.6120568264091362, + "learning_rate": 2.400047526518456e-10, + "loss": 0.1564, + "step": 12595 + }, + { + "epoch": 0.9979005743711626, + "grad_norm": 1.8865861598706208, + "learning_rate": 2.2255591217490437e-10, + "loss": 0.1651, + "step": 12596 + }, + { + "epoch": 0.997979797979798, + "grad_norm": 1.5441932077135865, + "learning_rate": 2.057655089271471e-10, + "loss": 0.0956, + "step": 12597 + }, + { + "epoch": 0.9980590215884334, + "grad_norm": 2.3684282200725626, + "learning_rate": 1.8963354401324575e-10, + "loss": 0.2751, + "step": 12598 + }, + { + "epoch": 0.9981382451970687, + "grad_norm": 2.0902260431451407, + "learning_rate": 1.74160018496794e-10, + "loss": 0.1698, + "step": 12599 + }, + { + "epoch": 0.9982174688057041, + "grad_norm": 1.6540591798795525, + "learning_rate": 1.593449333947561e-10, + "loss": 0.1686, + "step": 12600 + }, + { + "epoch": 0.9982966924143395, + "grad_norm": 1.7276707569039078, + "learning_rate": 1.4518828968523857e-10, + "loss": 0.1949, + "step": 12601 + }, + { + "epoch": 0.9983759160229748, + "grad_norm": 1.4032736567744148, + "learning_rate": 1.3169008829749808e-10, + "loss": 0.1346, + "step": 12602 + }, + { + "epoch": 0.9984551396316103, + "grad_norm": 1.5707102377424764, + "learning_rate": 1.1885033012193348e-10, + "loss": 0.1179, + "step": 12603 + }, + { + "epoch": 0.9985343632402456, + "grad_norm": 1.5065612911181991, + "learning_rate": 1.0666901600453473e-10, + "loss": 0.1335, + "step": 12604 + }, + { + "epoch": 0.998613586848881, + "grad_norm": 1.5593091127511316, + "learning_rate": 9.51461467457726e-11, + "loss": 0.145, + "step": 12605 + }, + { + "epoch": 0.9986928104575163, + "grad_norm": 1.467564480822186, + "learning_rate": 8.428172310503968e-11, + "loss": 0.1495, + "step": 12606 + }, + { + "epoch": 0.9987720340661517, + "grad_norm": 1.410824828686225, + "learning_rate": 7.40757457984298e-11, + "loss": 0.116, + "step": 12607 + }, + { + "epoch": 0.9988512576747871, + "grad_norm": 2.2064869273444403, + "learning_rate": 6.452821549651766e-11, + "loss": 0.2888, + "step": 12608 + }, + { + "epoch": 0.9989304812834224, + "grad_norm": 2.261509178509875, + "learning_rate": 5.563913282990996e-11, + "loss": 0.2347, + "step": 12609 + }, + { + "epoch": 0.9990097048920579, + "grad_norm": 2.028624806504608, + "learning_rate": 4.7408498381473765e-11, + "loss": 0.2317, + "step": 12610 + }, + { + "epoch": 0.9990889285006932, + "grad_norm": 1.8873035241611806, + "learning_rate": 3.983631269521837e-11, + "loss": 0.1491, + "step": 12611 + }, + { + "epoch": 0.9991681521093286, + "grad_norm": 1.5129482390371531, + "learning_rate": 3.292257626963391e-11, + "loss": 0.1418, + "step": 12612 + }, + { + "epoch": 0.999247375717964, + "grad_norm": 1.3692280142308895, + "learning_rate": 2.6667289557691378e-11, + "loss": 0.1366, + "step": 12613 + }, + { + "epoch": 0.9993265993265993, + "grad_norm": 1.5306499111713083, + "learning_rate": 2.1070452974614187e-11, + "loss": 0.1265, + "step": 12614 + }, + { + "epoch": 0.9994058229352347, + "grad_norm": 1.9484315086369337, + "learning_rate": 1.6132066886775932e-11, + "loss": 0.1964, + "step": 12615 + }, + { + "epoch": 0.99948504654387, + "grad_norm": 1.104251280544726, + "learning_rate": 1.1852131619471963e-11, + "loss": 0.0761, + "step": 12616 + }, + { + "epoch": 0.9995642701525055, + "grad_norm": 1.8596185762158755, + "learning_rate": 8.230647454698926e-12, + "loss": 0.1732, + "step": 12617 + }, + { + "epoch": 0.9996434937611408, + "grad_norm": 1.907338930572874, + "learning_rate": 5.267614631154772e-12, + "loss": 0.2302, + "step": 12618 + }, + { + "epoch": 0.9997227173697762, + "grad_norm": 1.369808211921963, + "learning_rate": 2.9630333442387525e-12, + "loss": 0.104, + "step": 12619 + }, + { + "epoch": 0.9998019409784116, + "grad_norm": 1.8195352866791545, + "learning_rate": 1.3169037449412004e-12, + "loss": 0.1904, + "step": 12620 + }, + { + "epoch": 0.9998811645870469, + "grad_norm": 1.8033316327087292, + "learning_rate": 3.29225942063971e-13, + "loss": 0.1548, + "step": 12621 + }, + { + "epoch": 0.9999603881956823, + "grad_norm": 1.6205011021565534, + "learning_rate": 0.0, + "loss": 0.181, + "step": 12622 + }, + { + "epoch": 0.9999603881956823, + "step": 12622, + "total_flos": 1046427608678400.0, + "train_loss": 0.28040552918104605, + "train_runtime": 53940.9723, + "train_samples_per_second": 29.952, + "train_steps_per_second": 0.234 + } + ], + "logging_steps": 1.0, + "max_steps": 12622, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1046427608678400.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}