{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.97074312463429, "eval_steps": 500, "global_step": 2130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004681100058513751, "grad_norm": 8.437421798706055, "learning_rate": 4.694835680751174e-07, "loss": 2.8409, "step": 1 }, { "epoch": 0.009362200117027502, "grad_norm": 8.356019973754883, "learning_rate": 9.389671361502348e-07, "loss": 2.8116, "step": 2 }, { "epoch": 0.014043300175541252, "grad_norm": 8.29000473022461, "learning_rate": 1.4084507042253521e-06, "loss": 2.8066, "step": 3 }, { "epoch": 0.018724400234055003, "grad_norm": 8.24352741241455, "learning_rate": 1.8779342723004696e-06, "loss": 2.8373, "step": 4 }, { "epoch": 0.023405500292568753, "grad_norm": 7.985064506530762, "learning_rate": 2.3474178403755868e-06, "loss": 2.8102, "step": 5 }, { "epoch": 0.028086600351082503, "grad_norm": 7.659386157989502, "learning_rate": 2.8169014084507042e-06, "loss": 2.7384, "step": 6 }, { "epoch": 0.032767700409596257, "grad_norm": 7.230035781860352, "learning_rate": 3.286384976525822e-06, "loss": 2.7179, "step": 7 }, { "epoch": 0.037448800468110006, "grad_norm": 6.8731160163879395, "learning_rate": 3.755868544600939e-06, "loss": 2.6758, "step": 8 }, { "epoch": 0.042129900526623756, "grad_norm": 6.585916519165039, "learning_rate": 4.225352112676056e-06, "loss": 2.6716, "step": 9 }, { "epoch": 0.046811000585137506, "grad_norm": 5.923017501831055, "learning_rate": 4.6948356807511736e-06, "loss": 2.5922, "step": 10 }, { "epoch": 0.051492100643651256, "grad_norm": 5.472363471984863, "learning_rate": 5.164319248826292e-06, "loss": 2.5256, "step": 11 }, { "epoch": 0.056173200702165006, "grad_norm": 5.083034515380859, "learning_rate": 5.6338028169014084e-06, "loss": 2.5569, "step": 12 }, { "epoch": 0.060854300760678756, "grad_norm": 4.812219142913818, "learning_rate": 6.103286384976526e-06, "loss": 2.5469, "step": 13 }, { "epoch": 0.06553540081919251, "grad_norm": 4.375195503234863, "learning_rate": 6.572769953051644e-06, "loss": 2.503, "step": 14 }, { "epoch": 0.07021650087770626, "grad_norm": 3.939549684524536, "learning_rate": 7.042253521126762e-06, "loss": 2.4302, "step": 15 }, { "epoch": 0.07489760093622001, "grad_norm": 3.4781973361968994, "learning_rate": 7.511737089201878e-06, "loss": 2.4014, "step": 16 }, { "epoch": 0.07957870099473376, "grad_norm": 3.1489295959472656, "learning_rate": 7.981220657276996e-06, "loss": 2.3437, "step": 17 }, { "epoch": 0.08425980105324751, "grad_norm": 2.8369641304016113, "learning_rate": 8.450704225352112e-06, "loss": 2.2593, "step": 18 }, { "epoch": 0.08894090111176127, "grad_norm": 2.472102403640747, "learning_rate": 8.92018779342723e-06, "loss": 2.2874, "step": 19 }, { "epoch": 0.09362200117027501, "grad_norm": 2.0766918659210205, "learning_rate": 9.389671361502347e-06, "loss": 2.2487, "step": 20 }, { "epoch": 0.09830310122878877, "grad_norm": 1.7876816987991333, "learning_rate": 9.859154929577465e-06, "loss": 2.2559, "step": 21 }, { "epoch": 0.10298420128730251, "grad_norm": 1.6245261430740356, "learning_rate": 1.0328638497652584e-05, "loss": 2.2229, "step": 22 }, { "epoch": 0.10766530134581627, "grad_norm": 1.4229193925857544, "learning_rate": 1.0798122065727701e-05, "loss": 2.2001, "step": 23 }, { "epoch": 0.11234640140433001, "grad_norm": 1.2889472246170044, "learning_rate": 1.1267605633802817e-05, "loss": 2.1775, "step": 24 }, { "epoch": 0.11702750146284377, "grad_norm": 1.1388767957687378, "learning_rate": 1.1737089201877934e-05, "loss": 2.1486, "step": 25 }, { "epoch": 0.12170860152135751, "grad_norm": 1.0823746919631958, "learning_rate": 1.2206572769953052e-05, "loss": 2.1248, "step": 26 }, { "epoch": 0.12638970157987128, "grad_norm": 1.0433950424194336, "learning_rate": 1.267605633802817e-05, "loss": 2.1398, "step": 27 }, { "epoch": 0.13107080163838503, "grad_norm": 1.045602798461914, "learning_rate": 1.3145539906103288e-05, "loss": 2.1176, "step": 28 }, { "epoch": 0.13575190169689877, "grad_norm": 0.9916484951972961, "learning_rate": 1.3615023474178404e-05, "loss": 2.095, "step": 29 }, { "epoch": 0.1404330017554125, "grad_norm": 0.9007664918899536, "learning_rate": 1.4084507042253523e-05, "loss": 2.1067, "step": 30 }, { "epoch": 0.14511410181392628, "grad_norm": 0.8627976775169373, "learning_rate": 1.4553990610328639e-05, "loss": 2.1083, "step": 31 }, { "epoch": 0.14979520187244003, "grad_norm": 0.8264485001564026, "learning_rate": 1.5023474178403756e-05, "loss": 2.0686, "step": 32 }, { "epoch": 0.15447630193095377, "grad_norm": 0.8104755282402039, "learning_rate": 1.5492957746478872e-05, "loss": 2.0687, "step": 33 }, { "epoch": 0.1591574019894675, "grad_norm": 0.7617929577827454, "learning_rate": 1.5962441314553993e-05, "loss": 2.019, "step": 34 }, { "epoch": 0.16383850204798128, "grad_norm": 0.7188560366630554, "learning_rate": 1.643192488262911e-05, "loss": 2.0473, "step": 35 }, { "epoch": 0.16851960210649503, "grad_norm": 0.7130621075630188, "learning_rate": 1.6901408450704224e-05, "loss": 2.0238, "step": 36 }, { "epoch": 0.17320070216500877, "grad_norm": 0.6950539350509644, "learning_rate": 1.7370892018779345e-05, "loss": 2.028, "step": 37 }, { "epoch": 0.17788180222352254, "grad_norm": 0.7059375643730164, "learning_rate": 1.784037558685446e-05, "loss": 1.9914, "step": 38 }, { "epoch": 0.18256290228203628, "grad_norm": 0.6565279960632324, "learning_rate": 1.830985915492958e-05, "loss": 1.9853, "step": 39 }, { "epoch": 0.18724400234055003, "grad_norm": 0.6930928826332092, "learning_rate": 1.8779342723004694e-05, "loss": 1.9887, "step": 40 }, { "epoch": 0.19192510239906377, "grad_norm": 0.6585078835487366, "learning_rate": 1.9248826291079812e-05, "loss": 1.9895, "step": 41 }, { "epoch": 0.19660620245757754, "grad_norm": 0.6370649933815002, "learning_rate": 1.971830985915493e-05, "loss": 1.9903, "step": 42 }, { "epoch": 0.20128730251609128, "grad_norm": 0.6276764273643494, "learning_rate": 2.0187793427230047e-05, "loss": 1.9972, "step": 43 }, { "epoch": 0.20596840257460503, "grad_norm": 0.615270733833313, "learning_rate": 2.0657276995305167e-05, "loss": 1.9474, "step": 44 }, { "epoch": 0.2106495026331188, "grad_norm": 0.62484210729599, "learning_rate": 2.112676056338028e-05, "loss": 1.9598, "step": 45 }, { "epoch": 0.21533060269163254, "grad_norm": 0.5930389165878296, "learning_rate": 2.1596244131455402e-05, "loss": 1.9726, "step": 46 }, { "epoch": 0.22001170275014628, "grad_norm": 0.6064797043800354, "learning_rate": 2.2065727699530516e-05, "loss": 1.9479, "step": 47 }, { "epoch": 0.22469280280866002, "grad_norm": 0.5708951950073242, "learning_rate": 2.2535211267605634e-05, "loss": 1.9425, "step": 48 }, { "epoch": 0.2293739028671738, "grad_norm": 0.5736585855484009, "learning_rate": 2.300469483568075e-05, "loss": 1.962, "step": 49 }, { "epoch": 0.23405500292568754, "grad_norm": 0.5813770890235901, "learning_rate": 2.347417840375587e-05, "loss": 1.9599, "step": 50 }, { "epoch": 0.23873610298420128, "grad_norm": 0.5862187147140503, "learning_rate": 2.3943661971830986e-05, "loss": 1.9415, "step": 51 }, { "epoch": 0.24341720304271502, "grad_norm": 0.5472211241722107, "learning_rate": 2.4413145539906104e-05, "loss": 1.9185, "step": 52 }, { "epoch": 0.2480983031012288, "grad_norm": 0.5598443150520325, "learning_rate": 2.4882629107981224e-05, "loss": 1.9337, "step": 53 }, { "epoch": 0.25277940315974257, "grad_norm": 0.5730856657028198, "learning_rate": 2.535211267605634e-05, "loss": 1.9126, "step": 54 }, { "epoch": 0.2574605032182563, "grad_norm": 0.5666413307189941, "learning_rate": 2.5821596244131456e-05, "loss": 1.9412, "step": 55 }, { "epoch": 0.26214160327677005, "grad_norm": 0.5389975905418396, "learning_rate": 2.6291079812206577e-05, "loss": 1.9045, "step": 56 }, { "epoch": 0.2668227033352838, "grad_norm": 0.5219833850860596, "learning_rate": 2.676056338028169e-05, "loss": 1.8992, "step": 57 }, { "epoch": 0.27150380339379754, "grad_norm": 0.5336855053901672, "learning_rate": 2.7230046948356808e-05, "loss": 1.8941, "step": 58 }, { "epoch": 0.2761849034523113, "grad_norm": 0.5400417447090149, "learning_rate": 2.7699530516431926e-05, "loss": 1.885, "step": 59 }, { "epoch": 0.280866003510825, "grad_norm": 0.5488688945770264, "learning_rate": 2.8169014084507046e-05, "loss": 1.9013, "step": 60 }, { "epoch": 0.28554710356933877, "grad_norm": 0.5290355086326599, "learning_rate": 2.8638497652582164e-05, "loss": 1.889, "step": 61 }, { "epoch": 0.29022820362785257, "grad_norm": 0.5441176295280457, "learning_rate": 2.9107981220657278e-05, "loss": 1.895, "step": 62 }, { "epoch": 0.2949093036863663, "grad_norm": 0.5422474145889282, "learning_rate": 2.9577464788732395e-05, "loss": 1.9095, "step": 63 }, { "epoch": 0.29959040374488005, "grad_norm": 0.526756763458252, "learning_rate": 3.0046948356807513e-05, "loss": 1.8849, "step": 64 }, { "epoch": 0.3042715038033938, "grad_norm": 0.5253100395202637, "learning_rate": 3.0516431924882634e-05, "loss": 1.8704, "step": 65 }, { "epoch": 0.30895260386190754, "grad_norm": 0.5327618718147278, "learning_rate": 3.0985915492957744e-05, "loss": 1.8598, "step": 66 }, { "epoch": 0.3136337039204213, "grad_norm": 0.545571506023407, "learning_rate": 3.1455399061032865e-05, "loss": 1.8789, "step": 67 }, { "epoch": 0.318314803978935, "grad_norm": 0.5246133804321289, "learning_rate": 3.1924882629107986e-05, "loss": 1.8815, "step": 68 }, { "epoch": 0.3229959040374488, "grad_norm": 0.5434203147888184, "learning_rate": 3.23943661971831e-05, "loss": 1.8822, "step": 69 }, { "epoch": 0.32767700409596257, "grad_norm": 0.5504802465438843, "learning_rate": 3.286384976525822e-05, "loss": 1.8565, "step": 70 }, { "epoch": 0.3323581041544763, "grad_norm": 0.5427384376525879, "learning_rate": 3.3333333333333335e-05, "loss": 1.8748, "step": 71 }, { "epoch": 0.33703920421299005, "grad_norm": 0.538313090801239, "learning_rate": 3.380281690140845e-05, "loss": 1.8338, "step": 72 }, { "epoch": 0.3417203042715038, "grad_norm": 0.524651050567627, "learning_rate": 3.427230046948357e-05, "loss": 1.8492, "step": 73 }, { "epoch": 0.34640140433001754, "grad_norm": 0.5431910157203674, "learning_rate": 3.474178403755869e-05, "loss": 1.8543, "step": 74 }, { "epoch": 0.3510825043885313, "grad_norm": 0.5315969586372375, "learning_rate": 3.5211267605633805e-05, "loss": 1.8377, "step": 75 }, { "epoch": 0.3557636044470451, "grad_norm": 0.5872920751571655, "learning_rate": 3.568075117370892e-05, "loss": 1.8463, "step": 76 }, { "epoch": 0.3604447045055588, "grad_norm": 0.5032703280448914, "learning_rate": 3.615023474178404e-05, "loss": 1.8352, "step": 77 }, { "epoch": 0.36512580456407256, "grad_norm": 0.5578582882881165, "learning_rate": 3.661971830985916e-05, "loss": 1.8259, "step": 78 }, { "epoch": 0.3698069046225863, "grad_norm": 0.5350944995880127, "learning_rate": 3.7089201877934274e-05, "loss": 1.8459, "step": 79 }, { "epoch": 0.37448800468110005, "grad_norm": 0.5319028496742249, "learning_rate": 3.755868544600939e-05, "loss": 1.841, "step": 80 }, { "epoch": 0.3791691047396138, "grad_norm": 0.5441652536392212, "learning_rate": 3.802816901408451e-05, "loss": 1.8342, "step": 81 }, { "epoch": 0.38385020479812754, "grad_norm": 0.5238386988639832, "learning_rate": 3.8497652582159623e-05, "loss": 1.8338, "step": 82 }, { "epoch": 0.38853130485664134, "grad_norm": 0.5281409025192261, "learning_rate": 3.8967136150234744e-05, "loss": 1.8282, "step": 83 }, { "epoch": 0.3932124049151551, "grad_norm": 0.5238882899284363, "learning_rate": 3.943661971830986e-05, "loss": 1.843, "step": 84 }, { "epoch": 0.3978935049736688, "grad_norm": 0.5391219854354858, "learning_rate": 3.990610328638498e-05, "loss": 1.8056, "step": 85 }, { "epoch": 0.40257460503218256, "grad_norm": 0.5335526466369629, "learning_rate": 4.037558685446009e-05, "loss": 1.8369, "step": 86 }, { "epoch": 0.4072557050906963, "grad_norm": 0.5062191486358643, "learning_rate": 4.0845070422535214e-05, "loss": 1.7941, "step": 87 }, { "epoch": 0.41193680514921005, "grad_norm": 0.521977424621582, "learning_rate": 4.1314553990610335e-05, "loss": 1.8227, "step": 88 }, { "epoch": 0.4166179052077238, "grad_norm": 0.5376787185668945, "learning_rate": 4.178403755868545e-05, "loss": 1.817, "step": 89 }, { "epoch": 0.4212990052662376, "grad_norm": 0.5220103859901428, "learning_rate": 4.225352112676056e-05, "loss": 1.7939, "step": 90 }, { "epoch": 0.42598010532475133, "grad_norm": 0.5247204303741455, "learning_rate": 4.2723004694835684e-05, "loss": 1.8197, "step": 91 }, { "epoch": 0.4306612053832651, "grad_norm": 0.5285261273384094, "learning_rate": 4.3192488262910805e-05, "loss": 1.8184, "step": 92 }, { "epoch": 0.4353423054417788, "grad_norm": 0.532840371131897, "learning_rate": 4.366197183098591e-05, "loss": 1.8032, "step": 93 }, { "epoch": 0.44002340550029256, "grad_norm": 0.5351529121398926, "learning_rate": 4.413145539906103e-05, "loss": 1.8146, "step": 94 }, { "epoch": 0.4447045055588063, "grad_norm": 0.5198392868041992, "learning_rate": 4.4600938967136154e-05, "loss": 1.7852, "step": 95 }, { "epoch": 0.44938560561732005, "grad_norm": 0.5182085633277893, "learning_rate": 4.507042253521127e-05, "loss": 1.7841, "step": 96 }, { "epoch": 0.45406670567583385, "grad_norm": 0.5423537492752075, "learning_rate": 4.553990610328639e-05, "loss": 1.7969, "step": 97 }, { "epoch": 0.4587478057343476, "grad_norm": 0.5703156590461731, "learning_rate": 4.60093896713615e-05, "loss": 1.7823, "step": 98 }, { "epoch": 0.46342890579286133, "grad_norm": 0.5168886780738831, "learning_rate": 4.647887323943662e-05, "loss": 1.789, "step": 99 }, { "epoch": 0.4681100058513751, "grad_norm": 0.5176598429679871, "learning_rate": 4.694835680751174e-05, "loss": 1.7888, "step": 100 }, { "epoch": 0.4727911059098888, "grad_norm": 0.535539448261261, "learning_rate": 4.741784037558686e-05, "loss": 1.7912, "step": 101 }, { "epoch": 0.47747220596840256, "grad_norm": 0.5312119722366333, "learning_rate": 4.788732394366197e-05, "loss": 1.8038, "step": 102 }, { "epoch": 0.4821533060269163, "grad_norm": 0.5023992657661438, "learning_rate": 4.835680751173709e-05, "loss": 1.7891, "step": 103 }, { "epoch": 0.48683440608543005, "grad_norm": 0.5566378831863403, "learning_rate": 4.882629107981221e-05, "loss": 1.7947, "step": 104 }, { "epoch": 0.49151550614394385, "grad_norm": 0.5359525680541992, "learning_rate": 4.929577464788733e-05, "loss": 1.7786, "step": 105 }, { "epoch": 0.4961966062024576, "grad_norm": 0.535662829875946, "learning_rate": 4.976525821596245e-05, "loss": 1.7852, "step": 106 }, { "epoch": 0.5008777062609713, "grad_norm": 0.5481117367744446, "learning_rate": 5.023474178403756e-05, "loss": 1.7843, "step": 107 }, { "epoch": 0.5055588063194851, "grad_norm": 0.5280411243438721, "learning_rate": 5.070422535211268e-05, "loss": 1.7946, "step": 108 }, { "epoch": 0.5102399063779989, "grad_norm": 0.5515773296356201, "learning_rate": 5.117370892018779e-05, "loss": 1.7706, "step": 109 }, { "epoch": 0.5149210064365126, "grad_norm": 0.5569891929626465, "learning_rate": 5.164319248826291e-05, "loss": 1.755, "step": 110 }, { "epoch": 0.5196021064950264, "grad_norm": 0.5135573744773865, "learning_rate": 5.2112676056338026e-05, "loss": 1.7694, "step": 111 }, { "epoch": 0.5242832065535401, "grad_norm": 0.5315744876861572, "learning_rate": 5.2582159624413153e-05, "loss": 1.7636, "step": 112 }, { "epoch": 0.5289643066120538, "grad_norm": 0.5318157076835632, "learning_rate": 5.305164319248827e-05, "loss": 1.757, "step": 113 }, { "epoch": 0.5336454066705676, "grad_norm": 0.562716007232666, "learning_rate": 5.352112676056338e-05, "loss": 1.7723, "step": 114 }, { "epoch": 0.5383265067290813, "grad_norm": 0.5341477394104004, "learning_rate": 5.39906103286385e-05, "loss": 1.7609, "step": 115 }, { "epoch": 0.5430076067875951, "grad_norm": 0.5458403825759888, "learning_rate": 5.4460093896713616e-05, "loss": 1.7656, "step": 116 }, { "epoch": 0.5476887068461088, "grad_norm": 0.5436280965805054, "learning_rate": 5.492957746478874e-05, "loss": 1.7611, "step": 117 }, { "epoch": 0.5523698069046226, "grad_norm": 0.5393151044845581, "learning_rate": 5.539906103286385e-05, "loss": 1.7364, "step": 118 }, { "epoch": 0.5570509069631363, "grad_norm": 0.5378183126449585, "learning_rate": 5.5868544600938965e-05, "loss": 1.7419, "step": 119 }, { "epoch": 0.56173200702165, "grad_norm": 0.5988975763320923, "learning_rate": 5.633802816901409e-05, "loss": 1.7456, "step": 120 }, { "epoch": 0.5664131070801638, "grad_norm": 0.5297287702560425, "learning_rate": 5.68075117370892e-05, "loss": 1.7475, "step": 121 }, { "epoch": 0.5710942071386775, "grad_norm": 0.5265582203865051, "learning_rate": 5.727699530516433e-05, "loss": 1.7479, "step": 122 }, { "epoch": 0.5757753071971914, "grad_norm": 0.5375944972038269, "learning_rate": 5.774647887323944e-05, "loss": 1.7542, "step": 123 }, { "epoch": 0.5804564072557051, "grad_norm": 0.5334234237670898, "learning_rate": 5.8215962441314556e-05, "loss": 1.7561, "step": 124 }, { "epoch": 0.5851375073142189, "grad_norm": 0.5718224048614502, "learning_rate": 5.868544600938968e-05, "loss": 1.7557, "step": 125 }, { "epoch": 0.5898186073727326, "grad_norm": 0.5835574865341187, "learning_rate": 5.915492957746479e-05, "loss": 1.7572, "step": 126 }, { "epoch": 0.5944997074312464, "grad_norm": 0.53130042552948, "learning_rate": 5.9624413145539905e-05, "loss": 1.748, "step": 127 }, { "epoch": 0.5991808074897601, "grad_norm": 0.5725318789482117, "learning_rate": 6.0093896713615026e-05, "loss": 1.73, "step": 128 }, { "epoch": 0.6038619075482738, "grad_norm": 0.5428541898727417, "learning_rate": 6.056338028169014e-05, "loss": 1.7334, "step": 129 }, { "epoch": 0.6085430076067876, "grad_norm": 0.5751455426216125, "learning_rate": 6.103286384976527e-05, "loss": 1.7184, "step": 130 }, { "epoch": 0.6132241076653013, "grad_norm": 0.5367549061775208, "learning_rate": 6.150234741784038e-05, "loss": 1.7289, "step": 131 }, { "epoch": 0.6179052077238151, "grad_norm": 0.6348855495452881, "learning_rate": 6.197183098591549e-05, "loss": 1.7421, "step": 132 }, { "epoch": 0.6225863077823288, "grad_norm": 0.5319597721099854, "learning_rate": 6.244131455399061e-05, "loss": 1.7477, "step": 133 }, { "epoch": 0.6272674078408426, "grad_norm": 0.6505827307701111, "learning_rate": 6.291079812206573e-05, "loss": 1.7379, "step": 134 }, { "epoch": 0.6319485078993563, "grad_norm": 0.552060067653656, "learning_rate": 6.338028169014085e-05, "loss": 1.7272, "step": 135 }, { "epoch": 0.63662960795787, "grad_norm": 0.5982983708381653, "learning_rate": 6.384976525821597e-05, "loss": 1.7272, "step": 136 }, { "epoch": 0.6413107080163839, "grad_norm": 0.5840004682540894, "learning_rate": 6.431924882629108e-05, "loss": 1.748, "step": 137 }, { "epoch": 0.6459918080748976, "grad_norm": 0.5581035017967224, "learning_rate": 6.47887323943662e-05, "loss": 1.6975, "step": 138 }, { "epoch": 0.6506729081334114, "grad_norm": 0.5873554348945618, "learning_rate": 6.525821596244132e-05, "loss": 1.7248, "step": 139 }, { "epoch": 0.6553540081919251, "grad_norm": 0.5404384732246399, "learning_rate": 6.572769953051644e-05, "loss": 1.7105, "step": 140 }, { "epoch": 0.6600351082504389, "grad_norm": 0.5836492776870728, "learning_rate": 6.619718309859155e-05, "loss": 1.7131, "step": 141 }, { "epoch": 0.6647162083089526, "grad_norm": 0.556516170501709, "learning_rate": 6.666666666666667e-05, "loss": 1.7321, "step": 142 }, { "epoch": 0.6693973083674664, "grad_norm": 0.6061303019523621, "learning_rate": 6.713615023474179e-05, "loss": 1.7291, "step": 143 }, { "epoch": 0.6740784084259801, "grad_norm": 0.5574740171432495, "learning_rate": 6.76056338028169e-05, "loss": 1.7256, "step": 144 }, { "epoch": 0.6787595084844938, "grad_norm": 0.5718780159950256, "learning_rate": 6.807511737089203e-05, "loss": 1.7138, "step": 145 }, { "epoch": 0.6834406085430076, "grad_norm": 0.5740944743156433, "learning_rate": 6.854460093896714e-05, "loss": 1.7277, "step": 146 }, { "epoch": 0.6881217086015213, "grad_norm": 0.5966343283653259, "learning_rate": 6.901408450704226e-05, "loss": 1.6959, "step": 147 }, { "epoch": 0.6928028086600351, "grad_norm": 0.5656957626342773, "learning_rate": 6.948356807511738e-05, "loss": 1.7184, "step": 148 }, { "epoch": 0.6974839087185488, "grad_norm": 0.6364338994026184, "learning_rate": 6.995305164319249e-05, "loss": 1.7022, "step": 149 }, { "epoch": 0.7021650087770626, "grad_norm": 0.5378186702728271, "learning_rate": 7.042253521126761e-05, "loss": 1.731, "step": 150 }, { "epoch": 0.7068461088355764, "grad_norm": 0.5798015594482422, "learning_rate": 7.089201877934273e-05, "loss": 1.6997, "step": 151 }, { "epoch": 0.7115272088940902, "grad_norm": 0.5603360533714294, "learning_rate": 7.136150234741784e-05, "loss": 1.7121, "step": 152 }, { "epoch": 0.7162083089526039, "grad_norm": 0.5362142324447632, "learning_rate": 7.183098591549297e-05, "loss": 1.6943, "step": 153 }, { "epoch": 0.7208894090111176, "grad_norm": 0.6047829985618591, "learning_rate": 7.230046948356808e-05, "loss": 1.7229, "step": 154 }, { "epoch": 0.7255705090696314, "grad_norm": 0.5463905930519104, "learning_rate": 7.276995305164319e-05, "loss": 1.7021, "step": 155 }, { "epoch": 0.7302516091281451, "grad_norm": 0.5977831482887268, "learning_rate": 7.323943661971832e-05, "loss": 1.7043, "step": 156 }, { "epoch": 0.7349327091866589, "grad_norm": 0.5732491612434387, "learning_rate": 7.370892018779343e-05, "loss": 1.7098, "step": 157 }, { "epoch": 0.7396138092451726, "grad_norm": 0.5583239197731018, "learning_rate": 7.417840375586855e-05, "loss": 1.669, "step": 158 }, { "epoch": 0.7442949093036864, "grad_norm": 0.6422227621078491, "learning_rate": 7.464788732394367e-05, "loss": 1.6873, "step": 159 }, { "epoch": 0.7489760093622001, "grad_norm": 0.5348430275917053, "learning_rate": 7.511737089201878e-05, "loss": 1.6982, "step": 160 }, { "epoch": 0.7536571094207138, "grad_norm": 0.6607827544212341, "learning_rate": 7.55868544600939e-05, "loss": 1.708, "step": 161 }, { "epoch": 0.7583382094792276, "grad_norm": 0.5770392417907715, "learning_rate": 7.605633802816902e-05, "loss": 1.7188, "step": 162 }, { "epoch": 0.7630193095377413, "grad_norm": 0.5945390462875366, "learning_rate": 7.652582159624414e-05, "loss": 1.7049, "step": 163 }, { "epoch": 0.7677004095962551, "grad_norm": 0.6323566436767578, "learning_rate": 7.699530516431925e-05, "loss": 1.6993, "step": 164 }, { "epoch": 0.7723815096547688, "grad_norm": 0.5178484320640564, "learning_rate": 7.746478873239437e-05, "loss": 1.6599, "step": 165 }, { "epoch": 0.7770626097132827, "grad_norm": 0.6181017160415649, "learning_rate": 7.793427230046949e-05, "loss": 1.717, "step": 166 }, { "epoch": 0.7817437097717964, "grad_norm": 0.5473299026489258, "learning_rate": 7.840375586854461e-05, "loss": 1.6855, "step": 167 }, { "epoch": 0.7864248098303102, "grad_norm": 0.5608950853347778, "learning_rate": 7.887323943661972e-05, "loss": 1.6793, "step": 168 }, { "epoch": 0.7911059098888239, "grad_norm": 0.5713764429092407, "learning_rate": 7.934272300469484e-05, "loss": 1.6982, "step": 169 }, { "epoch": 0.7957870099473376, "grad_norm": 0.5279483199119568, "learning_rate": 7.981220657276996e-05, "loss": 1.7026, "step": 170 }, { "epoch": 0.8004681100058514, "grad_norm": 0.579554557800293, "learning_rate": 8.028169014084508e-05, "loss": 1.7107, "step": 171 }, { "epoch": 0.8051492100643651, "grad_norm": 0.5823881030082703, "learning_rate": 8.075117370892019e-05, "loss": 1.6994, "step": 172 }, { "epoch": 0.8098303101228789, "grad_norm": 0.5477363467216492, "learning_rate": 8.122065727699531e-05, "loss": 1.7047, "step": 173 }, { "epoch": 0.8145114101813926, "grad_norm": 0.5420286655426025, "learning_rate": 8.169014084507043e-05, "loss": 1.6781, "step": 174 }, { "epoch": 0.8191925102399064, "grad_norm": 0.558557391166687, "learning_rate": 8.215962441314554e-05, "loss": 1.6771, "step": 175 }, { "epoch": 0.8238736102984201, "grad_norm": 0.5379720330238342, "learning_rate": 8.262910798122067e-05, "loss": 1.6797, "step": 176 }, { "epoch": 0.8285547103569338, "grad_norm": 0.5545073747634888, "learning_rate": 8.309859154929578e-05, "loss": 1.6675, "step": 177 }, { "epoch": 0.8332358104154476, "grad_norm": 0.5789998769760132, "learning_rate": 8.35680751173709e-05, "loss": 1.6683, "step": 178 }, { "epoch": 0.8379169104739613, "grad_norm": 0.6201234459877014, "learning_rate": 8.403755868544602e-05, "loss": 1.6806, "step": 179 }, { "epoch": 0.8425980105324752, "grad_norm": 0.5758758783340454, "learning_rate": 8.450704225352113e-05, "loss": 1.6456, "step": 180 }, { "epoch": 0.8472791105909889, "grad_norm": 0.5807463526725769, "learning_rate": 8.497652582159625e-05, "loss": 1.6479, "step": 181 }, { "epoch": 0.8519602106495027, "grad_norm": 0.6110079884529114, "learning_rate": 8.544600938967137e-05, "loss": 1.6858, "step": 182 }, { "epoch": 0.8566413107080164, "grad_norm": 0.566091001033783, "learning_rate": 8.591549295774647e-05, "loss": 1.6743, "step": 183 }, { "epoch": 0.8613224107665302, "grad_norm": 0.6173126101493835, "learning_rate": 8.638497652582161e-05, "loss": 1.6819, "step": 184 }, { "epoch": 0.8660035108250439, "grad_norm": 0.5910197496414185, "learning_rate": 8.685446009389672e-05, "loss": 1.6901, "step": 185 }, { "epoch": 0.8706846108835576, "grad_norm": 0.6235933899879456, "learning_rate": 8.732394366197182e-05, "loss": 1.6558, "step": 186 }, { "epoch": 0.8753657109420714, "grad_norm": 0.6407201290130615, "learning_rate": 8.779342723004696e-05, "loss": 1.6538, "step": 187 }, { "epoch": 0.8800468110005851, "grad_norm": 0.5760080218315125, "learning_rate": 8.826291079812207e-05, "loss": 1.6657, "step": 188 }, { "epoch": 0.8847279110590989, "grad_norm": 0.5801392793655396, "learning_rate": 8.873239436619719e-05, "loss": 1.6783, "step": 189 }, { "epoch": 0.8894090111176126, "grad_norm": 0.6207360625267029, "learning_rate": 8.920187793427231e-05, "loss": 1.6722, "step": 190 }, { "epoch": 0.8940901111761264, "grad_norm": 0.5441955327987671, "learning_rate": 8.967136150234741e-05, "loss": 1.6895, "step": 191 }, { "epoch": 0.8987712112346401, "grad_norm": 0.6543055772781372, "learning_rate": 9.014084507042254e-05, "loss": 1.655, "step": 192 }, { "epoch": 0.9034523112931538, "grad_norm": 0.6866628527641296, "learning_rate": 9.061032863849766e-05, "loss": 1.6692, "step": 193 }, { "epoch": 0.9081334113516677, "grad_norm": 0.5562683939933777, "learning_rate": 9.107981220657278e-05, "loss": 1.6617, "step": 194 }, { "epoch": 0.9128145114101814, "grad_norm": 0.6460333466529846, "learning_rate": 9.15492957746479e-05, "loss": 1.6455, "step": 195 }, { "epoch": 0.9174956114686952, "grad_norm": 0.6548853516578674, "learning_rate": 9.2018779342723e-05, "loss": 1.6894, "step": 196 }, { "epoch": 0.9221767115272089, "grad_norm": 0.5706417560577393, "learning_rate": 9.248826291079813e-05, "loss": 1.6521, "step": 197 }, { "epoch": 0.9268578115857227, "grad_norm": 0.6393511891365051, "learning_rate": 9.295774647887325e-05, "loss": 1.6604, "step": 198 }, { "epoch": 0.9315389116442364, "grad_norm": 0.598738968372345, "learning_rate": 9.342723004694837e-05, "loss": 1.6479, "step": 199 }, { "epoch": 0.9362200117027502, "grad_norm": 0.5855012536048889, "learning_rate": 9.389671361502347e-05, "loss": 1.6348, "step": 200 }, { "epoch": 0.9409011117612639, "grad_norm": 0.5486910343170166, "learning_rate": 9.43661971830986e-05, "loss": 1.6532, "step": 201 }, { "epoch": 0.9455822118197776, "grad_norm": 0.5791751146316528, "learning_rate": 9.483568075117372e-05, "loss": 1.6438, "step": 202 }, { "epoch": 0.9502633118782914, "grad_norm": 0.6078116297721863, "learning_rate": 9.530516431924882e-05, "loss": 1.6477, "step": 203 }, { "epoch": 0.9549444119368051, "grad_norm": 0.5766721367835999, "learning_rate": 9.577464788732394e-05, "loss": 1.6264, "step": 204 }, { "epoch": 0.9596255119953189, "grad_norm": 0.5816053152084351, "learning_rate": 9.624413145539907e-05, "loss": 1.6483, "step": 205 }, { "epoch": 0.9643066120538326, "grad_norm": 0.5633952021598816, "learning_rate": 9.671361502347419e-05, "loss": 1.6346, "step": 206 }, { "epoch": 0.9689877121123464, "grad_norm": 0.592918872833252, "learning_rate": 9.718309859154931e-05, "loss": 1.6453, "step": 207 }, { "epoch": 0.9736688121708601, "grad_norm": 0.5661919116973877, "learning_rate": 9.765258215962441e-05, "loss": 1.6475, "step": 208 }, { "epoch": 0.978349912229374, "grad_norm": 0.6112465858459473, "learning_rate": 9.812206572769954e-05, "loss": 1.6551, "step": 209 }, { "epoch": 0.9830310122878877, "grad_norm": 0.5819314122200012, "learning_rate": 9.859154929577466e-05, "loss": 1.6491, "step": 210 }, { "epoch": 0.9877121123464014, "grad_norm": 0.5747060179710388, "learning_rate": 9.906103286384976e-05, "loss": 1.6388, "step": 211 }, { "epoch": 0.9923932124049152, "grad_norm": 0.5805224180221558, "learning_rate": 9.95305164319249e-05, "loss": 1.6417, "step": 212 }, { "epoch": 0.9970743124634289, "grad_norm": 0.5614432692527771, "learning_rate": 0.0001, "loss": 1.6335, "step": 213 }, { "epoch": 1.0017554125219426, "grad_norm": 2.6474854946136475, "learning_rate": 9.999993285782108e-05, "loss": 1.83, "step": 214 }, { "epoch": 1.0064365125804564, "grad_norm": 0.76060950756073, "learning_rate": 9.999973143146459e-05, "loss": 1.6414, "step": 215 }, { "epoch": 1.0111176126389703, "grad_norm": 0.6044154763221741, "learning_rate": 9.999939572147152e-05, "loss": 1.6377, "step": 216 }, { "epoch": 1.015798712697484, "grad_norm": 0.7118411064147949, "learning_rate": 9.99989257287435e-05, "loss": 1.6528, "step": 217 }, { "epoch": 1.0204798127559978, "grad_norm": 0.5965666174888611, "learning_rate": 9.999832145454275e-05, "loss": 1.6573, "step": 218 }, { "epoch": 1.0251609128145114, "grad_norm": 0.6278660297393799, "learning_rate": 9.99975829004922e-05, "loss": 1.6336, "step": 219 }, { "epoch": 1.0298420128730252, "grad_norm": 0.6511713266372681, "learning_rate": 9.999671006857532e-05, "loss": 1.6196, "step": 220 }, { "epoch": 1.0345231129315389, "grad_norm": 0.6470988392829895, "learning_rate": 9.999570296113632e-05, "loss": 1.6437, "step": 221 }, { "epoch": 1.0392042129900527, "grad_norm": 0.5942560434341431, "learning_rate": 9.999456158087995e-05, "loss": 1.6212, "step": 222 }, { "epoch": 1.0438853130485664, "grad_norm": 0.5859194993972778, "learning_rate": 9.999328593087158e-05, "loss": 1.644, "step": 223 }, { "epoch": 1.0485664131070802, "grad_norm": 0.5919376611709595, "learning_rate": 9.999187601453726e-05, "loss": 1.6231, "step": 224 }, { "epoch": 1.0532475131655938, "grad_norm": 0.5644704103469849, "learning_rate": 9.999033183566353e-05, "loss": 1.6368, "step": 225 }, { "epoch": 1.0579286132241077, "grad_norm": 0.5851888656616211, "learning_rate": 9.998865339839761e-05, "loss": 1.612, "step": 226 }, { "epoch": 1.0626097132826213, "grad_norm": 0.5965368151664734, "learning_rate": 9.998684070724723e-05, "loss": 1.6369, "step": 227 }, { "epoch": 1.0672908133411352, "grad_norm": 0.5883433222770691, "learning_rate": 9.998489376708073e-05, "loss": 1.6436, "step": 228 }, { "epoch": 1.0719719133996488, "grad_norm": 0.5617659091949463, "learning_rate": 9.998281258312697e-05, "loss": 1.6472, "step": 229 }, { "epoch": 1.0766530134581627, "grad_norm": 0.5981467366218567, "learning_rate": 9.998059716097536e-05, "loss": 1.614, "step": 230 }, { "epoch": 1.0813341135166765, "grad_norm": 0.5670073628425598, "learning_rate": 9.997824750657585e-05, "loss": 1.6469, "step": 231 }, { "epoch": 1.0860152135751902, "grad_norm": 0.5550282001495361, "learning_rate": 9.997576362623887e-05, "loss": 1.6296, "step": 232 }, { "epoch": 1.090696313633704, "grad_norm": 0.5583559274673462, "learning_rate": 9.99731455266353e-05, "loss": 1.6544, "step": 233 }, { "epoch": 1.0953774136922176, "grad_norm": 0.5706636905670166, "learning_rate": 9.99703932147966e-05, "loss": 1.6167, "step": 234 }, { "epoch": 1.1000585137507315, "grad_norm": 0.5709851980209351, "learning_rate": 9.996750669811459e-05, "loss": 1.6209, "step": 235 }, { "epoch": 1.1047396138092451, "grad_norm": 0.5738768577575684, "learning_rate": 9.996448598434156e-05, "loss": 1.6275, "step": 236 }, { "epoch": 1.109420713867759, "grad_norm": 0.5648218989372253, "learning_rate": 9.996133108159018e-05, "loss": 1.5934, "step": 237 }, { "epoch": 1.1141018139262726, "grad_norm": 0.5713233351707458, "learning_rate": 9.995804199833356e-05, "loss": 1.6167, "step": 238 }, { "epoch": 1.1187829139847865, "grad_norm": 0.6131446957588196, "learning_rate": 9.995461874340513e-05, "loss": 1.6463, "step": 239 }, { "epoch": 1.1234640140433, "grad_norm": 0.6103014349937439, "learning_rate": 9.995106132599869e-05, "loss": 1.6078, "step": 240 }, { "epoch": 1.128145114101814, "grad_norm": 0.581728994846344, "learning_rate": 9.994736975566834e-05, "loss": 1.6155, "step": 241 }, { "epoch": 1.1328262141603276, "grad_norm": 0.602294385433197, "learning_rate": 9.99435440423285e-05, "loss": 1.6319, "step": 242 }, { "epoch": 1.1375073142188414, "grad_norm": 0.6152282953262329, "learning_rate": 9.993958419625382e-05, "loss": 1.616, "step": 243 }, { "epoch": 1.142188414277355, "grad_norm": 0.6185297966003418, "learning_rate": 9.993549022807922e-05, "loss": 1.5935, "step": 244 }, { "epoch": 1.146869514335869, "grad_norm": 0.6191090941429138, "learning_rate": 9.993126214879982e-05, "loss": 1.6084, "step": 245 }, { "epoch": 1.1515506143943828, "grad_norm": 0.6081867814064026, "learning_rate": 9.992689996977091e-05, "loss": 1.6216, "step": 246 }, { "epoch": 1.1562317144528964, "grad_norm": 0.5717095136642456, "learning_rate": 9.992240370270795e-05, "loss": 1.6291, "step": 247 }, { "epoch": 1.1609128145114103, "grad_norm": 0.6211952567100525, "learning_rate": 9.991777335968651e-05, "loss": 1.606, "step": 248 }, { "epoch": 1.165593914569924, "grad_norm": 0.5580140948295593, "learning_rate": 9.99130089531422e-05, "loss": 1.6145, "step": 249 }, { "epoch": 1.1702750146284377, "grad_norm": 0.5648279786109924, "learning_rate": 9.990811049587077e-05, "loss": 1.6073, "step": 250 }, { "epoch": 1.1749561146869514, "grad_norm": 0.5821574330329895, "learning_rate": 9.990307800102794e-05, "loss": 1.6062, "step": 251 }, { "epoch": 1.1796372147454652, "grad_norm": 0.5487422943115234, "learning_rate": 9.98979114821294e-05, "loss": 1.6039, "step": 252 }, { "epoch": 1.1843183148039789, "grad_norm": 0.5623614192008972, "learning_rate": 9.989261095305082e-05, "loss": 1.5898, "step": 253 }, { "epoch": 1.1889994148624927, "grad_norm": 0.5610089898109436, "learning_rate": 9.988717642802775e-05, "loss": 1.6122, "step": 254 }, { "epoch": 1.1936805149210064, "grad_norm": 0.5645126700401306, "learning_rate": 9.988160792165562e-05, "loss": 1.6148, "step": 255 }, { "epoch": 1.1983616149795202, "grad_norm": 0.6094059348106384, "learning_rate": 9.98759054488897e-05, "loss": 1.5879, "step": 256 }, { "epoch": 1.203042715038034, "grad_norm": 0.5557142496109009, "learning_rate": 9.987006902504506e-05, "loss": 1.578, "step": 257 }, { "epoch": 1.2077238150965477, "grad_norm": 0.5883256793022156, "learning_rate": 9.98640986657965e-05, "loss": 1.6043, "step": 258 }, { "epoch": 1.2124049151550613, "grad_norm": 0.5736782550811768, "learning_rate": 9.985799438717853e-05, "loss": 1.5862, "step": 259 }, { "epoch": 1.2170860152135752, "grad_norm": 0.5847081542015076, "learning_rate": 9.985175620558535e-05, "loss": 1.5853, "step": 260 }, { "epoch": 1.221767115272089, "grad_norm": 0.5794082283973694, "learning_rate": 9.984538413777076e-05, "loss": 1.6245, "step": 261 }, { "epoch": 1.2264482153306027, "grad_norm": 0.581802248954773, "learning_rate": 9.983887820084812e-05, "loss": 1.601, "step": 262 }, { "epoch": 1.2311293153891165, "grad_norm": 0.5950344800949097, "learning_rate": 9.983223841229037e-05, "loss": 1.6066, "step": 263 }, { "epoch": 1.2358104154476302, "grad_norm": 0.6156999468803406, "learning_rate": 9.982546478992989e-05, "loss": 1.5977, "step": 264 }, { "epoch": 1.240491515506144, "grad_norm": 0.5503116250038147, "learning_rate": 9.98185573519585e-05, "loss": 1.6073, "step": 265 }, { "epoch": 1.2451726155646576, "grad_norm": 0.5789607763290405, "learning_rate": 9.981151611692745e-05, "loss": 1.6041, "step": 266 }, { "epoch": 1.2498537156231715, "grad_norm": 0.6294329762458801, "learning_rate": 9.980434110374724e-05, "loss": 1.5896, "step": 267 }, { "epoch": 1.2545348156816851, "grad_norm": 0.5368711352348328, "learning_rate": 9.979703233168778e-05, "loss": 1.5921, "step": 268 }, { "epoch": 1.259215915740199, "grad_norm": 0.548893392086029, "learning_rate": 9.978958982037809e-05, "loss": 1.5859, "step": 269 }, { "epoch": 1.2638970157987126, "grad_norm": 0.5760201811790466, "learning_rate": 9.978201358980645e-05, "loss": 1.6037, "step": 270 }, { "epoch": 1.2685781158572265, "grad_norm": 0.5687435269355774, "learning_rate": 9.977430366032023e-05, "loss": 1.5622, "step": 271 }, { "epoch": 1.2732592159157403, "grad_norm": 0.606561541557312, "learning_rate": 9.976646005262592e-05, "loss": 1.6117, "step": 272 }, { "epoch": 1.277940315974254, "grad_norm": 0.5877252817153931, "learning_rate": 9.975848278778897e-05, "loss": 1.5912, "step": 273 }, { "epoch": 1.2826214160327676, "grad_norm": 0.582199215888977, "learning_rate": 9.975037188723382e-05, "loss": 1.6029, "step": 274 }, { "epoch": 1.2873025160912814, "grad_norm": 0.5783854722976685, "learning_rate": 9.974212737274382e-05, "loss": 1.5892, "step": 275 }, { "epoch": 1.2919836161497953, "grad_norm": 0.5509589910507202, "learning_rate": 9.973374926646116e-05, "loss": 1.5857, "step": 276 }, { "epoch": 1.296664716208309, "grad_norm": 0.65985107421875, "learning_rate": 9.97252375908868e-05, "loss": 1.6011, "step": 277 }, { "epoch": 1.3013458162668228, "grad_norm": 0.5439913272857666, "learning_rate": 9.971659236888045e-05, "loss": 1.5916, "step": 278 }, { "epoch": 1.3060269163253364, "grad_norm": 0.5911054015159607, "learning_rate": 9.970781362366046e-05, "loss": 1.5833, "step": 279 }, { "epoch": 1.3107080163838503, "grad_norm": 0.5758826732635498, "learning_rate": 9.969890137880377e-05, "loss": 1.5998, "step": 280 }, { "epoch": 1.315389116442364, "grad_norm": 0.5465978980064392, "learning_rate": 9.968985565824596e-05, "loss": 1.6098, "step": 281 }, { "epoch": 1.3200702165008777, "grad_norm": 0.5745389461517334, "learning_rate": 9.968067648628093e-05, "loss": 1.593, "step": 282 }, { "epoch": 1.3247513165593914, "grad_norm": 0.543717086315155, "learning_rate": 9.967136388756109e-05, "loss": 1.5512, "step": 283 }, { "epoch": 1.3294324166179052, "grad_norm": 0.5977076292037964, "learning_rate": 9.966191788709716e-05, "loss": 1.5734, "step": 284 }, { "epoch": 1.3341135166764189, "grad_norm": 0.5420624017715454, "learning_rate": 9.965233851025814e-05, "loss": 1.5766, "step": 285 }, { "epoch": 1.3387946167349327, "grad_norm": 0.5433383584022522, "learning_rate": 9.964262578277127e-05, "loss": 1.561, "step": 286 }, { "epoch": 1.3434757167934466, "grad_norm": 0.5578792095184326, "learning_rate": 9.963277973072187e-05, "loss": 1.5546, "step": 287 }, { "epoch": 1.3481568168519602, "grad_norm": 0.585272490978241, "learning_rate": 9.962280038055335e-05, "loss": 1.5803, "step": 288 }, { "epoch": 1.3528379169104738, "grad_norm": 0.5901860594749451, "learning_rate": 9.961268775906715e-05, "loss": 1.5691, "step": 289 }, { "epoch": 1.3575190169689877, "grad_norm": 0.5460952520370483, "learning_rate": 9.960244189342258e-05, "loss": 1.5787, "step": 290 }, { "epoch": 1.3622001170275015, "grad_norm": 0.5862498879432678, "learning_rate": 9.959206281113684e-05, "loss": 1.5932, "step": 291 }, { "epoch": 1.3668812170860152, "grad_norm": 0.5581315755844116, "learning_rate": 9.95815505400849e-05, "loss": 1.5683, "step": 292 }, { "epoch": 1.371562317144529, "grad_norm": 0.541332483291626, "learning_rate": 9.957090510849943e-05, "loss": 1.5734, "step": 293 }, { "epoch": 1.3762434172030427, "grad_norm": 0.5829706192016602, "learning_rate": 9.956012654497074e-05, "loss": 1.5589, "step": 294 }, { "epoch": 1.3809245172615565, "grad_norm": 0.5810964107513428, "learning_rate": 9.954921487844667e-05, "loss": 1.5859, "step": 295 }, { "epoch": 1.3856056173200701, "grad_norm": 0.6038123965263367, "learning_rate": 9.953817013823252e-05, "loss": 1.5826, "step": 296 }, { "epoch": 1.390286717378584, "grad_norm": 0.5622268915176392, "learning_rate": 9.952699235399104e-05, "loss": 1.571, "step": 297 }, { "epoch": 1.3949678174370979, "grad_norm": 0.5940186977386475, "learning_rate": 9.951568155574225e-05, "loss": 1.5573, "step": 298 }, { "epoch": 1.3996489174956115, "grad_norm": 0.6147366166114807, "learning_rate": 9.950423777386341e-05, "loss": 1.5654, "step": 299 }, { "epoch": 1.4043300175541251, "grad_norm": 0.5400118827819824, "learning_rate": 9.949266103908895e-05, "loss": 1.5529, "step": 300 }, { "epoch": 1.409011117612639, "grad_norm": 0.6384579539299011, "learning_rate": 9.948095138251035e-05, "loss": 1.5939, "step": 301 }, { "epoch": 1.4136922176711528, "grad_norm": 0.566920816898346, "learning_rate": 9.946910883557607e-05, "loss": 1.559, "step": 302 }, { "epoch": 1.4183733177296665, "grad_norm": 0.6052601337432861, "learning_rate": 9.945713343009153e-05, "loss": 1.5554, "step": 303 }, { "epoch": 1.42305441778818, "grad_norm": 0.6517474055290222, "learning_rate": 9.944502519821887e-05, "loss": 1.579, "step": 304 }, { "epoch": 1.427735517846694, "grad_norm": 0.5794010758399963, "learning_rate": 9.943278417247705e-05, "loss": 1.5558, "step": 305 }, { "epoch": 1.4324166179052078, "grad_norm": 0.6108701229095459, "learning_rate": 9.942041038574162e-05, "loss": 1.5675, "step": 306 }, { "epoch": 1.4370977179637214, "grad_norm": 0.59806889295578, "learning_rate": 9.940790387124472e-05, "loss": 1.5565, "step": 307 }, { "epoch": 1.4417788180222353, "grad_norm": 0.5927438735961914, "learning_rate": 9.939526466257489e-05, "loss": 1.5607, "step": 308 }, { "epoch": 1.446459918080749, "grad_norm": 0.5815222263336182, "learning_rate": 9.938249279367714e-05, "loss": 1.5423, "step": 309 }, { "epoch": 1.4511410181392628, "grad_norm": 0.5558337569236755, "learning_rate": 9.936958829885268e-05, "loss": 1.5628, "step": 310 }, { "epoch": 1.4558221181977764, "grad_norm": 0.5937175750732422, "learning_rate": 9.935655121275898e-05, "loss": 1.5852, "step": 311 }, { "epoch": 1.4605032182562903, "grad_norm": 0.64537113904953, "learning_rate": 9.934338157040952e-05, "loss": 1.5671, "step": 312 }, { "epoch": 1.4651843183148041, "grad_norm": 0.5682733654975891, "learning_rate": 9.93300794071739e-05, "loss": 1.5603, "step": 313 }, { "epoch": 1.4698654183733177, "grad_norm": 0.6427823901176453, "learning_rate": 9.931664475877753e-05, "loss": 1.5602, "step": 314 }, { "epoch": 1.4745465184318314, "grad_norm": 0.5664746165275574, "learning_rate": 9.930307766130169e-05, "loss": 1.5843, "step": 315 }, { "epoch": 1.4792276184903452, "grad_norm": 0.6186694502830505, "learning_rate": 9.928937815118336e-05, "loss": 1.5918, "step": 316 }, { "epoch": 1.483908718548859, "grad_norm": 0.5681505799293518, "learning_rate": 9.927554626521512e-05, "loss": 1.5656, "step": 317 }, { "epoch": 1.4885898186073727, "grad_norm": 0.5927398204803467, "learning_rate": 9.926158204054511e-05, "loss": 1.5836, "step": 318 }, { "epoch": 1.4932709186658863, "grad_norm": 0.563877522945404, "learning_rate": 9.924748551467686e-05, "loss": 1.5628, "step": 319 }, { "epoch": 1.4979520187244002, "grad_norm": 0.5428170561790466, "learning_rate": 9.923325672546923e-05, "loss": 1.5528, "step": 320 }, { "epoch": 1.502633118782914, "grad_norm": 0.5832676291465759, "learning_rate": 9.921889571113628e-05, "loss": 1.5662, "step": 321 }, { "epoch": 1.5073142188414277, "grad_norm": 0.5446849465370178, "learning_rate": 9.920440251024723e-05, "loss": 1.5593, "step": 322 }, { "epoch": 1.5119953188999413, "grad_norm": 0.5420470237731934, "learning_rate": 9.918977716172628e-05, "loss": 1.566, "step": 323 }, { "epoch": 1.5166764189584554, "grad_norm": 0.5586629509925842, "learning_rate": 9.917501970485251e-05, "loss": 1.5522, "step": 324 }, { "epoch": 1.521357519016969, "grad_norm": 0.5785658955574036, "learning_rate": 9.916013017925987e-05, "loss": 1.5882, "step": 325 }, { "epoch": 1.5260386190754827, "grad_norm": 0.5468897223472595, "learning_rate": 9.914510862493693e-05, "loss": 1.5523, "step": 326 }, { "epoch": 1.5307197191339965, "grad_norm": 0.5480566024780273, "learning_rate": 9.912995508222693e-05, "loss": 1.5472, "step": 327 }, { "epoch": 1.5354008191925104, "grad_norm": 0.5439897179603577, "learning_rate": 9.911466959182751e-05, "loss": 1.554, "step": 328 }, { "epoch": 1.540081919251024, "grad_norm": 0.5619301795959473, "learning_rate": 9.909925219479072e-05, "loss": 1.5626, "step": 329 }, { "epoch": 1.5447630193095376, "grad_norm": 0.5582358241081238, "learning_rate": 9.90837029325229e-05, "loss": 1.5655, "step": 330 }, { "epoch": 1.5494441193680515, "grad_norm": 0.5682559609413147, "learning_rate": 9.906802184678445e-05, "loss": 1.5417, "step": 331 }, { "epoch": 1.5541252194265653, "grad_norm": 0.5619310140609741, "learning_rate": 9.90522089796899e-05, "loss": 1.5645, "step": 332 }, { "epoch": 1.558806319485079, "grad_norm": 0.551527202129364, "learning_rate": 9.903626437370764e-05, "loss": 1.5417, "step": 333 }, { "epoch": 1.5634874195435926, "grad_norm": 0.5816091895103455, "learning_rate": 9.90201880716599e-05, "loss": 1.5497, "step": 334 }, { "epoch": 1.5681685196021065, "grad_norm": 0.5371972322463989, "learning_rate": 9.900398011672262e-05, "loss": 1.5454, "step": 335 }, { "epoch": 1.5728496196606203, "grad_norm": 0.5627063512802124, "learning_rate": 9.898764055242526e-05, "loss": 1.5649, "step": 336 }, { "epoch": 1.577530719719134, "grad_norm": 0.5842807292938232, "learning_rate": 9.89711694226508e-05, "loss": 1.5496, "step": 337 }, { "epoch": 1.5822118197776478, "grad_norm": 0.5478192567825317, "learning_rate": 9.895456677163555e-05, "loss": 1.5585, "step": 338 }, { "epoch": 1.5868929198361617, "grad_norm": 0.5461902022361755, "learning_rate": 9.893783264396904e-05, "loss": 1.5429, "step": 339 }, { "epoch": 1.5915740198946753, "grad_norm": 0.5406402945518494, "learning_rate": 9.892096708459385e-05, "loss": 1.5555, "step": 340 }, { "epoch": 1.596255119953189, "grad_norm": 0.5658143162727356, "learning_rate": 9.890397013880566e-05, "loss": 1.5528, "step": 341 }, { "epoch": 1.6009362200117028, "grad_norm": 0.5685530304908752, "learning_rate": 9.88868418522529e-05, "loss": 1.5483, "step": 342 }, { "epoch": 1.6056173200702166, "grad_norm": 0.5648348331451416, "learning_rate": 9.886958227093682e-05, "loss": 1.5678, "step": 343 }, { "epoch": 1.6102984201287303, "grad_norm": 0.5785353779792786, "learning_rate": 9.885219144121125e-05, "loss": 1.5427, "step": 344 }, { "epoch": 1.6149795201872439, "grad_norm": 0.6532377004623413, "learning_rate": 9.883466940978252e-05, "loss": 1.5651, "step": 345 }, { "epoch": 1.6196606202457577, "grad_norm": 0.5949984192848206, "learning_rate": 9.881701622370932e-05, "loss": 1.5579, "step": 346 }, { "epoch": 1.6243417203042716, "grad_norm": 0.5811630487442017, "learning_rate": 9.879923193040256e-05, "loss": 1.5637, "step": 347 }, { "epoch": 1.6290228203627852, "grad_norm": 0.5769162774085999, "learning_rate": 9.878131657762535e-05, "loss": 1.5532, "step": 348 }, { "epoch": 1.6337039204212989, "grad_norm": 0.5758988261222839, "learning_rate": 9.876327021349267e-05, "loss": 1.5356, "step": 349 }, { "epoch": 1.6383850204798127, "grad_norm": 0.5641076564788818, "learning_rate": 9.874509288647143e-05, "loss": 1.5598, "step": 350 }, { "epoch": 1.6430661205383266, "grad_norm": 0.5294468402862549, "learning_rate": 9.872678464538021e-05, "loss": 1.5359, "step": 351 }, { "epoch": 1.6477472205968402, "grad_norm": 0.6214150786399841, "learning_rate": 9.870834553938927e-05, "loss": 1.5787, "step": 352 }, { "epoch": 1.652428320655354, "grad_norm": 0.5245915651321411, "learning_rate": 9.868977561802027e-05, "loss": 1.5525, "step": 353 }, { "epoch": 1.657109420713868, "grad_norm": 0.5408384799957275, "learning_rate": 9.867107493114617e-05, "loss": 1.5426, "step": 354 }, { "epoch": 1.6617905207723815, "grad_norm": 0.5473621487617493, "learning_rate": 9.865224352899119e-05, "loss": 1.5414, "step": 355 }, { "epoch": 1.6664716208308952, "grad_norm": 0.5682988166809082, "learning_rate": 9.86332814621306e-05, "loss": 1.5487, "step": 356 }, { "epoch": 1.671152720889409, "grad_norm": 0.5523189306259155, "learning_rate": 9.861418878149056e-05, "loss": 1.5656, "step": 357 }, { "epoch": 1.6758338209479229, "grad_norm": 0.5846049189567566, "learning_rate": 9.859496553834804e-05, "loss": 1.5517, "step": 358 }, { "epoch": 1.6805149210064365, "grad_norm": 0.5750234723091125, "learning_rate": 9.857561178433066e-05, "loss": 1.5317, "step": 359 }, { "epoch": 1.6851960210649501, "grad_norm": 0.5728639364242554, "learning_rate": 9.855612757141655e-05, "loss": 1.5326, "step": 360 }, { "epoch": 1.689877121123464, "grad_norm": 0.5863739252090454, "learning_rate": 9.85365129519342e-05, "loss": 1.5359, "step": 361 }, { "epoch": 1.6945582211819779, "grad_norm": 0.6063214540481567, "learning_rate": 9.851676797856235e-05, "loss": 1.5412, "step": 362 }, { "epoch": 1.6992393212404915, "grad_norm": 0.552778959274292, "learning_rate": 9.849689270432984e-05, "loss": 1.5518, "step": 363 }, { "epoch": 1.7039204212990051, "grad_norm": 0.5694995522499084, "learning_rate": 9.84768871826154e-05, "loss": 1.5323, "step": 364 }, { "epoch": 1.708601521357519, "grad_norm": 0.5607172846794128, "learning_rate": 9.845675146714763e-05, "loss": 1.5434, "step": 365 }, { "epoch": 1.7132826214160328, "grad_norm": 0.62750244140625, "learning_rate": 9.843648561200475e-05, "loss": 1.5401, "step": 366 }, { "epoch": 1.7179637214745465, "grad_norm": 0.559459924697876, "learning_rate": 9.841608967161451e-05, "loss": 1.5438, "step": 367 }, { "epoch": 1.7226448215330603, "grad_norm": 0.6135414242744446, "learning_rate": 9.839556370075404e-05, "loss": 1.5237, "step": 368 }, { "epoch": 1.7273259215915742, "grad_norm": 0.547193169593811, "learning_rate": 9.837490775454965e-05, "loss": 1.5201, "step": 369 }, { "epoch": 1.7320070216500878, "grad_norm": 0.5808367729187012, "learning_rate": 9.835412188847677e-05, "loss": 1.5461, "step": 370 }, { "epoch": 1.7366881217086014, "grad_norm": 0.5561200380325317, "learning_rate": 9.833320615835972e-05, "loss": 1.5562, "step": 371 }, { "epoch": 1.7413692217671153, "grad_norm": 0.549072265625, "learning_rate": 9.831216062037163e-05, "loss": 1.5218, "step": 372 }, { "epoch": 1.7460503218256291, "grad_norm": 0.6117122769355774, "learning_rate": 9.829098533103421e-05, "loss": 1.5527, "step": 373 }, { "epoch": 1.7507314218841428, "grad_norm": 0.5658581256866455, "learning_rate": 9.826968034721765e-05, "loss": 1.5346, "step": 374 }, { "epoch": 1.7554125219426564, "grad_norm": 0.5828503966331482, "learning_rate": 9.824824572614051e-05, "loss": 1.5191, "step": 375 }, { "epoch": 1.7600936220011703, "grad_norm": 0.5669796466827393, "learning_rate": 9.822668152536944e-05, "loss": 1.5269, "step": 376 }, { "epoch": 1.764774722059684, "grad_norm": 0.5611358284950256, "learning_rate": 9.820498780281917e-05, "loss": 1.536, "step": 377 }, { "epoch": 1.7694558221181977, "grad_norm": 0.5313979387283325, "learning_rate": 9.818316461675221e-05, "loss": 1.5189, "step": 378 }, { "epoch": 1.7741369221767114, "grad_norm": 0.5862874984741211, "learning_rate": 9.816121202577886e-05, "loss": 1.5261, "step": 379 }, { "epoch": 1.7788180222352252, "grad_norm": 0.5266857147216797, "learning_rate": 9.813913008885687e-05, "loss": 1.4967, "step": 380 }, { "epoch": 1.783499122293739, "grad_norm": 0.5802350044250488, "learning_rate": 9.811691886529144e-05, "loss": 1.5198, "step": 381 }, { "epoch": 1.7881802223522527, "grad_norm": 0.5663620829582214, "learning_rate": 9.809457841473496e-05, "loss": 1.5432, "step": 382 }, { "epoch": 1.7928613224107666, "grad_norm": 0.5301174521446228, "learning_rate": 9.80721087971869e-05, "loss": 1.5261, "step": 383 }, { "epoch": 1.7975424224692804, "grad_norm": 0.5568957924842834, "learning_rate": 9.804951007299359e-05, "loss": 1.5129, "step": 384 }, { "epoch": 1.802223522527794, "grad_norm": 0.5582797527313232, "learning_rate": 9.802678230284817e-05, "loss": 1.529, "step": 385 }, { "epoch": 1.8069046225863077, "grad_norm": 0.5716988444328308, "learning_rate": 9.800392554779031e-05, "loss": 1.5381, "step": 386 }, { "epoch": 1.8115857226448215, "grad_norm": 0.5414494276046753, "learning_rate": 9.798093986920608e-05, "loss": 1.5332, "step": 387 }, { "epoch": 1.8162668227033354, "grad_norm": 0.531887948513031, "learning_rate": 9.795782532882787e-05, "loss": 1.5408, "step": 388 }, { "epoch": 1.820947922761849, "grad_norm": 0.5692090392112732, "learning_rate": 9.793458198873409e-05, "loss": 1.5233, "step": 389 }, { "epoch": 1.8256290228203627, "grad_norm": 0.6099083423614502, "learning_rate": 9.791120991134904e-05, "loss": 1.5325, "step": 390 }, { "epoch": 1.8303101228788765, "grad_norm": 0.5257400870323181, "learning_rate": 9.788770915944283e-05, "loss": 1.5127, "step": 391 }, { "epoch": 1.8349912229373904, "grad_norm": 0.565856397151947, "learning_rate": 9.786407979613115e-05, "loss": 1.5515, "step": 392 }, { "epoch": 1.839672322995904, "grad_norm": 0.5161274671554565, "learning_rate": 9.784032188487506e-05, "loss": 1.535, "step": 393 }, { "epoch": 1.8443534230544176, "grad_norm": 0.5876962542533875, "learning_rate": 9.781643548948088e-05, "loss": 1.5211, "step": 394 }, { "epoch": 1.8490345231129317, "grad_norm": 0.5334473252296448, "learning_rate": 9.779242067409997e-05, "loss": 1.5147, "step": 395 }, { "epoch": 1.8537156231714453, "grad_norm": 0.5968698263168335, "learning_rate": 9.776827750322865e-05, "loss": 1.5136, "step": 396 }, { "epoch": 1.858396723229959, "grad_norm": 0.5792655348777771, "learning_rate": 9.77440060417079e-05, "loss": 1.5066, "step": 397 }, { "epoch": 1.8630778232884728, "grad_norm": 0.574500322341919, "learning_rate": 9.77196063547233e-05, "loss": 1.5238, "step": 398 }, { "epoch": 1.8677589233469867, "grad_norm": 0.565139889717102, "learning_rate": 9.769507850780474e-05, "loss": 1.5124, "step": 399 }, { "epoch": 1.8724400234055003, "grad_norm": 0.5808327198028564, "learning_rate": 9.767042256682637e-05, "loss": 1.5324, "step": 400 }, { "epoch": 1.877121123464014, "grad_norm": 0.5518105626106262, "learning_rate": 9.764563859800631e-05, "loss": 1.5302, "step": 401 }, { "epoch": 1.8818022235225278, "grad_norm": 0.5466589331626892, "learning_rate": 9.762072666790658e-05, "loss": 1.523, "step": 402 }, { "epoch": 1.8864833235810416, "grad_norm": 0.5499767065048218, "learning_rate": 9.759568684343279e-05, "loss": 1.5018, "step": 403 }, { "epoch": 1.8911644236395553, "grad_norm": 0.6103038787841797, "learning_rate": 9.757051919183411e-05, "loss": 1.5308, "step": 404 }, { "epoch": 1.895845523698069, "grad_norm": 0.5939445495605469, "learning_rate": 9.754522378070297e-05, "loss": 1.5091, "step": 405 }, { "epoch": 1.9005266237565828, "grad_norm": 0.5608471035957336, "learning_rate": 9.75198006779749e-05, "loss": 1.5341, "step": 406 }, { "epoch": 1.9052077238150966, "grad_norm": 0.5591056942939758, "learning_rate": 9.749424995192846e-05, "loss": 1.5248, "step": 407 }, { "epoch": 1.9098888238736103, "grad_norm": 0.5589863657951355, "learning_rate": 9.746857167118486e-05, "loss": 1.53, "step": 408 }, { "epoch": 1.9145699239321239, "grad_norm": 0.533953070640564, "learning_rate": 9.744276590470793e-05, "loss": 1.5411, "step": 409 }, { "epoch": 1.919251023990638, "grad_norm": 0.5937032699584961, "learning_rate": 9.741683272180391e-05, "loss": 1.5225, "step": 410 }, { "epoch": 1.9239321240491516, "grad_norm": 0.5463059544563293, "learning_rate": 9.73907721921212e-05, "loss": 1.5109, "step": 411 }, { "epoch": 1.9286132241076652, "grad_norm": 0.5598007440567017, "learning_rate": 9.736458438565023e-05, "loss": 1.5269, "step": 412 }, { "epoch": 1.933294324166179, "grad_norm": 0.5386488437652588, "learning_rate": 9.733826937272327e-05, "loss": 1.5212, "step": 413 }, { "epoch": 1.937975424224693, "grad_norm": 0.5409656763076782, "learning_rate": 9.731182722401421e-05, "loss": 1.5196, "step": 414 }, { "epoch": 1.9426565242832066, "grad_norm": 0.5412046313285828, "learning_rate": 9.728525801053836e-05, "loss": 1.5181, "step": 415 }, { "epoch": 1.9473376243417202, "grad_norm": 0.5616279244422913, "learning_rate": 9.725856180365236e-05, "loss": 1.5266, "step": 416 }, { "epoch": 1.952018724400234, "grad_norm": 0.5128246545791626, "learning_rate": 9.723173867505381e-05, "loss": 1.5208, "step": 417 }, { "epoch": 1.956699824458748, "grad_norm": 0.5439711213111877, "learning_rate": 9.720478869678132e-05, "loss": 1.4862, "step": 418 }, { "epoch": 1.9613809245172615, "grad_norm": 0.5718542337417603, "learning_rate": 9.717771194121404e-05, "loss": 1.5081, "step": 419 }, { "epoch": 1.9660620245757752, "grad_norm": 0.584561288356781, "learning_rate": 9.715050848107168e-05, "loss": 1.5213, "step": 420 }, { "epoch": 1.970743124634289, "grad_norm": 0.5634207725524902, "learning_rate": 9.712317838941423e-05, "loss": 1.5127, "step": 421 }, { "epoch": 1.9754242246928029, "grad_norm": 0.5634327530860901, "learning_rate": 9.709572173964177e-05, "loss": 1.5197, "step": 422 }, { "epoch": 1.9801053247513165, "grad_norm": 0.5960271954536438, "learning_rate": 9.706813860549424e-05, "loss": 1.5239, "step": 423 }, { "epoch": 1.9847864248098304, "grad_norm": 0.5209662318229675, "learning_rate": 9.704042906105136e-05, "loss": 1.5009, "step": 424 }, { "epoch": 1.9894675248683442, "grad_norm": 0.5604713559150696, "learning_rate": 9.701259318073226e-05, "loss": 1.5221, "step": 425 }, { "epoch": 1.9941486249268578, "grad_norm": 0.545125424861908, "learning_rate": 9.698463103929542e-05, "loss": 1.5277, "step": 426 }, { "epoch": 1.9988297249853715, "grad_norm": 0.5339157581329346, "learning_rate": 9.69565427118384e-05, "loss": 1.4962, "step": 427 }, { "epoch": 2.003510825043885, "grad_norm": 1.5908141136169434, "learning_rate": 9.692832827379768e-05, "loss": 1.4203, "step": 428 }, { "epoch": 2.008191925102399, "grad_norm": 0.6132961511611938, "learning_rate": 9.689998780094837e-05, "loss": 1.5247, "step": 429 }, { "epoch": 2.012873025160913, "grad_norm": 0.5658572912216187, "learning_rate": 9.687152136940417e-05, "loss": 1.5084, "step": 430 }, { "epoch": 2.0175541252194265, "grad_norm": 0.545052170753479, "learning_rate": 9.684292905561696e-05, "loss": 1.5128, "step": 431 }, { "epoch": 2.0222352252779405, "grad_norm": 0.5767326951026917, "learning_rate": 9.681421093637678e-05, "loss": 1.516, "step": 432 }, { "epoch": 2.026916325336454, "grad_norm": 0.5857481360435486, "learning_rate": 9.678536708881149e-05, "loss": 1.493, "step": 433 }, { "epoch": 2.031597425394968, "grad_norm": 0.5660700798034668, "learning_rate": 9.675639759038666e-05, "loss": 1.5246, "step": 434 }, { "epoch": 2.0362785254534814, "grad_norm": 0.5561010837554932, "learning_rate": 9.67273025189053e-05, "loss": 1.4908, "step": 435 }, { "epoch": 2.0409596255119955, "grad_norm": 0.5635765194892883, "learning_rate": 9.669808195250766e-05, "loss": 1.4967, "step": 436 }, { "epoch": 2.045640725570509, "grad_norm": 0.5964431762695312, "learning_rate": 9.666873596967105e-05, "loss": 1.5108, "step": 437 }, { "epoch": 2.0503218256290228, "grad_norm": 0.571668267250061, "learning_rate": 9.663926464920958e-05, "loss": 1.514, "step": 438 }, { "epoch": 2.0550029256875364, "grad_norm": 0.552706241607666, "learning_rate": 9.660966807027402e-05, "loss": 1.5147, "step": 439 }, { "epoch": 2.0596840257460505, "grad_norm": 0.5939638018608093, "learning_rate": 9.657994631235152e-05, "loss": 1.4984, "step": 440 }, { "epoch": 2.064365125804564, "grad_norm": 0.5550068616867065, "learning_rate": 9.655009945526541e-05, "loss": 1.5009, "step": 441 }, { "epoch": 2.0690462258630777, "grad_norm": 0.587196409702301, "learning_rate": 9.652012757917501e-05, "loss": 1.5032, "step": 442 }, { "epoch": 2.0737273259215914, "grad_norm": 0.5701729655265808, "learning_rate": 9.649003076457542e-05, "loss": 1.5315, "step": 443 }, { "epoch": 2.0784084259801054, "grad_norm": 0.5458940267562866, "learning_rate": 9.645980909229726e-05, "loss": 1.5028, "step": 444 }, { "epoch": 2.083089526038619, "grad_norm": 0.5555766224861145, "learning_rate": 9.642946264350648e-05, "loss": 1.4991, "step": 445 }, { "epoch": 2.0877706260971327, "grad_norm": 0.5843587517738342, "learning_rate": 9.639899149970415e-05, "loss": 1.5023, "step": 446 }, { "epoch": 2.092451726155647, "grad_norm": 0.5529284477233887, "learning_rate": 9.636839574272622e-05, "loss": 1.4814, "step": 447 }, { "epoch": 2.0971328262141604, "grad_norm": 0.5499287247657776, "learning_rate": 9.633767545474336e-05, "loss": 1.5233, "step": 448 }, { "epoch": 2.101813926272674, "grad_norm": 0.5526400208473206, "learning_rate": 9.630683071826062e-05, "loss": 1.5028, "step": 449 }, { "epoch": 2.1064950263311877, "grad_norm": 0.5946321487426758, "learning_rate": 9.627586161611732e-05, "loss": 1.5055, "step": 450 }, { "epoch": 2.1111761263897018, "grad_norm": 0.5409477949142456, "learning_rate": 9.624476823148678e-05, "loss": 1.4986, "step": 451 }, { "epoch": 2.1158572264482154, "grad_norm": 0.5718604922294617, "learning_rate": 9.62135506478761e-05, "loss": 1.5062, "step": 452 }, { "epoch": 2.120538326506729, "grad_norm": 0.5604239702224731, "learning_rate": 9.618220894912593e-05, "loss": 1.5252, "step": 453 }, { "epoch": 2.1252194265652427, "grad_norm": 0.5496664643287659, "learning_rate": 9.615074321941031e-05, "loss": 1.5092, "step": 454 }, { "epoch": 2.1299005266237567, "grad_norm": 0.548175036907196, "learning_rate": 9.61191535432363e-05, "loss": 1.5099, "step": 455 }, { "epoch": 2.1345816266822704, "grad_norm": 0.569646418094635, "learning_rate": 9.608744000544392e-05, "loss": 1.501, "step": 456 }, { "epoch": 2.139262726740784, "grad_norm": 0.5541195869445801, "learning_rate": 9.605560269120581e-05, "loss": 1.497, "step": 457 }, { "epoch": 2.1439438267992976, "grad_norm": 0.5321877598762512, "learning_rate": 9.6023641686027e-05, "loss": 1.4933, "step": 458 }, { "epoch": 2.1486249268578117, "grad_norm": 0.574272632598877, "learning_rate": 9.59915570757448e-05, "loss": 1.4919, "step": 459 }, { "epoch": 2.1533060269163253, "grad_norm": 0.5240908861160278, "learning_rate": 9.59593489465284e-05, "loss": 1.5041, "step": 460 }, { "epoch": 2.157987126974839, "grad_norm": 0.5542965531349182, "learning_rate": 9.592701738487877e-05, "loss": 1.4997, "step": 461 }, { "epoch": 2.162668227033353, "grad_norm": 0.562849760055542, "learning_rate": 9.589456247762839e-05, "loss": 1.5099, "step": 462 }, { "epoch": 2.1673493270918667, "grad_norm": 0.5381311178207397, "learning_rate": 9.586198431194095e-05, "loss": 1.4904, "step": 463 }, { "epoch": 2.1720304271503803, "grad_norm": 0.5613717436790466, "learning_rate": 9.582928297531124e-05, "loss": 1.4995, "step": 464 }, { "epoch": 2.176711527208894, "grad_norm": 0.529779314994812, "learning_rate": 9.57964585555648e-05, "loss": 1.4842, "step": 465 }, { "epoch": 2.181392627267408, "grad_norm": 0.5571693778038025, "learning_rate": 9.576351114085777e-05, "loss": 1.5059, "step": 466 }, { "epoch": 2.1860737273259216, "grad_norm": 0.5243085026741028, "learning_rate": 9.57304408196766e-05, "loss": 1.4824, "step": 467 }, { "epoch": 2.1907548273844353, "grad_norm": 0.55620938539505, "learning_rate": 9.56972476808378e-05, "loss": 1.4873, "step": 468 }, { "epoch": 2.195435927442949, "grad_norm": 0.5558146238327026, "learning_rate": 9.566393181348779e-05, "loss": 1.4936, "step": 469 }, { "epoch": 2.200117027501463, "grad_norm": 0.583473801612854, "learning_rate": 9.563049330710253e-05, "loss": 1.463, "step": 470 }, { "epoch": 2.2047981275599766, "grad_norm": 0.5721150636672974, "learning_rate": 9.559693225148743e-05, "loss": 1.5186, "step": 471 }, { "epoch": 2.2094792276184902, "grad_norm": 0.5524104237556458, "learning_rate": 9.556324873677695e-05, "loss": 1.4892, "step": 472 }, { "epoch": 2.2141603276770043, "grad_norm": 0.5464508533477783, "learning_rate": 9.55294428534345e-05, "loss": 1.4778, "step": 473 }, { "epoch": 2.218841427735518, "grad_norm": 0.548009991645813, "learning_rate": 9.549551469225208e-05, "loss": 1.5107, "step": 474 }, { "epoch": 2.2235225277940316, "grad_norm": 0.5196877717971802, "learning_rate": 9.546146434435015e-05, "loss": 1.5064, "step": 475 }, { "epoch": 2.228203627852545, "grad_norm": 0.553367555141449, "learning_rate": 9.542729190117727e-05, "loss": 1.499, "step": 476 }, { "epoch": 2.2328847279110593, "grad_norm": 0.5798020958900452, "learning_rate": 9.539299745450992e-05, "loss": 1.5104, "step": 477 }, { "epoch": 2.237565827969573, "grad_norm": 0.5476927757263184, "learning_rate": 9.535858109645228e-05, "loss": 1.4901, "step": 478 }, { "epoch": 2.2422469280280866, "grad_norm": 0.5290266871452332, "learning_rate": 9.532404291943592e-05, "loss": 1.4774, "step": 479 }, { "epoch": 2.2469280280866, "grad_norm": 0.5534362196922302, "learning_rate": 9.528938301621956e-05, "loss": 1.4947, "step": 480 }, { "epoch": 2.2516091281451143, "grad_norm": 0.5712639689445496, "learning_rate": 9.525460147988888e-05, "loss": 1.4757, "step": 481 }, { "epoch": 2.256290228203628, "grad_norm": 0.527601420879364, "learning_rate": 9.521969840385616e-05, "loss": 1.462, "step": 482 }, { "epoch": 2.2609713282621415, "grad_norm": 0.5285370945930481, "learning_rate": 9.51846738818602e-05, "loss": 1.4846, "step": 483 }, { "epoch": 2.265652428320655, "grad_norm": 0.5500218868255615, "learning_rate": 9.514952800796589e-05, "loss": 1.492, "step": 484 }, { "epoch": 2.2703335283791692, "grad_norm": 0.5409276485443115, "learning_rate": 9.511426087656401e-05, "loss": 1.4878, "step": 485 }, { "epoch": 2.275014628437683, "grad_norm": 0.5382949709892273, "learning_rate": 9.507887258237112e-05, "loss": 1.5157, "step": 486 }, { "epoch": 2.2796957284961965, "grad_norm": 0.5844248533248901, "learning_rate": 9.504336322042904e-05, "loss": 1.4953, "step": 487 }, { "epoch": 2.28437682855471, "grad_norm": 0.5399705767631531, "learning_rate": 9.500773288610484e-05, "loss": 1.4921, "step": 488 }, { "epoch": 2.289057928613224, "grad_norm": 0.5701977014541626, "learning_rate": 9.497198167509044e-05, "loss": 1.4958, "step": 489 }, { "epoch": 2.293739028671738, "grad_norm": 0.5427618026733398, "learning_rate": 9.49361096834024e-05, "loss": 1.4818, "step": 490 }, { "epoch": 2.2984201287302515, "grad_norm": 0.5544453263282776, "learning_rate": 9.49001170073817e-05, "loss": 1.4983, "step": 491 }, { "epoch": 2.3031012287887656, "grad_norm": 0.5501473546028137, "learning_rate": 9.48640037436934e-05, "loss": 1.4433, "step": 492 }, { "epoch": 2.307782328847279, "grad_norm": 0.5611422657966614, "learning_rate": 9.482776998932639e-05, "loss": 1.5056, "step": 493 }, { "epoch": 2.312463428905793, "grad_norm": 0.5366073250770569, "learning_rate": 9.479141584159324e-05, "loss": 1.5156, "step": 494 }, { "epoch": 2.3171445289643064, "grad_norm": 0.5340487957000732, "learning_rate": 9.475494139812979e-05, "loss": 1.4878, "step": 495 }, { "epoch": 2.3218256290228205, "grad_norm": 0.5424363017082214, "learning_rate": 9.471834675689499e-05, "loss": 1.4718, "step": 496 }, { "epoch": 2.326506729081334, "grad_norm": 0.5500884652137756, "learning_rate": 9.468163201617062e-05, "loss": 1.4989, "step": 497 }, { "epoch": 2.331187829139848, "grad_norm": 0.5656370520591736, "learning_rate": 9.464479727456097e-05, "loss": 1.494, "step": 498 }, { "epoch": 2.3358689291983614, "grad_norm": 0.5271655321121216, "learning_rate": 9.460784263099262e-05, "loss": 1.4938, "step": 499 }, { "epoch": 2.3405500292568755, "grad_norm": 0.5550000071525574, "learning_rate": 9.457076818471421e-05, "loss": 1.4722, "step": 500 }, { "epoch": 2.345231129315389, "grad_norm": 0.5688049793243408, "learning_rate": 9.453357403529609e-05, "loss": 1.5075, "step": 501 }, { "epoch": 2.3499122293739028, "grad_norm": 0.5786244869232178, "learning_rate": 9.449626028263011e-05, "loss": 1.4759, "step": 502 }, { "epoch": 2.354593329432417, "grad_norm": 0.5229558944702148, "learning_rate": 9.445882702692933e-05, "loss": 1.4841, "step": 503 }, { "epoch": 2.3592744294909305, "grad_norm": 0.530393123626709, "learning_rate": 9.442127436872778e-05, "loss": 1.4722, "step": 504 }, { "epoch": 2.363955529549444, "grad_norm": 0.5296366214752197, "learning_rate": 9.438360240888014e-05, "loss": 1.4728, "step": 505 }, { "epoch": 2.3686366296079577, "grad_norm": 0.5329553484916687, "learning_rate": 9.43458112485615e-05, "loss": 1.4928, "step": 506 }, { "epoch": 2.373317729666472, "grad_norm": 0.5391282439231873, "learning_rate": 9.430790098926711e-05, "loss": 1.4776, "step": 507 }, { "epoch": 2.3779988297249854, "grad_norm": 0.5178598761558533, "learning_rate": 9.426987173281208e-05, "loss": 1.5091, "step": 508 }, { "epoch": 2.382679929783499, "grad_norm": 0.5349472165107727, "learning_rate": 9.423172358133106e-05, "loss": 1.4791, "step": 509 }, { "epoch": 2.3873610298420127, "grad_norm": 0.5359641909599304, "learning_rate": 9.419345663727805e-05, "loss": 1.4864, "step": 510 }, { "epoch": 2.392042129900527, "grad_norm": 0.526164710521698, "learning_rate": 9.415507100342613e-05, "loss": 1.4795, "step": 511 }, { "epoch": 2.3967232299590404, "grad_norm": 0.5576329827308655, "learning_rate": 9.411656678286709e-05, "loss": 1.4974, "step": 512 }, { "epoch": 2.401404330017554, "grad_norm": 0.533876359462738, "learning_rate": 9.407794407901119e-05, "loss": 1.4956, "step": 513 }, { "epoch": 2.406085430076068, "grad_norm": 0.5780550241470337, "learning_rate": 9.403920299558697e-05, "loss": 1.4695, "step": 514 }, { "epoch": 2.4107665301345818, "grad_norm": 0.5714160799980164, "learning_rate": 9.400034363664083e-05, "loss": 1.478, "step": 515 }, { "epoch": 2.4154476301930954, "grad_norm": 0.5485972166061401, "learning_rate": 9.396136610653687e-05, "loss": 1.4872, "step": 516 }, { "epoch": 2.420128730251609, "grad_norm": 0.5578656196594238, "learning_rate": 9.392227050995653e-05, "loss": 1.4811, "step": 517 }, { "epoch": 2.4248098303101226, "grad_norm": 0.513107180595398, "learning_rate": 9.388305695189837e-05, "loss": 1.469, "step": 518 }, { "epoch": 2.4294909303686367, "grad_norm": 0.5163384079933167, "learning_rate": 9.384372553767771e-05, "loss": 1.4785, "step": 519 }, { "epoch": 2.4341720304271504, "grad_norm": 0.5341216921806335, "learning_rate": 9.380427637292646e-05, "loss": 1.4853, "step": 520 }, { "epoch": 2.438853130485664, "grad_norm": 0.5262408256530762, "learning_rate": 9.376470956359269e-05, "loss": 1.4675, "step": 521 }, { "epoch": 2.443534230544178, "grad_norm": 0.5297648310661316, "learning_rate": 9.372502521594052e-05, "loss": 1.4609, "step": 522 }, { "epoch": 2.4482153306026917, "grad_norm": 0.5424336194992065, "learning_rate": 9.368522343654967e-05, "loss": 1.5052, "step": 523 }, { "epoch": 2.4528964306612053, "grad_norm": 0.5182657837867737, "learning_rate": 9.364530433231524e-05, "loss": 1.4835, "step": 524 }, { "epoch": 2.457577530719719, "grad_norm": 0.5619286298751831, "learning_rate": 9.360526801044752e-05, "loss": 1.4511, "step": 525 }, { "epoch": 2.462258630778233, "grad_norm": 0.5724915266036987, "learning_rate": 9.356511457847148e-05, "loss": 1.4618, "step": 526 }, { "epoch": 2.4669397308367467, "grad_norm": 0.5449216365814209, "learning_rate": 9.352484414422674e-05, "loss": 1.4844, "step": 527 }, { "epoch": 2.4716208308952603, "grad_norm": 0.5833266973495483, "learning_rate": 9.348445681586702e-05, "loss": 1.4519, "step": 528 }, { "epoch": 2.476301930953774, "grad_norm": 0.5463898181915283, "learning_rate": 9.344395270186008e-05, "loss": 1.4771, "step": 529 }, { "epoch": 2.480983031012288, "grad_norm": 0.529358983039856, "learning_rate": 9.340333191098733e-05, "loss": 1.4712, "step": 530 }, { "epoch": 2.4856641310708016, "grad_norm": 0.5545377135276794, "learning_rate": 9.336259455234347e-05, "loss": 1.4848, "step": 531 }, { "epoch": 2.4903452311293153, "grad_norm": 0.5353593230247498, "learning_rate": 9.332174073533628e-05, "loss": 1.492, "step": 532 }, { "epoch": 2.4950263311878293, "grad_norm": 0.5352204442024231, "learning_rate": 9.328077056968639e-05, "loss": 1.4894, "step": 533 }, { "epoch": 2.499707431246343, "grad_norm": 0.5731208324432373, "learning_rate": 9.32396841654268e-05, "loss": 1.4775, "step": 534 }, { "epoch": 2.5043885313048566, "grad_norm": 0.5478562712669373, "learning_rate": 9.319848163290279e-05, "loss": 1.4688, "step": 535 }, { "epoch": 2.5090696313633702, "grad_norm": 0.5327141284942627, "learning_rate": 9.31571630827714e-05, "loss": 1.4791, "step": 536 }, { "epoch": 2.513750731421884, "grad_norm": 0.5497174859046936, "learning_rate": 9.311572862600139e-05, "loss": 1.4884, "step": 537 }, { "epoch": 2.518431831480398, "grad_norm": 0.5478792190551758, "learning_rate": 9.307417837387273e-05, "loss": 1.4544, "step": 538 }, { "epoch": 2.5231129315389116, "grad_norm": 0.5436558723449707, "learning_rate": 9.303251243797638e-05, "loss": 1.4773, "step": 539 }, { "epoch": 2.527794031597425, "grad_norm": 0.5304036140441895, "learning_rate": 9.299073093021405e-05, "loss": 1.4674, "step": 540 }, { "epoch": 2.5324751316559393, "grad_norm": 0.5399511456489563, "learning_rate": 9.294883396279774e-05, "loss": 1.4726, "step": 541 }, { "epoch": 2.537156231714453, "grad_norm": 0.5874165296554565, "learning_rate": 9.290682164824964e-05, "loss": 1.4801, "step": 542 }, { "epoch": 2.5418373317729666, "grad_norm": 0.54348224401474, "learning_rate": 9.286469409940168e-05, "loss": 1.4638, "step": 543 }, { "epoch": 2.5465184318314806, "grad_norm": 0.5654876232147217, "learning_rate": 9.282245142939528e-05, "loss": 1.4728, "step": 544 }, { "epoch": 2.5511995318899943, "grad_norm": 0.5578087568283081, "learning_rate": 9.278009375168102e-05, "loss": 1.4708, "step": 545 }, { "epoch": 2.555880631948508, "grad_norm": 0.5147175788879395, "learning_rate": 9.273762118001837e-05, "loss": 1.4613, "step": 546 }, { "epoch": 2.5605617320070215, "grad_norm": 0.5677844285964966, "learning_rate": 9.269503382847539e-05, "loss": 1.4805, "step": 547 }, { "epoch": 2.565242832065535, "grad_norm": 0.5775290131568909, "learning_rate": 9.265233181142836e-05, "loss": 1.4691, "step": 548 }, { "epoch": 2.5699239321240492, "grad_norm": 0.5376417636871338, "learning_rate": 9.260951524356157e-05, "loss": 1.4742, "step": 549 }, { "epoch": 2.574605032182563, "grad_norm": 0.5477780103683472, "learning_rate": 9.256658423986689e-05, "loss": 1.4852, "step": 550 }, { "epoch": 2.5792861322410765, "grad_norm": 0.5555460453033447, "learning_rate": 9.252353891564358e-05, "loss": 1.4658, "step": 551 }, { "epoch": 2.5839672322995906, "grad_norm": 0.5579097867012024, "learning_rate": 9.248037938649792e-05, "loss": 1.466, "step": 552 }, { "epoch": 2.588648332358104, "grad_norm": 0.5582410097122192, "learning_rate": 9.24371057683429e-05, "loss": 1.4521, "step": 553 }, { "epoch": 2.593329432416618, "grad_norm": 0.5532661080360413, "learning_rate": 9.239371817739793e-05, "loss": 1.4845, "step": 554 }, { "epoch": 2.598010532475132, "grad_norm": 0.5436837673187256, "learning_rate": 9.235021673018849e-05, "loss": 1.4701, "step": 555 }, { "epoch": 2.6026916325336455, "grad_norm": 0.5618695020675659, "learning_rate": 9.230660154354587e-05, "loss": 1.4644, "step": 556 }, { "epoch": 2.607372732592159, "grad_norm": 0.5521680116653442, "learning_rate": 9.22628727346068e-05, "loss": 1.4754, "step": 557 }, { "epoch": 2.612053832650673, "grad_norm": 0.5556138753890991, "learning_rate": 9.221903042081319e-05, "loss": 1.4681, "step": 558 }, { "epoch": 2.6167349327091864, "grad_norm": 0.538085401058197, "learning_rate": 9.21750747199118e-05, "loss": 1.4899, "step": 559 }, { "epoch": 2.6214160327677005, "grad_norm": 0.5571359992027283, "learning_rate": 9.213100574995386e-05, "loss": 1.4526, "step": 560 }, { "epoch": 2.626097132826214, "grad_norm": 0.5502709150314331, "learning_rate": 9.208682362929484e-05, "loss": 1.4645, "step": 561 }, { "epoch": 2.630778232884728, "grad_norm": 0.5496858358383179, "learning_rate": 9.204252847659411e-05, "loss": 1.4672, "step": 562 }, { "epoch": 2.635459332943242, "grad_norm": 0.5407517552375793, "learning_rate": 9.199812041081459e-05, "loss": 1.468, "step": 563 }, { "epoch": 2.6401404330017555, "grad_norm": 0.5413292050361633, "learning_rate": 9.195359955122244e-05, "loss": 1.4667, "step": 564 }, { "epoch": 2.644821533060269, "grad_norm": 0.5415422916412354, "learning_rate": 9.190896601738677e-05, "loss": 1.469, "step": 565 }, { "epoch": 2.6495026331187828, "grad_norm": 0.5778108239173889, "learning_rate": 9.186421992917928e-05, "loss": 1.463, "step": 566 }, { "epoch": 2.6541837331772964, "grad_norm": 0.5511682033538818, "learning_rate": 9.181936140677398e-05, "loss": 1.4663, "step": 567 }, { "epoch": 2.6588648332358105, "grad_norm": 0.5939563512802124, "learning_rate": 9.177439057064683e-05, "loss": 1.4887, "step": 568 }, { "epoch": 2.663545933294324, "grad_norm": 0.5491057634353638, "learning_rate": 9.17293075415754e-05, "loss": 1.4704, "step": 569 }, { "epoch": 2.6682270333528377, "grad_norm": 0.528983473777771, "learning_rate": 9.168411244063863e-05, "loss": 1.4699, "step": 570 }, { "epoch": 2.672908133411352, "grad_norm": 0.5451183915138245, "learning_rate": 9.163880538921638e-05, "loss": 1.4635, "step": 571 }, { "epoch": 2.6775892334698654, "grad_norm": 0.562751054763794, "learning_rate": 9.159338650898927e-05, "loss": 1.4656, "step": 572 }, { "epoch": 2.682270333528379, "grad_norm": 0.5425247550010681, "learning_rate": 9.15478559219382e-05, "loss": 1.4799, "step": 573 }, { "epoch": 2.686951433586893, "grad_norm": 0.5311916470527649, "learning_rate": 9.150221375034404e-05, "loss": 1.4587, "step": 574 }, { "epoch": 2.6916325336454068, "grad_norm": 0.5678891539573669, "learning_rate": 9.14564601167874e-05, "loss": 1.5004, "step": 575 }, { "epoch": 2.6963136337039204, "grad_norm": 0.5324888229370117, "learning_rate": 9.141059514414824e-05, "loss": 1.4589, "step": 576 }, { "epoch": 2.700994733762434, "grad_norm": 0.5441606044769287, "learning_rate": 9.136461895560553e-05, "loss": 1.4517, "step": 577 }, { "epoch": 2.7056758338209477, "grad_norm": 0.5288588404655457, "learning_rate": 9.131853167463692e-05, "loss": 1.4729, "step": 578 }, { "epoch": 2.7103569338794617, "grad_norm": 0.5603190660476685, "learning_rate": 9.12723334250184e-05, "loss": 1.4639, "step": 579 }, { "epoch": 2.7150380339379754, "grad_norm": 0.5756075382232666, "learning_rate": 9.122602433082408e-05, "loss": 1.4742, "step": 580 }, { "epoch": 2.719719133996489, "grad_norm": 0.5515328049659729, "learning_rate": 9.117960451642565e-05, "loss": 1.4475, "step": 581 }, { "epoch": 2.724400234055003, "grad_norm": 0.5273950695991516, "learning_rate": 9.11330741064922e-05, "loss": 1.4705, "step": 582 }, { "epoch": 2.7290813341135167, "grad_norm": 0.5433343052864075, "learning_rate": 9.108643322598991e-05, "loss": 1.4691, "step": 583 }, { "epoch": 2.7337624341720304, "grad_norm": 0.5376033782958984, "learning_rate": 9.103968200018154e-05, "loss": 1.4469, "step": 584 }, { "epoch": 2.7384435342305444, "grad_norm": 0.5662835240364075, "learning_rate": 9.09928205546263e-05, "loss": 1.468, "step": 585 }, { "epoch": 2.743124634289058, "grad_norm": 0.5775234699249268, "learning_rate": 9.094584901517933e-05, "loss": 1.4454, "step": 586 }, { "epoch": 2.7478057343475717, "grad_norm": 0.5586291551589966, "learning_rate": 9.089876750799152e-05, "loss": 1.4382, "step": 587 }, { "epoch": 2.7524868344060853, "grad_norm": 0.5502759218215942, "learning_rate": 9.085157615950906e-05, "loss": 1.4901, "step": 588 }, { "epoch": 2.757167934464599, "grad_norm": 0.5833450555801392, "learning_rate": 9.080427509647314e-05, "loss": 1.473, "step": 589 }, { "epoch": 2.761849034523113, "grad_norm": 0.5477181077003479, "learning_rate": 9.075686444591964e-05, "loss": 1.4784, "step": 590 }, { "epoch": 2.7665301345816267, "grad_norm": 0.5742630958557129, "learning_rate": 9.070934433517873e-05, "loss": 1.4615, "step": 591 }, { "epoch": 2.7712112346401403, "grad_norm": 0.5728650093078613, "learning_rate": 9.066171489187452e-05, "loss": 1.4766, "step": 592 }, { "epoch": 2.7758923346986544, "grad_norm": 0.5450015068054199, "learning_rate": 9.061397624392485e-05, "loss": 1.4638, "step": 593 }, { "epoch": 2.780573434757168, "grad_norm": 0.5659042000770569, "learning_rate": 9.056612851954077e-05, "loss": 1.4474, "step": 594 }, { "epoch": 2.7852545348156816, "grad_norm": 0.5580088496208191, "learning_rate": 9.051817184722629e-05, "loss": 1.4765, "step": 595 }, { "epoch": 2.7899356348741957, "grad_norm": 0.5821008086204529, "learning_rate": 9.047010635577805e-05, "loss": 1.457, "step": 596 }, { "epoch": 2.7946167349327093, "grad_norm": 0.5484012961387634, "learning_rate": 9.04219321742849e-05, "loss": 1.4658, "step": 597 }, { "epoch": 2.799297834991223, "grad_norm": 0.596698522567749, "learning_rate": 9.037364943212763e-05, "loss": 1.4584, "step": 598 }, { "epoch": 2.8039789350497366, "grad_norm": 0.5309833288192749, "learning_rate": 9.032525825897859e-05, "loss": 1.4387, "step": 599 }, { "epoch": 2.8086600351082502, "grad_norm": 0.6024175882339478, "learning_rate": 9.027675878480131e-05, "loss": 1.4539, "step": 600 }, { "epoch": 2.8133411351667643, "grad_norm": 0.5484504103660583, "learning_rate": 9.022815113985023e-05, "loss": 1.4633, "step": 601 }, { "epoch": 2.818022235225278, "grad_norm": 0.567604124546051, "learning_rate": 9.017943545467027e-05, "loss": 1.4596, "step": 602 }, { "epoch": 2.8227033352837916, "grad_norm": 0.5658008456230164, "learning_rate": 9.013061186009653e-05, "loss": 1.4869, "step": 603 }, { "epoch": 2.8273844353423057, "grad_norm": 0.5770115852355957, "learning_rate": 9.008168048725387e-05, "loss": 1.4671, "step": 604 }, { "epoch": 2.8320655354008193, "grad_norm": 0.5261843800544739, "learning_rate": 9.00326414675567e-05, "loss": 1.4276, "step": 605 }, { "epoch": 2.836746635459333, "grad_norm": 0.5513188242912292, "learning_rate": 8.998349493270846e-05, "loss": 1.4538, "step": 606 }, { "epoch": 2.8414277355178466, "grad_norm": 0.5463377237319946, "learning_rate": 8.993424101470136e-05, "loss": 1.4747, "step": 607 }, { "epoch": 2.84610883557636, "grad_norm": 0.5515256524085999, "learning_rate": 8.988487984581605e-05, "loss": 1.4463, "step": 608 }, { "epoch": 2.8507899356348743, "grad_norm": 0.5260566473007202, "learning_rate": 8.983541155862114e-05, "loss": 1.4626, "step": 609 }, { "epoch": 2.855471035693388, "grad_norm": 0.5268198251724243, "learning_rate": 8.9785836285973e-05, "loss": 1.4535, "step": 610 }, { "epoch": 2.8601521357519015, "grad_norm": 0.5588566064834595, "learning_rate": 8.97361541610153e-05, "loss": 1.4401, "step": 611 }, { "epoch": 2.8648332358104156, "grad_norm": 0.5275058746337891, "learning_rate": 8.968636531717868e-05, "loss": 1.4727, "step": 612 }, { "epoch": 2.8695143358689292, "grad_norm": 0.5546239018440247, "learning_rate": 8.963646988818042e-05, "loss": 1.482, "step": 613 }, { "epoch": 2.874195435927443, "grad_norm": 0.5342071056365967, "learning_rate": 8.958646800802401e-05, "loss": 1.4612, "step": 614 }, { "epoch": 2.878876535985957, "grad_norm": 0.5397853851318359, "learning_rate": 8.953635981099887e-05, "loss": 1.4502, "step": 615 }, { "epoch": 2.8835576360444706, "grad_norm": 0.5423009991645813, "learning_rate": 8.948614543167993e-05, "loss": 1.4689, "step": 616 }, { "epoch": 2.888238736102984, "grad_norm": 0.5220727920532227, "learning_rate": 8.94358250049273e-05, "loss": 1.4529, "step": 617 }, { "epoch": 2.892919836161498, "grad_norm": 0.5633693933486938, "learning_rate": 8.938539866588592e-05, "loss": 1.432, "step": 618 }, { "epoch": 2.8976009362200115, "grad_norm": 0.5513154864311218, "learning_rate": 8.933486654998515e-05, "loss": 1.4665, "step": 619 }, { "epoch": 2.9022820362785255, "grad_norm": 0.5335742831230164, "learning_rate": 8.928422879293845e-05, "loss": 1.4545, "step": 620 }, { "epoch": 2.906963136337039, "grad_norm": 0.556303083896637, "learning_rate": 8.9233485530743e-05, "loss": 1.4576, "step": 621 }, { "epoch": 2.911644236395553, "grad_norm": 0.5727290511131287, "learning_rate": 8.91826368996793e-05, "loss": 1.463, "step": 622 }, { "epoch": 2.916325336454067, "grad_norm": 0.5217007994651794, "learning_rate": 8.91316830363109e-05, "loss": 1.4659, "step": 623 }, { "epoch": 2.9210064365125805, "grad_norm": 0.5431603789329529, "learning_rate": 8.908062407748393e-05, "loss": 1.4581, "step": 624 }, { "epoch": 2.925687536571094, "grad_norm": 0.5315890312194824, "learning_rate": 8.902946016032676e-05, "loss": 1.4412, "step": 625 }, { "epoch": 2.9303686366296082, "grad_norm": 0.5527530908584595, "learning_rate": 8.897819142224967e-05, "loss": 1.4648, "step": 626 }, { "epoch": 2.935049736688122, "grad_norm": 0.5519826412200928, "learning_rate": 8.892681800094447e-05, "loss": 1.459, "step": 627 }, { "epoch": 2.9397308367466355, "grad_norm": 0.5363120436668396, "learning_rate": 8.887534003438408e-05, "loss": 1.4371, "step": 628 }, { "epoch": 2.944411936805149, "grad_norm": 0.5575461983680725, "learning_rate": 8.882375766082222e-05, "loss": 1.4323, "step": 629 }, { "epoch": 2.9490930368636628, "grad_norm": 0.5456847548484802, "learning_rate": 8.877207101879302e-05, "loss": 1.4561, "step": 630 }, { "epoch": 2.953774136922177, "grad_norm": 0.6018211245536804, "learning_rate": 8.87202802471106e-05, "loss": 1.4586, "step": 631 }, { "epoch": 2.9584552369806905, "grad_norm": 0.5407954454421997, "learning_rate": 8.86683854848688e-05, "loss": 1.4498, "step": 632 }, { "epoch": 2.963136337039204, "grad_norm": 0.5806236863136292, "learning_rate": 8.86163868714407e-05, "loss": 1.4716, "step": 633 }, { "epoch": 2.967817437097718, "grad_norm": 0.5460893511772156, "learning_rate": 8.856428454647831e-05, "loss": 1.4461, "step": 634 }, { "epoch": 2.972498537156232, "grad_norm": 0.5376127362251282, "learning_rate": 8.85120786499122e-05, "loss": 1.4402, "step": 635 }, { "epoch": 2.9771796372147454, "grad_norm": 0.557420015335083, "learning_rate": 8.845976932195103e-05, "loss": 1.4495, "step": 636 }, { "epoch": 2.981860737273259, "grad_norm": 0.5634372234344482, "learning_rate": 8.840735670308133e-05, "loss": 1.4728, "step": 637 }, { "epoch": 2.9865418373317727, "grad_norm": 0.5169801115989685, "learning_rate": 8.835484093406698e-05, "loss": 1.4493, "step": 638 }, { "epoch": 2.9912229373902868, "grad_norm": 0.555625319480896, "learning_rate": 8.83022221559489e-05, "loss": 1.4539, "step": 639 }, { "epoch": 2.9959040374488004, "grad_norm": 0.5352002382278442, "learning_rate": 8.82495005100447e-05, "loss": 1.4447, "step": 640 }, { "epoch": 3.000585137507314, "grad_norm": 1.3402049541473389, "learning_rate": 8.819667613794818e-05, "loss": 1.3743, "step": 641 }, { "epoch": 3.005266237565828, "grad_norm": 0.5957451462745667, "learning_rate": 8.81437491815291e-05, "loss": 1.4553, "step": 642 }, { "epoch": 3.0099473376243417, "grad_norm": 0.5585767030715942, "learning_rate": 8.809071978293271e-05, "loss": 1.4212, "step": 643 }, { "epoch": 3.0146284376828554, "grad_norm": 0.5555313229560852, "learning_rate": 8.803758808457939e-05, "loss": 1.4537, "step": 644 }, { "epoch": 3.019309537741369, "grad_norm": 0.5627725124359131, "learning_rate": 8.798435422916425e-05, "loss": 1.4445, "step": 645 }, { "epoch": 3.023990637799883, "grad_norm": 0.5429000854492188, "learning_rate": 8.793101835965678e-05, "loss": 1.4572, "step": 646 }, { "epoch": 3.0286717378583967, "grad_norm": 0.5517989993095398, "learning_rate": 8.787758061930043e-05, "loss": 1.4421, "step": 647 }, { "epoch": 3.0333528379169103, "grad_norm": 0.5385038256645203, "learning_rate": 8.782404115161225e-05, "loss": 1.4421, "step": 648 }, { "epoch": 3.0380339379754244, "grad_norm": 0.5344865322113037, "learning_rate": 8.777040010038251e-05, "loss": 1.4546, "step": 649 }, { "epoch": 3.042715038033938, "grad_norm": 0.5500159859657288, "learning_rate": 8.771665760967429e-05, "loss": 1.4635, "step": 650 }, { "epoch": 3.0473961380924517, "grad_norm": 0.5563318729400635, "learning_rate": 8.766281382382311e-05, "loss": 1.4304, "step": 651 }, { "epoch": 3.0520772381509653, "grad_norm": 0.5369390249252319, "learning_rate": 8.760886888743653e-05, "loss": 1.4385, "step": 652 }, { "epoch": 3.0567583382094794, "grad_norm": 0.5653578042984009, "learning_rate": 8.755482294539379e-05, "loss": 1.4691, "step": 653 }, { "epoch": 3.061439438267993, "grad_norm": 0.5696621537208557, "learning_rate": 8.750067614284534e-05, "loss": 1.4381, "step": 654 }, { "epoch": 3.0661205383265067, "grad_norm": 0.5326243042945862, "learning_rate": 8.74464286252126e-05, "loss": 1.4609, "step": 655 }, { "epoch": 3.0708016383850203, "grad_norm": 0.5605649352073669, "learning_rate": 8.73920805381874e-05, "loss": 1.4236, "step": 656 }, { "epoch": 3.0754827384435344, "grad_norm": 0.5605049133300781, "learning_rate": 8.733763202773171e-05, "loss": 1.4739, "step": 657 }, { "epoch": 3.080163838502048, "grad_norm": 0.5228269696235657, "learning_rate": 8.72830832400772e-05, "loss": 1.4376, "step": 658 }, { "epoch": 3.0848449385605616, "grad_norm": 0.5667708516120911, "learning_rate": 8.722843432172484e-05, "loss": 1.4552, "step": 659 }, { "epoch": 3.0895260386190753, "grad_norm": 0.5448529124259949, "learning_rate": 8.717368541944452e-05, "loss": 1.4637, "step": 660 }, { "epoch": 3.0942071386775893, "grad_norm": 0.5323814153671265, "learning_rate": 8.711883668027469e-05, "loss": 1.4236, "step": 661 }, { "epoch": 3.098888238736103, "grad_norm": 0.532503068447113, "learning_rate": 8.70638882515219e-05, "loss": 1.4307, "step": 662 }, { "epoch": 3.1035693387946166, "grad_norm": 0.5616405606269836, "learning_rate": 8.700884028076042e-05, "loss": 1.4563, "step": 663 }, { "epoch": 3.1082504388531307, "grad_norm": 0.533190906047821, "learning_rate": 8.695369291583188e-05, "loss": 1.4402, "step": 664 }, { "epoch": 3.1129315389116443, "grad_norm": 0.5488091111183167, "learning_rate": 8.689844630484485e-05, "loss": 1.4464, "step": 665 }, { "epoch": 3.117612638970158, "grad_norm": 0.5348528027534485, "learning_rate": 8.684310059617448e-05, "loss": 1.4316, "step": 666 }, { "epoch": 3.1222937390286716, "grad_norm": 0.5574296116828918, "learning_rate": 8.678765593846197e-05, "loss": 1.4475, "step": 667 }, { "epoch": 3.1269748390871857, "grad_norm": 0.5486092567443848, "learning_rate": 8.673211248061436e-05, "loss": 1.4496, "step": 668 }, { "epoch": 3.1316559391456993, "grad_norm": 0.5664199590682983, "learning_rate": 8.667647037180402e-05, "loss": 1.4527, "step": 669 }, { "epoch": 3.136337039204213, "grad_norm": 0.5176118016242981, "learning_rate": 8.66207297614682e-05, "loss": 1.4533, "step": 670 }, { "epoch": 3.1410181392627265, "grad_norm": 0.5346795916557312, "learning_rate": 8.656489079930878e-05, "loss": 1.4466, "step": 671 }, { "epoch": 3.1456992393212406, "grad_norm": 0.5439758896827698, "learning_rate": 8.650895363529173e-05, "loss": 1.464, "step": 672 }, { "epoch": 3.1503803393797543, "grad_norm": 0.5399249792098999, "learning_rate": 8.645291841964675e-05, "loss": 1.4438, "step": 673 }, { "epoch": 3.155061439438268, "grad_norm": 0.5242234468460083, "learning_rate": 8.639678530286693e-05, "loss": 1.4551, "step": 674 }, { "epoch": 3.159742539496782, "grad_norm": 0.5472708940505981, "learning_rate": 8.634055443570826e-05, "loss": 1.4368, "step": 675 }, { "epoch": 3.1644236395552956, "grad_norm": 0.5396461486816406, "learning_rate": 8.628422596918923e-05, "loss": 1.4461, "step": 676 }, { "epoch": 3.1691047396138092, "grad_norm": 0.5699884295463562, "learning_rate": 8.622780005459052e-05, "loss": 1.4359, "step": 677 }, { "epoch": 3.173785839672323, "grad_norm": 0.5235402584075928, "learning_rate": 8.617127684345445e-05, "loss": 1.4417, "step": 678 }, { "epoch": 3.178466939730837, "grad_norm": 0.5453900098800659, "learning_rate": 8.61146564875847e-05, "loss": 1.458, "step": 679 }, { "epoch": 3.1831480397893506, "grad_norm": 0.5414010286331177, "learning_rate": 8.60579391390458e-05, "loss": 1.4408, "step": 680 }, { "epoch": 3.187829139847864, "grad_norm": 0.5347760915756226, "learning_rate": 8.600112495016288e-05, "loss": 1.4272, "step": 681 }, { "epoch": 3.192510239906378, "grad_norm": 0.526759147644043, "learning_rate": 8.5944214073521e-05, "loss": 1.431, "step": 682 }, { "epoch": 3.197191339964892, "grad_norm": 0.540042519569397, "learning_rate": 8.588720666196499e-05, "loss": 1.4328, "step": 683 }, { "epoch": 3.2018724400234055, "grad_norm": 0.5147102475166321, "learning_rate": 8.583010286859896e-05, "loss": 1.4331, "step": 684 }, { "epoch": 3.206553540081919, "grad_norm": 0.5127415060997009, "learning_rate": 8.577290284678578e-05, "loss": 1.4137, "step": 685 }, { "epoch": 3.211234640140433, "grad_norm": 0.5615378618240356, "learning_rate": 8.571560675014687e-05, "loss": 1.4393, "step": 686 }, { "epoch": 3.215915740198947, "grad_norm": 0.5693033337593079, "learning_rate": 8.565821473256156e-05, "loss": 1.4372, "step": 687 }, { "epoch": 3.2205968402574605, "grad_norm": 0.5161866545677185, "learning_rate": 8.560072694816693e-05, "loss": 1.4578, "step": 688 }, { "epoch": 3.225277940315974, "grad_norm": 0.5499429106712341, "learning_rate": 8.554314355135711e-05, "loss": 1.4432, "step": 689 }, { "epoch": 3.2299590403744878, "grad_norm": 0.5418663620948792, "learning_rate": 8.548546469678311e-05, "loss": 1.4387, "step": 690 }, { "epoch": 3.234640140433002, "grad_norm": 0.5338912010192871, "learning_rate": 8.542769053935232e-05, "loss": 1.4345, "step": 691 }, { "epoch": 3.2393212404915155, "grad_norm": 0.5374161601066589, "learning_rate": 8.536982123422803e-05, "loss": 1.4377, "step": 692 }, { "epoch": 3.244002340550029, "grad_norm": 0.5633624792098999, "learning_rate": 8.531185693682908e-05, "loss": 1.4386, "step": 693 }, { "epoch": 3.248683440608543, "grad_norm": 0.5362398624420166, "learning_rate": 8.525379780282947e-05, "loss": 1.4442, "step": 694 }, { "epoch": 3.253364540667057, "grad_norm": 0.5384619832038879, "learning_rate": 8.519564398815783e-05, "loss": 1.4351, "step": 695 }, { "epoch": 3.2580456407255705, "grad_norm": 0.5073276162147522, "learning_rate": 8.513739564899715e-05, "loss": 1.4234, "step": 696 }, { "epoch": 3.262726740784084, "grad_norm": 0.5378078818321228, "learning_rate": 8.507905294178422e-05, "loss": 1.4497, "step": 697 }, { "epoch": 3.267407840842598, "grad_norm": 0.5337597727775574, "learning_rate": 8.502061602320933e-05, "loss": 1.4409, "step": 698 }, { "epoch": 3.272088940901112, "grad_norm": 0.5651869177818298, "learning_rate": 8.496208505021571e-05, "loss": 1.4611, "step": 699 }, { "epoch": 3.2767700409596254, "grad_norm": 0.5655667185783386, "learning_rate": 8.490346017999929e-05, "loss": 1.4335, "step": 700 }, { "epoch": 3.281451141018139, "grad_norm": 0.548124372959137, "learning_rate": 8.484474157000812e-05, "loss": 1.45, "step": 701 }, { "epoch": 3.286132241076653, "grad_norm": 0.5557027459144592, "learning_rate": 8.4785929377942e-05, "loss": 1.459, "step": 702 }, { "epoch": 3.2908133411351668, "grad_norm": 0.5695424675941467, "learning_rate": 8.47270237617521e-05, "loss": 1.4587, "step": 703 }, { "epoch": 3.2954944411936804, "grad_norm": 0.5496786832809448, "learning_rate": 8.466802487964047e-05, "loss": 1.4273, "step": 704 }, { "epoch": 3.3001755412521945, "grad_norm": 0.5440911054611206, "learning_rate": 8.460893289005965e-05, "loss": 1.4467, "step": 705 }, { "epoch": 3.304856641310708, "grad_norm": 0.5482668876647949, "learning_rate": 8.454974795171222e-05, "loss": 1.4618, "step": 706 }, { "epoch": 3.3095377413692217, "grad_norm": 0.562824010848999, "learning_rate": 8.449047022355043e-05, "loss": 1.4451, "step": 707 }, { "epoch": 3.3142188414277354, "grad_norm": 0.5314816832542419, "learning_rate": 8.443109986477573e-05, "loss": 1.4324, "step": 708 }, { "epoch": 3.3188999414862494, "grad_norm": 0.5433753728866577, "learning_rate": 8.437163703483827e-05, "loss": 1.4271, "step": 709 }, { "epoch": 3.323581041544763, "grad_norm": 0.5342699885368347, "learning_rate": 8.43120818934367e-05, "loss": 1.447, "step": 710 }, { "epoch": 3.3282621416032767, "grad_norm": 0.5455060005187988, "learning_rate": 8.425243460051739e-05, "loss": 1.4147, "step": 711 }, { "epoch": 3.3329432416617903, "grad_norm": 0.5278387665748596, "learning_rate": 8.419269531627439e-05, "loss": 1.4347, "step": 712 }, { "epoch": 3.3376243417203044, "grad_norm": 0.5744187235832214, "learning_rate": 8.41328642011487e-05, "loss": 1.4277, "step": 713 }, { "epoch": 3.342305441778818, "grad_norm": 0.5402573347091675, "learning_rate": 8.407294141582797e-05, "loss": 1.4347, "step": 714 }, { "epoch": 3.3469865418373317, "grad_norm": 0.5673272609710693, "learning_rate": 8.401292712124609e-05, "loss": 1.4444, "step": 715 }, { "epoch": 3.3516676418958458, "grad_norm": 0.5388854742050171, "learning_rate": 8.395282147858264e-05, "loss": 1.4518, "step": 716 }, { "epoch": 3.3563487419543594, "grad_norm": 0.5453377962112427, "learning_rate": 8.389262464926256e-05, "loss": 1.4257, "step": 717 }, { "epoch": 3.361029842012873, "grad_norm": 0.5486723184585571, "learning_rate": 8.383233679495577e-05, "loss": 1.4255, "step": 718 }, { "epoch": 3.3657109420713867, "grad_norm": 0.5446637272834778, "learning_rate": 8.377195807757652e-05, "loss": 1.4526, "step": 719 }, { "epoch": 3.3703920421299003, "grad_norm": 0.5255379676818848, "learning_rate": 8.371148865928319e-05, "loss": 1.4246, "step": 720 }, { "epoch": 3.3750731421884144, "grad_norm": 0.5421748757362366, "learning_rate": 8.36509287024777e-05, "loss": 1.4381, "step": 721 }, { "epoch": 3.379754242246928, "grad_norm": 0.5305628180503845, "learning_rate": 8.359027836980517e-05, "loss": 1.4356, "step": 722 }, { "epoch": 3.3844353423054416, "grad_norm": 0.5197787880897522, "learning_rate": 8.352953782415341e-05, "loss": 1.4422, "step": 723 }, { "epoch": 3.3891164423639557, "grad_norm": 0.521529495716095, "learning_rate": 8.346870722865251e-05, "loss": 1.429, "step": 724 }, { "epoch": 3.3937975424224693, "grad_norm": 0.5614930987358093, "learning_rate": 8.340778674667444e-05, "loss": 1.4318, "step": 725 }, { "epoch": 3.398478642480983, "grad_norm": 0.5214779376983643, "learning_rate": 8.334677654183254e-05, "loss": 1.3981, "step": 726 }, { "epoch": 3.4031597425394966, "grad_norm": 0.5232903957366943, "learning_rate": 8.328567677798115e-05, "loss": 1.4362, "step": 727 }, { "epoch": 3.4078408425980107, "grad_norm": 0.5370655655860901, "learning_rate": 8.322448761921511e-05, "loss": 1.4426, "step": 728 }, { "epoch": 3.4125219426565243, "grad_norm": 0.5535497069358826, "learning_rate": 8.316320922986935e-05, "loss": 1.4286, "step": 729 }, { "epoch": 3.417203042715038, "grad_norm": 0.5056213736534119, "learning_rate": 8.310184177451847e-05, "loss": 1.4402, "step": 730 }, { "epoch": 3.4218841427735516, "grad_norm": 0.5447074174880981, "learning_rate": 8.304038541797625e-05, "loss": 1.4167, "step": 731 }, { "epoch": 3.4265652428320656, "grad_norm": 0.5250815749168396, "learning_rate": 8.297884032529523e-05, "loss": 1.414, "step": 732 }, { "epoch": 3.4312463428905793, "grad_norm": 0.5571607351303101, "learning_rate": 8.29172066617663e-05, "loss": 1.4144, "step": 733 }, { "epoch": 3.435927442949093, "grad_norm": 0.551026463508606, "learning_rate": 8.285548459291817e-05, "loss": 1.4486, "step": 734 }, { "epoch": 3.440608543007607, "grad_norm": 0.5348817110061646, "learning_rate": 8.279367428451702e-05, "loss": 1.4424, "step": 735 }, { "epoch": 3.4452896430661206, "grad_norm": 0.5166352987289429, "learning_rate": 8.2731775902566e-05, "loss": 1.4447, "step": 736 }, { "epoch": 3.4499707431246343, "grad_norm": 0.5199401378631592, "learning_rate": 8.266978961330479e-05, "loss": 1.4445, "step": 737 }, { "epoch": 3.454651843183148, "grad_norm": 0.5190863609313965, "learning_rate": 8.260771558320919e-05, "loss": 1.437, "step": 738 }, { "epoch": 3.459332943241662, "grad_norm": 0.5204188227653503, "learning_rate": 8.254555397899061e-05, "loss": 1.4279, "step": 739 }, { "epoch": 3.4640140433001756, "grad_norm": 0.5144184827804565, "learning_rate": 8.248330496759568e-05, "loss": 1.4376, "step": 740 }, { "epoch": 3.4686951433586892, "grad_norm": 0.5174996256828308, "learning_rate": 8.242096871620578e-05, "loss": 1.4067, "step": 741 }, { "epoch": 3.473376243417203, "grad_norm": 0.5053052306175232, "learning_rate": 8.235854539223654e-05, "loss": 1.4425, "step": 742 }, { "epoch": 3.478057343475717, "grad_norm": 0.5285958051681519, "learning_rate": 8.229603516333753e-05, "loss": 1.4215, "step": 743 }, { "epoch": 3.4827384435342306, "grad_norm": 0.5122373104095459, "learning_rate": 8.223343819739164e-05, "loss": 1.4413, "step": 744 }, { "epoch": 3.487419543592744, "grad_norm": 0.5227556228637695, "learning_rate": 8.217075466251477e-05, "loss": 1.4256, "step": 745 }, { "epoch": 3.4921006436512583, "grad_norm": 0.5254419445991516, "learning_rate": 8.210798472705522e-05, "loss": 1.4248, "step": 746 }, { "epoch": 3.496781743709772, "grad_norm": 0.5361796617507935, "learning_rate": 8.204512855959345e-05, "loss": 1.4356, "step": 747 }, { "epoch": 3.5014628437682855, "grad_norm": 0.5470665693283081, "learning_rate": 8.198218632894145e-05, "loss": 1.4229, "step": 748 }, { "epoch": 3.506143943826799, "grad_norm": 0.5319539904594421, "learning_rate": 8.191915820414237e-05, "loss": 1.4288, "step": 749 }, { "epoch": 3.510825043885313, "grad_norm": 0.5314258337020874, "learning_rate": 8.185604435447002e-05, "loss": 1.4321, "step": 750 }, { "epoch": 3.515506143943827, "grad_norm": 0.55251544713974, "learning_rate": 8.179284494942846e-05, "loss": 1.435, "step": 751 }, { "epoch": 3.5201872440023405, "grad_norm": 0.550113320350647, "learning_rate": 8.172956015875151e-05, "loss": 1.4424, "step": 752 }, { "epoch": 3.524868344060854, "grad_norm": 0.5319443345069885, "learning_rate": 8.166619015240236e-05, "loss": 1.4439, "step": 753 }, { "epoch": 3.529549444119368, "grad_norm": 0.5305471420288086, "learning_rate": 8.160273510057296e-05, "loss": 1.4407, "step": 754 }, { "epoch": 3.534230544177882, "grad_norm": 0.5263088345527649, "learning_rate": 8.15391951736838e-05, "loss": 1.4359, "step": 755 }, { "epoch": 3.5389116442363955, "grad_norm": 0.5275413990020752, "learning_rate": 8.147557054238317e-05, "loss": 1.4003, "step": 756 }, { "epoch": 3.5435927442949096, "grad_norm": 0.5371294617652893, "learning_rate": 8.141186137754698e-05, "loss": 1.4161, "step": 757 }, { "epoch": 3.548273844353423, "grad_norm": 0.4912378191947937, "learning_rate": 8.13480678502781e-05, "loss": 1.4107, "step": 758 }, { "epoch": 3.552954944411937, "grad_norm": 0.5293853878974915, "learning_rate": 8.128419013190598e-05, "loss": 1.3992, "step": 759 }, { "epoch": 3.5576360444704505, "grad_norm": 0.5455768704414368, "learning_rate": 8.12202283939862e-05, "loss": 1.4105, "step": 760 }, { "epoch": 3.562317144528964, "grad_norm": 0.5033174157142639, "learning_rate": 8.115618280829997e-05, "loss": 1.4323, "step": 761 }, { "epoch": 3.566998244587478, "grad_norm": 0.5291778445243835, "learning_rate": 8.109205354685368e-05, "loss": 1.4055, "step": 762 }, { "epoch": 3.571679344645992, "grad_norm": 0.5420578718185425, "learning_rate": 8.102784078187851e-05, "loss": 1.4305, "step": 763 }, { "epoch": 3.5763604447045054, "grad_norm": 0.5107989311218262, "learning_rate": 8.096354468582981e-05, "loss": 1.4081, "step": 764 }, { "epoch": 3.5810415447630195, "grad_norm": 0.5385276675224304, "learning_rate": 8.089916543138681e-05, "loss": 1.4376, "step": 765 }, { "epoch": 3.585722644821533, "grad_norm": 0.5747690796852112, "learning_rate": 8.083470319145203e-05, "loss": 1.4271, "step": 766 }, { "epoch": 3.5904037448800468, "grad_norm": 0.5204617977142334, "learning_rate": 8.077015813915088e-05, "loss": 1.4334, "step": 767 }, { "epoch": 3.5950848449385604, "grad_norm": 0.560840904712677, "learning_rate": 8.070553044783119e-05, "loss": 1.4072, "step": 768 }, { "epoch": 3.5997659449970745, "grad_norm": 0.5681152939796448, "learning_rate": 8.064082029106271e-05, "loss": 1.4163, "step": 769 }, { "epoch": 3.604447045055588, "grad_norm": 0.5297346711158752, "learning_rate": 8.05760278426367e-05, "loss": 1.4132, "step": 770 }, { "epoch": 3.6091281451141017, "grad_norm": 0.5584424734115601, "learning_rate": 8.051115327656538e-05, "loss": 1.4256, "step": 771 }, { "epoch": 3.6138092451726154, "grad_norm": 0.5776448249816895, "learning_rate": 8.044619676708153e-05, "loss": 1.437, "step": 772 }, { "epoch": 3.6184903452311294, "grad_norm": 0.523693859577179, "learning_rate": 8.038115848863805e-05, "loss": 1.4335, "step": 773 }, { "epoch": 3.623171445289643, "grad_norm": 0.52964186668396, "learning_rate": 8.031603861590737e-05, "loss": 1.4266, "step": 774 }, { "epoch": 3.6278525453481567, "grad_norm": 0.5207391977310181, "learning_rate": 8.025083732378114e-05, "loss": 1.4182, "step": 775 }, { "epoch": 3.632533645406671, "grad_norm": 0.5130608677864075, "learning_rate": 8.01855547873696e-05, "loss": 1.4123, "step": 776 }, { "epoch": 3.6372147454651844, "grad_norm": 0.5629760026931763, "learning_rate": 8.012019118200123e-05, "loss": 1.4324, "step": 777 }, { "epoch": 3.641895845523698, "grad_norm": 0.5352134108543396, "learning_rate": 8.005474668322221e-05, "loss": 1.4471, "step": 778 }, { "epoch": 3.6465769455822117, "grad_norm": 0.5482569932937622, "learning_rate": 7.998922146679602e-05, "loss": 1.4192, "step": 779 }, { "epoch": 3.6512580456407253, "grad_norm": 0.5770877599716187, "learning_rate": 7.992361570870288e-05, "loss": 1.4312, "step": 780 }, { "epoch": 3.6559391456992394, "grad_norm": 0.540606677532196, "learning_rate": 7.985792958513931e-05, "loss": 1.441, "step": 781 }, { "epoch": 3.660620245757753, "grad_norm": 0.5749906301498413, "learning_rate": 7.979216327251772e-05, "loss": 1.4313, "step": 782 }, { "epoch": 3.6653013458162667, "grad_norm": 0.5137084126472473, "learning_rate": 7.972631694746583e-05, "loss": 1.4219, "step": 783 }, { "epoch": 3.6699824458747807, "grad_norm": 0.5164451003074646, "learning_rate": 7.966039078682627e-05, "loss": 1.4088, "step": 784 }, { "epoch": 3.6746635459332944, "grad_norm": 0.5294907093048096, "learning_rate": 7.959438496765611e-05, "loss": 1.4178, "step": 785 }, { "epoch": 3.679344645991808, "grad_norm": 0.5210172533988953, "learning_rate": 7.95282996672263e-05, "loss": 1.4223, "step": 786 }, { "epoch": 3.684025746050322, "grad_norm": 0.5051605105400085, "learning_rate": 7.946213506302129e-05, "loss": 1.4262, "step": 787 }, { "epoch": 3.6887068461088357, "grad_norm": 0.52712082862854, "learning_rate": 7.939589133273853e-05, "loss": 1.4094, "step": 788 }, { "epoch": 3.6933879461673493, "grad_norm": 0.5296499133110046, "learning_rate": 7.932956865428791e-05, "loss": 1.4226, "step": 789 }, { "epoch": 3.698069046225863, "grad_norm": 0.5116936564445496, "learning_rate": 7.926316720579144e-05, "loss": 1.4022, "step": 790 }, { "epoch": 3.7027501462843766, "grad_norm": 0.5399641394615173, "learning_rate": 7.919668716558262e-05, "loss": 1.423, "step": 791 }, { "epoch": 3.7074312463428907, "grad_norm": 0.5201460123062134, "learning_rate": 7.913012871220604e-05, "loss": 1.4254, "step": 792 }, { "epoch": 3.7121123464014043, "grad_norm": 0.5204175114631653, "learning_rate": 7.906349202441687e-05, "loss": 1.4126, "step": 793 }, { "epoch": 3.716793446459918, "grad_norm": 0.5481407046318054, "learning_rate": 7.899677728118043e-05, "loss": 1.418, "step": 794 }, { "epoch": 3.721474546518432, "grad_norm": 0.5074529051780701, "learning_rate": 7.892998466167165e-05, "loss": 1.3903, "step": 795 }, { "epoch": 3.7261556465769456, "grad_norm": 0.5236367583274841, "learning_rate": 7.886311434527459e-05, "loss": 1.4446, "step": 796 }, { "epoch": 3.7308367466354593, "grad_norm": 0.5233679413795471, "learning_rate": 7.879616651158201e-05, "loss": 1.3958, "step": 797 }, { "epoch": 3.7355178466939734, "grad_norm": 0.5118744969367981, "learning_rate": 7.872914134039484e-05, "loss": 1.4242, "step": 798 }, { "epoch": 3.740198946752487, "grad_norm": 0.5215593576431274, "learning_rate": 7.866203901172175e-05, "loss": 1.4231, "step": 799 }, { "epoch": 3.7448800468110006, "grad_norm": 0.5329627990722656, "learning_rate": 7.859485970577856e-05, "loss": 1.431, "step": 800 }, { "epoch": 3.7495611468695142, "grad_norm": 0.5146481990814209, "learning_rate": 7.85276036029879e-05, "loss": 1.392, "step": 801 }, { "epoch": 3.754242246928028, "grad_norm": 0.5273050665855408, "learning_rate": 7.846027088397862e-05, "loss": 1.4377, "step": 802 }, { "epoch": 3.758923346986542, "grad_norm": 0.5356155633926392, "learning_rate": 7.83928617295853e-05, "loss": 1.4286, "step": 803 }, { "epoch": 3.7636044470450556, "grad_norm": 0.5126620531082153, "learning_rate": 7.83253763208479e-05, "loss": 1.4101, "step": 804 }, { "epoch": 3.768285547103569, "grad_norm": 0.5486011505126953, "learning_rate": 7.825781483901109e-05, "loss": 1.4076, "step": 805 }, { "epoch": 3.7729666471620833, "grad_norm": 0.5352131128311157, "learning_rate": 7.819017746552387e-05, "loss": 1.3961, "step": 806 }, { "epoch": 3.777647747220597, "grad_norm": 0.5462915301322937, "learning_rate": 7.812246438203904e-05, "loss": 1.4247, "step": 807 }, { "epoch": 3.7823288472791106, "grad_norm": 0.5541925430297852, "learning_rate": 7.80546757704128e-05, "loss": 1.4295, "step": 808 }, { "epoch": 3.787009947337624, "grad_norm": 0.5532597303390503, "learning_rate": 7.798681181270413e-05, "loss": 1.413, "step": 809 }, { "epoch": 3.791691047396138, "grad_norm": 0.5388747453689575, "learning_rate": 7.791887269117442e-05, "loss": 1.4304, "step": 810 }, { "epoch": 3.796372147454652, "grad_norm": 0.5408781170845032, "learning_rate": 7.785085858828685e-05, "loss": 1.4126, "step": 811 }, { "epoch": 3.8010532475131655, "grad_norm": 0.5439730882644653, "learning_rate": 7.778276968670606e-05, "loss": 1.4035, "step": 812 }, { "epoch": 3.805734347571679, "grad_norm": 0.5271381735801697, "learning_rate": 7.77146061692975e-05, "loss": 1.4185, "step": 813 }, { "epoch": 3.8104154476301932, "grad_norm": 0.5220599174499512, "learning_rate": 7.764636821912711e-05, "loss": 1.3992, "step": 814 }, { "epoch": 3.815096547688707, "grad_norm": 0.5409356951713562, "learning_rate": 7.757805601946064e-05, "loss": 1.4282, "step": 815 }, { "epoch": 3.8197776477472205, "grad_norm": 0.5270222425460815, "learning_rate": 7.750966975376328e-05, "loss": 1.43, "step": 816 }, { "epoch": 3.8244587478057346, "grad_norm": 0.5513463616371155, "learning_rate": 7.744120960569918e-05, "loss": 1.4232, "step": 817 }, { "epoch": 3.829139847864248, "grad_norm": 0.5306671857833862, "learning_rate": 7.737267575913083e-05, "loss": 1.4234, "step": 818 }, { "epoch": 3.833820947922762, "grad_norm": 0.519351601600647, "learning_rate": 7.730406839811874e-05, "loss": 1.4073, "step": 819 }, { "epoch": 3.8385020479812755, "grad_norm": 0.5287462472915649, "learning_rate": 7.723538770692081e-05, "loss": 1.4187, "step": 820 }, { "epoch": 3.843183148039789, "grad_norm": 0.5309276580810547, "learning_rate": 7.71666338699919e-05, "loss": 1.4321, "step": 821 }, { "epoch": 3.847864248098303, "grad_norm": 0.5215145945549011, "learning_rate": 7.709780707198328e-05, "loss": 1.4212, "step": 822 }, { "epoch": 3.852545348156817, "grad_norm": 0.5737372636795044, "learning_rate": 7.70289074977422e-05, "loss": 1.4051, "step": 823 }, { "epoch": 3.8572264482153304, "grad_norm": 0.5519363880157471, "learning_rate": 7.69599353323114e-05, "loss": 1.4322, "step": 824 }, { "epoch": 3.8619075482738445, "grad_norm": 0.5450279712677002, "learning_rate": 7.68908907609285e-05, "loss": 1.4188, "step": 825 }, { "epoch": 3.866588648332358, "grad_norm": 0.6025644540786743, "learning_rate": 7.682177396902566e-05, "loss": 1.4205, "step": 826 }, { "epoch": 3.871269748390872, "grad_norm": 0.5163719654083252, "learning_rate": 7.67525851422289e-05, "loss": 1.43, "step": 827 }, { "epoch": 3.875950848449386, "grad_norm": 0.5643473863601685, "learning_rate": 7.668332446635779e-05, "loss": 1.4233, "step": 828 }, { "epoch": 3.8806319485078995, "grad_norm": 0.5531340837478638, "learning_rate": 7.661399212742487e-05, "loss": 1.4196, "step": 829 }, { "epoch": 3.885313048566413, "grad_norm": 0.5392515063285828, "learning_rate": 7.654458831163506e-05, "loss": 1.4164, "step": 830 }, { "epoch": 3.8899941486249268, "grad_norm": 0.5888575911521912, "learning_rate": 7.647511320538536e-05, "loss": 1.419, "step": 831 }, { "epoch": 3.8946752486834404, "grad_norm": 0.542989194393158, "learning_rate": 7.64055669952641e-05, "loss": 1.3953, "step": 832 }, { "epoch": 3.8993563487419545, "grad_norm": 0.5783752202987671, "learning_rate": 7.633594986805069e-05, "loss": 1.3971, "step": 833 }, { "epoch": 3.904037448800468, "grad_norm": 0.5455073118209839, "learning_rate": 7.626626201071494e-05, "loss": 1.4088, "step": 834 }, { "epoch": 3.9087185488589817, "grad_norm": 0.6045266389846802, "learning_rate": 7.619650361041663e-05, "loss": 1.4383, "step": 835 }, { "epoch": 3.913399648917496, "grad_norm": 0.5085153579711914, "learning_rate": 7.612667485450502e-05, "loss": 1.4218, "step": 836 }, { "epoch": 3.9180807489760094, "grad_norm": 0.5967952013015747, "learning_rate": 7.605677593051827e-05, "loss": 1.4266, "step": 837 }, { "epoch": 3.922761849034523, "grad_norm": 0.560104250907898, "learning_rate": 7.598680702618304e-05, "loss": 1.4135, "step": 838 }, { "epoch": 3.9274429490930367, "grad_norm": 0.5937385559082031, "learning_rate": 7.591676832941394e-05, "loss": 1.4419, "step": 839 }, { "epoch": 3.9321240491515503, "grad_norm": 0.616173505783081, "learning_rate": 7.584666002831296e-05, "loss": 1.4358, "step": 840 }, { "epoch": 3.9368051492100644, "grad_norm": 0.5356134176254272, "learning_rate": 7.577648231116907e-05, "loss": 1.3979, "step": 841 }, { "epoch": 3.941486249268578, "grad_norm": 0.5758895874023438, "learning_rate": 7.570623536645768e-05, "loss": 1.4052, "step": 842 }, { "epoch": 3.9461673493270917, "grad_norm": 0.5662379860877991, "learning_rate": 7.563591938284011e-05, "loss": 1.418, "step": 843 }, { "epoch": 3.9508484493856058, "grad_norm": 0.5756869912147522, "learning_rate": 7.556553454916308e-05, "loss": 1.3976, "step": 844 }, { "epoch": 3.9555295494441194, "grad_norm": 0.5514070391654968, "learning_rate": 7.549508105445825e-05, "loss": 1.422, "step": 845 }, { "epoch": 3.960210649502633, "grad_norm": 0.5394649505615234, "learning_rate": 7.542455908794163e-05, "loss": 1.407, "step": 846 }, { "epoch": 3.964891749561147, "grad_norm": 0.5769184827804565, "learning_rate": 7.535396883901322e-05, "loss": 1.415, "step": 847 }, { "epoch": 3.9695728496196607, "grad_norm": 0.5303329229354858, "learning_rate": 7.528331049725627e-05, "loss": 1.4326, "step": 848 }, { "epoch": 3.9742539496781744, "grad_norm": 0.5458217859268188, "learning_rate": 7.521258425243705e-05, "loss": 1.4167, "step": 849 }, { "epoch": 3.978935049736688, "grad_norm": 0.5328347086906433, "learning_rate": 7.514179029450409e-05, "loss": 1.4191, "step": 850 }, { "epoch": 3.9836161497952016, "grad_norm": 0.525581955909729, "learning_rate": 7.507092881358782e-05, "loss": 1.4093, "step": 851 }, { "epoch": 3.9882972498537157, "grad_norm": 0.5150876641273499, "learning_rate": 7.500000000000001e-05, "loss": 1.3958, "step": 852 }, { "epoch": 3.9929783499122293, "grad_norm": 0.5269944071769714, "learning_rate": 7.492900404423326e-05, "loss": 1.4066, "step": 853 }, { "epoch": 3.997659449970743, "grad_norm": 0.5274633765220642, "learning_rate": 7.48579411369605e-05, "loss": 1.4296, "step": 854 }, { "epoch": 4.002340550029257, "grad_norm": 2.507246971130371, "learning_rate": 7.478681146903448e-05, "loss": 1.4841, "step": 855 }, { "epoch": 4.00702165008777, "grad_norm": 0.5649014115333557, "learning_rate": 7.471561523148722e-05, "loss": 1.4352, "step": 856 }, { "epoch": 4.011702750146284, "grad_norm": 0.5550701022148132, "learning_rate": 7.464435261552956e-05, "loss": 1.4092, "step": 857 }, { "epoch": 4.016383850204798, "grad_norm": 0.5513316988945007, "learning_rate": 7.457302381255054e-05, "loss": 1.4058, "step": 858 }, { "epoch": 4.021064950263312, "grad_norm": 0.5378918647766113, "learning_rate": 7.450162901411708e-05, "loss": 1.4232, "step": 859 }, { "epoch": 4.025746050321826, "grad_norm": 0.5452824234962463, "learning_rate": 7.443016841197323e-05, "loss": 1.4016, "step": 860 }, { "epoch": 4.03042715038034, "grad_norm": 0.5211923122406006, "learning_rate": 7.435864219803983e-05, "loss": 1.4091, "step": 861 }, { "epoch": 4.035108250438853, "grad_norm": 0.5528005361557007, "learning_rate": 7.42870505644139e-05, "loss": 1.409, "step": 862 }, { "epoch": 4.039789350497367, "grad_norm": 0.536733090877533, "learning_rate": 7.421539370336819e-05, "loss": 1.4136, "step": 863 }, { "epoch": 4.044470450555881, "grad_norm": 0.5282482504844666, "learning_rate": 7.414367180735058e-05, "loss": 1.3946, "step": 864 }, { "epoch": 4.049151550614394, "grad_norm": 0.5195043087005615, "learning_rate": 7.407188506898367e-05, "loss": 1.4206, "step": 865 }, { "epoch": 4.053832650672908, "grad_norm": 0.5188748240470886, "learning_rate": 7.400003368106419e-05, "loss": 1.4144, "step": 866 }, { "epoch": 4.0585137507314215, "grad_norm": 0.5505629181861877, "learning_rate": 7.392811783656246e-05, "loss": 1.4355, "step": 867 }, { "epoch": 4.063194850789936, "grad_norm": 0.5169565677642822, "learning_rate": 7.385613772862197e-05, "loss": 1.4153, "step": 868 }, { "epoch": 4.06787595084845, "grad_norm": 0.5217868685722351, "learning_rate": 7.378409355055874e-05, "loss": 1.4018, "step": 869 }, { "epoch": 4.072557050906963, "grad_norm": 0.5039651393890381, "learning_rate": 7.371198549586091e-05, "loss": 1.3759, "step": 870 }, { "epoch": 4.077238150965477, "grad_norm": 0.5173787474632263, "learning_rate": 7.363981375818817e-05, "loss": 1.4033, "step": 871 }, { "epoch": 4.081919251023991, "grad_norm": 0.5146439075469971, "learning_rate": 7.35675785313712e-05, "loss": 1.4211, "step": 872 }, { "epoch": 4.086600351082504, "grad_norm": 0.5264257192611694, "learning_rate": 7.349528000941126e-05, "loss": 1.3901, "step": 873 }, { "epoch": 4.091281451141018, "grad_norm": 0.5119170546531677, "learning_rate": 7.342291838647952e-05, "loss": 1.4119, "step": 874 }, { "epoch": 4.0959625511995315, "grad_norm": 0.5248039960861206, "learning_rate": 7.335049385691669e-05, "loss": 1.3874, "step": 875 }, { "epoch": 4.1006436512580455, "grad_norm": 0.5059455633163452, "learning_rate": 7.327800661523238e-05, "loss": 1.4059, "step": 876 }, { "epoch": 4.10532475131656, "grad_norm": 0.5265378952026367, "learning_rate": 7.320545685610467e-05, "loss": 1.4202, "step": 877 }, { "epoch": 4.110005851375073, "grad_norm": 0.5115812420845032, "learning_rate": 7.31328447743795e-05, "loss": 1.4021, "step": 878 }, { "epoch": 4.114686951433587, "grad_norm": 0.5200651288032532, "learning_rate": 7.306017056507017e-05, "loss": 1.4136, "step": 879 }, { "epoch": 4.119368051492101, "grad_norm": 0.5201098322868347, "learning_rate": 7.298743442335693e-05, "loss": 1.4063, "step": 880 }, { "epoch": 4.124049151550614, "grad_norm": 0.5152908563613892, "learning_rate": 7.291463654458629e-05, "loss": 1.4263, "step": 881 }, { "epoch": 4.128730251609128, "grad_norm": 0.5363911390304565, "learning_rate": 7.284177712427056e-05, "loss": 1.4095, "step": 882 }, { "epoch": 4.133411351667642, "grad_norm": 0.5588401556015015, "learning_rate": 7.276885635808734e-05, "loss": 1.3952, "step": 883 }, { "epoch": 4.1380924517261555, "grad_norm": 0.50605708360672, "learning_rate": 7.269587444187902e-05, "loss": 1.3897, "step": 884 }, { "epoch": 4.1427735517846696, "grad_norm": 0.5475851893424988, "learning_rate": 7.262283157165219e-05, "loss": 1.3952, "step": 885 }, { "epoch": 4.147454651843183, "grad_norm": 0.5669999122619629, "learning_rate": 7.254972794357713e-05, "loss": 1.4031, "step": 886 }, { "epoch": 4.152135751901697, "grad_norm": 0.5178671479225159, "learning_rate": 7.247656375398734e-05, "loss": 1.3768, "step": 887 }, { "epoch": 4.156816851960211, "grad_norm": 0.5260223746299744, "learning_rate": 7.240333919937893e-05, "loss": 1.4019, "step": 888 }, { "epoch": 4.161497952018724, "grad_norm": 0.5484073162078857, "learning_rate": 7.233005447641015e-05, "loss": 1.3978, "step": 889 }, { "epoch": 4.166179052077238, "grad_norm": 0.5305057764053345, "learning_rate": 7.225670978190085e-05, "loss": 1.4178, "step": 890 }, { "epoch": 4.170860152135752, "grad_norm": 0.5260404348373413, "learning_rate": 7.21833053128319e-05, "loss": 1.4109, "step": 891 }, { "epoch": 4.175541252194265, "grad_norm": 0.5581408739089966, "learning_rate": 7.210984126634476e-05, "loss": 1.4049, "step": 892 }, { "epoch": 4.1802223522527795, "grad_norm": 0.5089554786682129, "learning_rate": 7.20363178397409e-05, "loss": 1.3986, "step": 893 }, { "epoch": 4.184903452311294, "grad_norm": 0.5193794965744019, "learning_rate": 7.19627352304812e-05, "loss": 1.4088, "step": 894 }, { "epoch": 4.189584552369807, "grad_norm": 0.5327683091163635, "learning_rate": 7.188909363618557e-05, "loss": 1.4, "step": 895 }, { "epoch": 4.194265652428321, "grad_norm": 0.5190610885620117, "learning_rate": 7.181539325463227e-05, "loss": 1.4081, "step": 896 }, { "epoch": 4.198946752486834, "grad_norm": 0.5354302525520325, "learning_rate": 7.174163428375748e-05, "loss": 1.4022, "step": 897 }, { "epoch": 4.203627852545348, "grad_norm": 0.5408794283866882, "learning_rate": 7.16678169216547e-05, "loss": 1.4132, "step": 898 }, { "epoch": 4.208308952603862, "grad_norm": 0.5260702967643738, "learning_rate": 7.159394136657427e-05, "loss": 1.4131, "step": 899 }, { "epoch": 4.212990052662375, "grad_norm": 0.546550452709198, "learning_rate": 7.152000781692286e-05, "loss": 1.3992, "step": 900 }, { "epoch": 4.217671152720889, "grad_norm": 0.516927182674408, "learning_rate": 7.144601647126281e-05, "loss": 1.3858, "step": 901 }, { "epoch": 4.2223522527794035, "grad_norm": 0.5527111887931824, "learning_rate": 7.137196752831177e-05, "loss": 1.3951, "step": 902 }, { "epoch": 4.227033352837917, "grad_norm": 0.5520824790000916, "learning_rate": 7.1297861186942e-05, "loss": 1.3779, "step": 903 }, { "epoch": 4.231714452896431, "grad_norm": 0.5436320304870605, "learning_rate": 7.122369764617993e-05, "loss": 1.4142, "step": 904 }, { "epoch": 4.236395552954944, "grad_norm": 0.5542235374450684, "learning_rate": 7.114947710520569e-05, "loss": 1.4066, "step": 905 }, { "epoch": 4.241076653013458, "grad_norm": 0.5395337343215942, "learning_rate": 7.10751997633524e-05, "loss": 1.403, "step": 906 }, { "epoch": 4.245757753071972, "grad_norm": 0.5479816198348999, "learning_rate": 7.100086582010576e-05, "loss": 1.3946, "step": 907 }, { "epoch": 4.250438853130485, "grad_norm": 0.5550580620765686, "learning_rate": 7.092647547510351e-05, "loss": 1.3902, "step": 908 }, { "epoch": 4.255119953188999, "grad_norm": 0.5406416058540344, "learning_rate": 7.08520289281348e-05, "loss": 1.4019, "step": 909 }, { "epoch": 4.2598010532475135, "grad_norm": 0.5209553837776184, "learning_rate": 7.07775263791398e-05, "loss": 1.4078, "step": 910 }, { "epoch": 4.264482153306027, "grad_norm": 0.5705069303512573, "learning_rate": 7.070296802820907e-05, "loss": 1.3928, "step": 911 }, { "epoch": 4.269163253364541, "grad_norm": 0.5689321756362915, "learning_rate": 7.062835407558294e-05, "loss": 1.3885, "step": 912 }, { "epoch": 4.273844353423055, "grad_norm": 0.5145436525344849, "learning_rate": 7.055368472165123e-05, "loss": 1.4017, "step": 913 }, { "epoch": 4.278525453481568, "grad_norm": 0.5727246403694153, "learning_rate": 7.047896016695239e-05, "loss": 1.3992, "step": 914 }, { "epoch": 4.283206553540082, "grad_norm": 0.5670337677001953, "learning_rate": 7.040418061217325e-05, "loss": 1.4002, "step": 915 }, { "epoch": 4.287887653598595, "grad_norm": 0.5520856380462646, "learning_rate": 7.032934625814829e-05, "loss": 1.4064, "step": 916 }, { "epoch": 4.292568753657109, "grad_norm": 0.556670069694519, "learning_rate": 7.025445730585915e-05, "loss": 1.4072, "step": 917 }, { "epoch": 4.297249853715623, "grad_norm": 0.5678969025611877, "learning_rate": 7.017951395643417e-05, "loss": 1.424, "step": 918 }, { "epoch": 4.301930953774137, "grad_norm": 0.5321788787841797, "learning_rate": 7.010451641114768e-05, "loss": 1.3822, "step": 919 }, { "epoch": 4.306612053832651, "grad_norm": 0.5413317084312439, "learning_rate": 7.002946487141967e-05, "loss": 1.4016, "step": 920 }, { "epoch": 4.311293153891165, "grad_norm": 0.5802444219589233, "learning_rate": 6.995435953881509e-05, "loss": 1.4047, "step": 921 }, { "epoch": 4.315974253949678, "grad_norm": 0.5204755067825317, "learning_rate": 6.987920061504336e-05, "loss": 1.3945, "step": 922 }, { "epoch": 4.320655354008192, "grad_norm": 0.5474327802658081, "learning_rate": 6.980398830195785e-05, "loss": 1.3974, "step": 923 }, { "epoch": 4.325336454066706, "grad_norm": 0.5491933226585388, "learning_rate": 6.972872280155528e-05, "loss": 1.4182, "step": 924 }, { "epoch": 4.330017554125219, "grad_norm": 0.5143939256668091, "learning_rate": 6.965340431597525e-05, "loss": 1.4069, "step": 925 }, { "epoch": 4.334698654183733, "grad_norm": 0.5000368356704712, "learning_rate": 6.957803304749966e-05, "loss": 1.3999, "step": 926 }, { "epoch": 4.3393797542422465, "grad_norm": 0.5126700401306152, "learning_rate": 6.950260919855213e-05, "loss": 1.3851, "step": 927 }, { "epoch": 4.344060854300761, "grad_norm": 0.5102867484092712, "learning_rate": 6.942713297169755e-05, "loss": 1.3897, "step": 928 }, { "epoch": 4.348741954359275, "grad_norm": 0.5007814764976501, "learning_rate": 6.935160456964144e-05, "loss": 1.3965, "step": 929 }, { "epoch": 4.353423054417788, "grad_norm": 0.5208529829978943, "learning_rate": 6.927602419522947e-05, "loss": 1.3983, "step": 930 }, { "epoch": 4.358104154476302, "grad_norm": 0.5251065492630005, "learning_rate": 6.920039205144686e-05, "loss": 1.4218, "step": 931 }, { "epoch": 4.362785254534816, "grad_norm": 0.49510329961776733, "learning_rate": 6.912470834141791e-05, "loss": 1.3946, "step": 932 }, { "epoch": 4.367466354593329, "grad_norm": 0.5090747475624084, "learning_rate": 6.904897326840537e-05, "loss": 1.3917, "step": 933 }, { "epoch": 4.372147454651843, "grad_norm": 0.49734053015708923, "learning_rate": 6.897318703580995e-05, "loss": 1.3687, "step": 934 }, { "epoch": 4.376828554710357, "grad_norm": 0.5525650978088379, "learning_rate": 6.889734984716982e-05, "loss": 1.4092, "step": 935 }, { "epoch": 4.3815096547688706, "grad_norm": 0.5270445942878723, "learning_rate": 6.882146190615984e-05, "loss": 1.4065, "step": 936 }, { "epoch": 4.386190754827385, "grad_norm": 0.5141691565513611, "learning_rate": 6.874552341659137e-05, "loss": 1.4111, "step": 937 }, { "epoch": 4.390871854885898, "grad_norm": 0.5064695477485657, "learning_rate": 6.86695345824114e-05, "loss": 1.387, "step": 938 }, { "epoch": 4.395552954944412, "grad_norm": 0.5093940496444702, "learning_rate": 6.859349560770213e-05, "loss": 1.4198, "step": 939 }, { "epoch": 4.400234055002926, "grad_norm": 0.5096136927604675, "learning_rate": 6.851740669668055e-05, "loss": 1.3782, "step": 940 }, { "epoch": 4.404915155061439, "grad_norm": 0.5299738049507141, "learning_rate": 6.844126805369758e-05, "loss": 1.3926, "step": 941 }, { "epoch": 4.409596255119953, "grad_norm": 0.5203460454940796, "learning_rate": 6.836507988323784e-05, "loss": 1.3865, "step": 942 }, { "epoch": 4.414277355178467, "grad_norm": 0.5185226202011108, "learning_rate": 6.828884238991892e-05, "loss": 1.4046, "step": 943 }, { "epoch": 4.4189584552369805, "grad_norm": 0.5208016037940979, "learning_rate": 6.821255577849086e-05, "loss": 1.4046, "step": 944 }, { "epoch": 4.423639555295495, "grad_norm": 0.5160471200942993, "learning_rate": 6.813622025383565e-05, "loss": 1.4017, "step": 945 }, { "epoch": 4.428320655354009, "grad_norm": 0.5242071151733398, "learning_rate": 6.805983602096661e-05, "loss": 1.3965, "step": 946 }, { "epoch": 4.433001755412522, "grad_norm": 0.5100763440132141, "learning_rate": 6.798340328502792e-05, "loss": 1.4058, "step": 947 }, { "epoch": 4.437682855471036, "grad_norm": 0.5151635408401489, "learning_rate": 6.790692225129398e-05, "loss": 1.4159, "step": 948 }, { "epoch": 4.442363955529549, "grad_norm": 0.5144315958023071, "learning_rate": 6.783039312516889e-05, "loss": 1.3947, "step": 949 }, { "epoch": 4.447045055588063, "grad_norm": 0.5269882678985596, "learning_rate": 6.7753816112186e-05, "loss": 1.383, "step": 950 }, { "epoch": 4.451726155646577, "grad_norm": 0.5096060037612915, "learning_rate": 6.767719141800717e-05, "loss": 1.3778, "step": 951 }, { "epoch": 4.45640725570509, "grad_norm": 0.4936077892780304, "learning_rate": 6.760051924842239e-05, "loss": 1.3645, "step": 952 }, { "epoch": 4.4610883557636045, "grad_norm": 0.5201056003570557, "learning_rate": 6.752379980934909e-05, "loss": 1.3968, "step": 953 }, { "epoch": 4.465769455822119, "grad_norm": 0.516392707824707, "learning_rate": 6.744703330683169e-05, "loss": 1.3793, "step": 954 }, { "epoch": 4.470450555880632, "grad_norm": 0.505644679069519, "learning_rate": 6.7370219947041e-05, "loss": 1.3902, "step": 955 }, { "epoch": 4.475131655939146, "grad_norm": 0.4903636574745178, "learning_rate": 6.729335993627368e-05, "loss": 1.3741, "step": 956 }, { "epoch": 4.479812755997659, "grad_norm": 0.5225103497505188, "learning_rate": 6.721645348095167e-05, "loss": 1.3859, "step": 957 }, { "epoch": 4.484493856056173, "grad_norm": 0.5010210275650024, "learning_rate": 6.713950078762165e-05, "loss": 1.4044, "step": 958 }, { "epoch": 4.489174956114687, "grad_norm": 0.5460252165794373, "learning_rate": 6.706250206295449e-05, "loss": 1.3896, "step": 959 }, { "epoch": 4.4938560561732, "grad_norm": 0.5131853222846985, "learning_rate": 6.698545751374465e-05, "loss": 1.3853, "step": 960 }, { "epoch": 4.4985371562317145, "grad_norm": 0.5487529039382935, "learning_rate": 6.69083673469097e-05, "loss": 1.401, "step": 961 }, { "epoch": 4.5032182562902285, "grad_norm": 0.5241919159889221, "learning_rate": 6.683123176948973e-05, "loss": 1.3839, "step": 962 }, { "epoch": 4.507899356348742, "grad_norm": 0.5286917686462402, "learning_rate": 6.675405098864675e-05, "loss": 1.3946, "step": 963 }, { "epoch": 4.512580456407256, "grad_norm": 0.5300504565238953, "learning_rate": 6.667682521166419e-05, "loss": 1.4107, "step": 964 }, { "epoch": 4.517261556465769, "grad_norm": 0.5106096267700195, "learning_rate": 6.659955464594632e-05, "loss": 1.3983, "step": 965 }, { "epoch": 4.521942656524283, "grad_norm": 0.5417435169219971, "learning_rate": 6.652223949901774e-05, "loss": 1.4005, "step": 966 }, { "epoch": 4.526623756582797, "grad_norm": 0.5418683290481567, "learning_rate": 6.644487997852272e-05, "loss": 1.4127, "step": 967 }, { "epoch": 4.53130485664131, "grad_norm": 0.5197861790657043, "learning_rate": 6.636747629222475e-05, "loss": 1.391, "step": 968 }, { "epoch": 4.535985956699824, "grad_norm": 0.5314134359359741, "learning_rate": 6.629002864800589e-05, "loss": 1.3912, "step": 969 }, { "epoch": 4.5406670567583385, "grad_norm": 0.526654839515686, "learning_rate": 6.621253725386628e-05, "loss": 1.3949, "step": 970 }, { "epoch": 4.545348156816852, "grad_norm": 0.527079701423645, "learning_rate": 6.61350023179236e-05, "loss": 1.4042, "step": 971 }, { "epoch": 4.550029256875366, "grad_norm": 0.5492599606513977, "learning_rate": 6.605742404841241e-05, "loss": 1.4006, "step": 972 }, { "epoch": 4.55471035693388, "grad_norm": 0.522839367389679, "learning_rate": 6.597980265368367e-05, "loss": 1.3986, "step": 973 }, { "epoch": 4.559391456992393, "grad_norm": 0.5506901741027832, "learning_rate": 6.590213834220415e-05, "loss": 1.3812, "step": 974 }, { "epoch": 4.564072557050907, "grad_norm": 0.5584288835525513, "learning_rate": 6.582443132255592e-05, "loss": 1.4006, "step": 975 }, { "epoch": 4.56875365710942, "grad_norm": 0.5198907256126404, "learning_rate": 6.574668180343571e-05, "loss": 1.4016, "step": 976 }, { "epoch": 4.573434757167934, "grad_norm": 0.5312773585319519, "learning_rate": 6.566888999365441e-05, "loss": 1.3853, "step": 977 }, { "epoch": 4.578115857226448, "grad_norm": 0.5397941470146179, "learning_rate": 6.559105610213649e-05, "loss": 1.376, "step": 978 }, { "epoch": 4.582796957284962, "grad_norm": 0.515097439289093, "learning_rate": 6.551318033791942e-05, "loss": 1.4004, "step": 979 }, { "epoch": 4.587478057343476, "grad_norm": 0.554069459438324, "learning_rate": 6.543526291015315e-05, "loss": 1.3937, "step": 980 }, { "epoch": 4.59215915740199, "grad_norm": 0.5135395526885986, "learning_rate": 6.53573040280995e-05, "loss": 1.3848, "step": 981 }, { "epoch": 4.596840257460503, "grad_norm": 0.554758608341217, "learning_rate": 6.527930390113166e-05, "loss": 1.3784, "step": 982 }, { "epoch": 4.601521357519017, "grad_norm": 0.5125584006309509, "learning_rate": 6.520126273873356e-05, "loss": 1.3807, "step": 983 }, { "epoch": 4.606202457577531, "grad_norm": 0.5136418342590332, "learning_rate": 6.512318075049935e-05, "loss": 1.3824, "step": 984 }, { "epoch": 4.610883557636044, "grad_norm": 0.5604468584060669, "learning_rate": 6.50450581461328e-05, "loss": 1.3738, "step": 985 }, { "epoch": 4.615564657694558, "grad_norm": 0.5250749588012695, "learning_rate": 6.496689513544682e-05, "loss": 1.4121, "step": 986 }, { "epoch": 4.620245757753072, "grad_norm": 0.5173577666282654, "learning_rate": 6.488869192836278e-05, "loss": 1.3659, "step": 987 }, { "epoch": 4.624926857811586, "grad_norm": 0.5003069639205933, "learning_rate": 6.481044873491004e-05, "loss": 1.3967, "step": 988 }, { "epoch": 4.6296079578701, "grad_norm": 0.49318012595176697, "learning_rate": 6.473216576522535e-05, "loss": 1.3986, "step": 989 }, { "epoch": 4.634289057928613, "grad_norm": 0.5480040907859802, "learning_rate": 6.465384322955224e-05, "loss": 1.404, "step": 990 }, { "epoch": 4.638970157987127, "grad_norm": 0.49322742223739624, "learning_rate": 6.457548133824057e-05, "loss": 1.3786, "step": 991 }, { "epoch": 4.643651258045641, "grad_norm": 0.5618907809257507, "learning_rate": 6.449708030174587e-05, "loss": 1.3984, "step": 992 }, { "epoch": 4.648332358104154, "grad_norm": 0.5069143772125244, "learning_rate": 6.441864033062879e-05, "loss": 1.3675, "step": 993 }, { "epoch": 4.653013458162668, "grad_norm": 0.5381274223327637, "learning_rate": 6.434016163555452e-05, "loss": 1.3848, "step": 994 }, { "epoch": 4.657694558221182, "grad_norm": 0.5342992544174194, "learning_rate": 6.426164442729232e-05, "loss": 1.3804, "step": 995 }, { "epoch": 4.662375658279696, "grad_norm": 0.5064785480499268, "learning_rate": 6.418308891671484e-05, "loss": 1.3686, "step": 996 }, { "epoch": 4.66705675833821, "grad_norm": 0.5631519556045532, "learning_rate": 6.410449531479761e-05, "loss": 1.3819, "step": 997 }, { "epoch": 4.671737858396723, "grad_norm": 0.5275139808654785, "learning_rate": 6.402586383261844e-05, "loss": 1.3877, "step": 998 }, { "epoch": 4.676418958455237, "grad_norm": 0.5353507995605469, "learning_rate": 6.394719468135691e-05, "loss": 1.388, "step": 999 }, { "epoch": 4.681100058513751, "grad_norm": 0.543814480304718, "learning_rate": 6.386848807229373e-05, "loss": 1.4066, "step": 1000 }, { "epoch": 4.685781158572264, "grad_norm": 0.5282537937164307, "learning_rate": 6.378974421681027e-05, "loss": 1.3901, "step": 1001 }, { "epoch": 4.690462258630778, "grad_norm": 0.5356928110122681, "learning_rate": 6.371096332638784e-05, "loss": 1.3953, "step": 1002 }, { "epoch": 4.695143358689292, "grad_norm": 0.503050684928894, "learning_rate": 6.36321456126073e-05, "loss": 1.3777, "step": 1003 }, { "epoch": 4.6998244587478055, "grad_norm": 0.5379858016967773, "learning_rate": 6.355329128714834e-05, "loss": 1.3895, "step": 1004 }, { "epoch": 4.70450555880632, "grad_norm": 0.5181742310523987, "learning_rate": 6.347440056178904e-05, "loss": 1.3797, "step": 1005 }, { "epoch": 4.709186658864834, "grad_norm": 0.5005363821983337, "learning_rate": 6.339547364840522e-05, "loss": 1.3904, "step": 1006 }, { "epoch": 4.713867758923347, "grad_norm": 0.5222419500350952, "learning_rate": 6.331651075896983e-05, "loss": 1.3625, "step": 1007 }, { "epoch": 4.718548858981861, "grad_norm": 0.5192439556121826, "learning_rate": 6.323751210555252e-05, "loss": 1.3799, "step": 1008 }, { "epoch": 4.723229959040374, "grad_norm": 0.5143327116966248, "learning_rate": 6.315847790031895e-05, "loss": 1.3915, "step": 1009 }, { "epoch": 4.727911059098888, "grad_norm": 0.5154076814651489, "learning_rate": 6.307940835553027e-05, "loss": 1.3921, "step": 1010 }, { "epoch": 4.732592159157402, "grad_norm": 0.5128686428070068, "learning_rate": 6.300030368354255e-05, "loss": 1.3899, "step": 1011 }, { "epoch": 4.7372732592159155, "grad_norm": 0.5078786015510559, "learning_rate": 6.292116409680617e-05, "loss": 1.3807, "step": 1012 }, { "epoch": 4.7419543592744295, "grad_norm": 0.5336173176765442, "learning_rate": 6.284198980786533e-05, "loss": 1.4099, "step": 1013 }, { "epoch": 4.746635459332944, "grad_norm": 0.5144036412239075, "learning_rate": 6.276278102935739e-05, "loss": 1.3845, "step": 1014 }, { "epoch": 4.751316559391457, "grad_norm": 0.5357654094696045, "learning_rate": 6.268353797401234e-05, "loss": 1.373, "step": 1015 }, { "epoch": 4.755997659449971, "grad_norm": 0.5412711501121521, "learning_rate": 6.260426085465225e-05, "loss": 1.3678, "step": 1016 }, { "epoch": 4.760678759508485, "grad_norm": 0.5578483939170837, "learning_rate": 6.252494988419066e-05, "loss": 1.3851, "step": 1017 }, { "epoch": 4.765359859566998, "grad_norm": 0.5388118028640747, "learning_rate": 6.244560527563199e-05, "loss": 1.3954, "step": 1018 }, { "epoch": 4.770040959625512, "grad_norm": 0.5390235781669617, "learning_rate": 6.23662272420711e-05, "loss": 1.3814, "step": 1019 }, { "epoch": 4.774722059684025, "grad_norm": 0.5449773073196411, "learning_rate": 6.228681599669248e-05, "loss": 1.3758, "step": 1020 }, { "epoch": 4.7794031597425395, "grad_norm": 0.515874445438385, "learning_rate": 6.220737175276996e-05, "loss": 1.362, "step": 1021 }, { "epoch": 4.784084259801054, "grad_norm": 0.5363708138465881, "learning_rate": 6.212789472366591e-05, "loss": 1.4039, "step": 1022 }, { "epoch": 4.788765359859567, "grad_norm": 0.5405462980270386, "learning_rate": 6.204838512283072e-05, "loss": 1.3834, "step": 1023 }, { "epoch": 4.793446459918081, "grad_norm": 0.5273584723472595, "learning_rate": 6.196884316380237e-05, "loss": 1.369, "step": 1024 }, { "epoch": 4.798127559976594, "grad_norm": 0.5312931537628174, "learning_rate": 6.188926906020562e-05, "loss": 1.3913, "step": 1025 }, { "epoch": 4.802808660035108, "grad_norm": 0.5499268174171448, "learning_rate": 6.180966302575166e-05, "loss": 1.4118, "step": 1026 }, { "epoch": 4.807489760093622, "grad_norm": 0.5172853469848633, "learning_rate": 6.173002527423737e-05, "loss": 1.3869, "step": 1027 }, { "epoch": 4.812170860152136, "grad_norm": 0.5686773657798767, "learning_rate": 6.165035601954485e-05, "loss": 1.3715, "step": 1028 }, { "epoch": 4.816851960210649, "grad_norm": 0.5325746536254883, "learning_rate": 6.157065547564079e-05, "loss": 1.3785, "step": 1029 }, { "epoch": 4.8215330602691635, "grad_norm": 0.5127576589584351, "learning_rate": 6.149092385657591e-05, "loss": 1.3906, "step": 1030 }, { "epoch": 4.826214160327677, "grad_norm": 0.5501847267150879, "learning_rate": 6.14111613764844e-05, "loss": 1.4056, "step": 1031 }, { "epoch": 4.830895260386191, "grad_norm": 0.5234781503677368, "learning_rate": 6.133136824958335e-05, "loss": 1.4034, "step": 1032 }, { "epoch": 4.835576360444705, "grad_norm": 0.5253031849861145, "learning_rate": 6.125154469017209e-05, "loss": 1.3904, "step": 1033 }, { "epoch": 4.840257460503218, "grad_norm": 0.5461185574531555, "learning_rate": 6.117169091263177e-05, "loss": 1.3921, "step": 1034 }, { "epoch": 4.844938560561732, "grad_norm": 0.5113133788108826, "learning_rate": 6.109180713142465e-05, "loss": 1.3875, "step": 1035 }, { "epoch": 4.849619660620245, "grad_norm": 0.5434524416923523, "learning_rate": 6.1011893561093535e-05, "loss": 1.412, "step": 1036 }, { "epoch": 4.854300760678759, "grad_norm": 0.5480051636695862, "learning_rate": 6.093195041626132e-05, "loss": 1.4039, "step": 1037 }, { "epoch": 4.8589818607372735, "grad_norm": 0.520328938961029, "learning_rate": 6.085197791163028e-05, "loss": 1.3992, "step": 1038 }, { "epoch": 4.863662960795787, "grad_norm": 0.5311024188995361, "learning_rate": 6.077197626198151e-05, "loss": 1.3981, "step": 1039 }, { "epoch": 4.868344060854301, "grad_norm": 0.5565276741981506, "learning_rate": 6.0691945682174434e-05, "loss": 1.3742, "step": 1040 }, { "epoch": 4.873025160912815, "grad_norm": 0.518947958946228, "learning_rate": 6.0611886387146156e-05, "loss": 1.3944, "step": 1041 }, { "epoch": 4.877706260971328, "grad_norm": 0.5651046633720398, "learning_rate": 6.0531798591910896e-05, "loss": 1.4093, "step": 1042 }, { "epoch": 4.882387361029842, "grad_norm": 0.5247631072998047, "learning_rate": 6.0451682511559416e-05, "loss": 1.3807, "step": 1043 }, { "epoch": 4.887068461088356, "grad_norm": 0.5134245753288269, "learning_rate": 6.037153836125844e-05, "loss": 1.383, "step": 1044 }, { "epoch": 4.891749561146869, "grad_norm": 0.5301898717880249, "learning_rate": 6.029136635625008e-05, "loss": 1.3791, "step": 1045 }, { "epoch": 4.896430661205383, "grad_norm": 0.5449047684669495, "learning_rate": 6.0211166711851264e-05, "loss": 1.3943, "step": 1046 }, { "epoch": 4.901111761263897, "grad_norm": 0.5147010684013367, "learning_rate": 6.013093964345314e-05, "loss": 1.3803, "step": 1047 }, { "epoch": 4.905792861322411, "grad_norm": 0.5083584785461426, "learning_rate": 6.0050685366520545e-05, "loss": 1.3966, "step": 1048 }, { "epoch": 4.910473961380925, "grad_norm": 0.5392999053001404, "learning_rate": 5.997040409659132e-05, "loss": 1.3822, "step": 1049 }, { "epoch": 4.915155061439438, "grad_norm": 0.5468650460243225, "learning_rate": 5.989009604927587e-05, "loss": 1.3845, "step": 1050 }, { "epoch": 4.919836161497952, "grad_norm": 0.5140033960342407, "learning_rate": 5.980976144025647e-05, "loss": 1.3736, "step": 1051 }, { "epoch": 4.924517261556466, "grad_norm": 0.5236037969589233, "learning_rate": 5.972940048528675e-05, "loss": 1.3911, "step": 1052 }, { "epoch": 4.929198361614979, "grad_norm": 0.5266885161399841, "learning_rate": 5.9649013400191104e-05, "loss": 1.3906, "step": 1053 }, { "epoch": 4.933879461673493, "grad_norm": 0.5133062601089478, "learning_rate": 5.956860040086407e-05, "loss": 1.3672, "step": 1054 }, { "epoch": 4.938560561732007, "grad_norm": 0.5198325514793396, "learning_rate": 5.948816170326985e-05, "loss": 1.3813, "step": 1055 }, { "epoch": 4.943241661790521, "grad_norm": 0.530920684337616, "learning_rate": 5.9407697523441576e-05, "loss": 1.3997, "step": 1056 }, { "epoch": 4.947922761849035, "grad_norm": 0.5203038454055786, "learning_rate": 5.932720807748089e-05, "loss": 1.3741, "step": 1057 }, { "epoch": 4.952603861907548, "grad_norm": 0.5104234218597412, "learning_rate": 5.924669358155727e-05, "loss": 1.3938, "step": 1058 }, { "epoch": 4.957284961966062, "grad_norm": 0.5079901814460754, "learning_rate": 5.916615425190743e-05, "loss": 1.371, "step": 1059 }, { "epoch": 4.961966062024576, "grad_norm": 0.5349999070167542, "learning_rate": 5.908559030483484e-05, "loss": 1.3715, "step": 1060 }, { "epoch": 4.966647162083089, "grad_norm": 0.5009754300117493, "learning_rate": 5.900500195670905e-05, "loss": 1.3818, "step": 1061 }, { "epoch": 4.971328262141603, "grad_norm": 0.4894314706325531, "learning_rate": 5.8924389423965155e-05, "loss": 1.379, "step": 1062 }, { "epoch": 4.976009362200117, "grad_norm": 0.5114794969558716, "learning_rate": 5.884375292310319e-05, "loss": 1.37, "step": 1063 }, { "epoch": 4.9806904622586305, "grad_norm": 0.5092068314552307, "learning_rate": 5.8763092670687584e-05, "loss": 1.4138, "step": 1064 }, { "epoch": 4.985371562317145, "grad_norm": 0.511785089969635, "learning_rate": 5.868240888334653e-05, "loss": 1.3846, "step": 1065 }, { "epoch": 4.990052662375659, "grad_norm": 0.508854329586029, "learning_rate": 5.860170177777144e-05, "loss": 1.3671, "step": 1066 }, { "epoch": 4.994733762434172, "grad_norm": 0.5381656885147095, "learning_rate": 5.852097157071635e-05, "loss": 1.3861, "step": 1067 }, { "epoch": 4.999414862492686, "grad_norm": 0.5062696933746338, "learning_rate": 5.844021847899734e-05, "loss": 1.3698, "step": 1068 }, { "epoch": 5.004095962551199, "grad_norm": 1.824023723602295, "learning_rate": 5.8359442719491975e-05, "loss": 1.3047, "step": 1069 }, { "epoch": 5.008777062609713, "grad_norm": 0.5977228879928589, "learning_rate": 5.8278644509138645e-05, "loss": 1.3578, "step": 1070 }, { "epoch": 5.013458162668227, "grad_norm": 0.5300642848014832, "learning_rate": 5.8197824064936066e-05, "loss": 1.3888, "step": 1071 }, { "epoch": 5.0181392627267405, "grad_norm": 0.5731930136680603, "learning_rate": 5.81169816039427e-05, "loss": 1.3759, "step": 1072 }, { "epoch": 5.022820362785255, "grad_norm": 0.5253560543060303, "learning_rate": 5.8036117343276065e-05, "loss": 1.361, "step": 1073 }, { "epoch": 5.027501462843769, "grad_norm": 0.5331753492355347, "learning_rate": 5.7955231500112296e-05, "loss": 1.4112, "step": 1074 }, { "epoch": 5.032182562902282, "grad_norm": 0.536668062210083, "learning_rate": 5.7874324291685446e-05, "loss": 1.4051, "step": 1075 }, { "epoch": 5.036863662960796, "grad_norm": 0.5135036110877991, "learning_rate": 5.7793395935286964e-05, "loss": 1.3882, "step": 1076 }, { "epoch": 5.04154476301931, "grad_norm": 0.5347151756286621, "learning_rate": 5.771244664826512e-05, "loss": 1.3693, "step": 1077 }, { "epoch": 5.046225863077823, "grad_norm": 0.5390860438346863, "learning_rate": 5.763147664802435e-05, "loss": 1.3811, "step": 1078 }, { "epoch": 5.050906963136337, "grad_norm": 0.5213562250137329, "learning_rate": 5.7550486152024774e-05, "loss": 1.3884, "step": 1079 }, { "epoch": 5.05558806319485, "grad_norm": 0.5514521598815918, "learning_rate": 5.74694753777815e-05, "loss": 1.3939, "step": 1080 }, { "epoch": 5.0602691632533645, "grad_norm": 0.5222517251968384, "learning_rate": 5.738844454286411e-05, "loss": 1.3776, "step": 1081 }, { "epoch": 5.064950263311879, "grad_norm": 0.5084861516952515, "learning_rate": 5.730739386489612e-05, "loss": 1.3786, "step": 1082 }, { "epoch": 5.069631363370392, "grad_norm": 0.5144921541213989, "learning_rate": 5.7226323561554276e-05, "loss": 1.3698, "step": 1083 }, { "epoch": 5.074312463428906, "grad_norm": 0.5156840085983276, "learning_rate": 5.714523385056802e-05, "loss": 1.3798, "step": 1084 }, { "epoch": 5.07899356348742, "grad_norm": 0.5120629668235779, "learning_rate": 5.706412494971901e-05, "loss": 1.3657, "step": 1085 }, { "epoch": 5.083674663545933, "grad_norm": 0.5259561538696289, "learning_rate": 5.6982997076840306e-05, "loss": 1.3836, "step": 1086 }, { "epoch": 5.088355763604447, "grad_norm": 0.5179267525672913, "learning_rate": 5.690185044981606e-05, "loss": 1.4128, "step": 1087 }, { "epoch": 5.09303686366296, "grad_norm": 0.4868166148662567, "learning_rate": 5.68206852865807e-05, "loss": 1.3634, "step": 1088 }, { "epoch": 5.0977179637214745, "grad_norm": 0.5132505297660828, "learning_rate": 5.673950180511845e-05, "loss": 1.3701, "step": 1089 }, { "epoch": 5.1023990637799885, "grad_norm": 0.5255224108695984, "learning_rate": 5.665830022346277e-05, "loss": 1.3893, "step": 1090 }, { "epoch": 5.107080163838502, "grad_norm": 0.5020806789398193, "learning_rate": 5.657708075969567e-05, "loss": 1.3687, "step": 1091 }, { "epoch": 5.111761263897016, "grad_norm": 0.4931749701499939, "learning_rate": 5.649584363194724e-05, "loss": 1.3766, "step": 1092 }, { "epoch": 5.11644236395553, "grad_norm": 0.49240025877952576, "learning_rate": 5.6414589058395004e-05, "loss": 1.3917, "step": 1093 }, { "epoch": 5.121123464014043, "grad_norm": 0.5174960494041443, "learning_rate": 5.6333317257263305e-05, "loss": 1.3842, "step": 1094 }, { "epoch": 5.125804564072557, "grad_norm": 0.5175259709358215, "learning_rate": 5.62520284468228e-05, "loss": 1.3828, "step": 1095 }, { "epoch": 5.130485664131071, "grad_norm": 0.5083780884742737, "learning_rate": 5.6170722845389765e-05, "loss": 1.3807, "step": 1096 }, { "epoch": 5.135166764189584, "grad_norm": 0.4956238865852356, "learning_rate": 5.6089400671325644e-05, "loss": 1.3676, "step": 1097 }, { "epoch": 5.1398478642480985, "grad_norm": 0.5338079929351807, "learning_rate": 5.6008062143036346e-05, "loss": 1.4095, "step": 1098 }, { "epoch": 5.144528964306612, "grad_norm": 0.5186750888824463, "learning_rate": 5.592670747897171e-05, "loss": 1.3557, "step": 1099 }, { "epoch": 5.149210064365126, "grad_norm": 0.505437970161438, "learning_rate": 5.5845336897624915e-05, "loss": 1.3948, "step": 1100 }, { "epoch": 5.15389116442364, "grad_norm": 0.4789367914199829, "learning_rate": 5.576395061753187e-05, "loss": 1.3748, "step": 1101 }, { "epoch": 5.158572264482153, "grad_norm": 0.5059368014335632, "learning_rate": 5.568254885727068e-05, "loss": 1.3792, "step": 1102 }, { "epoch": 5.163253364540667, "grad_norm": 0.5207875370979309, "learning_rate": 5.5601131835461003e-05, "loss": 1.3598, "step": 1103 }, { "epoch": 5.167934464599181, "grad_norm": 0.5086791515350342, "learning_rate": 5.55196997707635e-05, "loss": 1.3763, "step": 1104 }, { "epoch": 5.172615564657694, "grad_norm": 0.5212629437446594, "learning_rate": 5.5438252881879194e-05, "loss": 1.3568, "step": 1105 }, { "epoch": 5.177296664716208, "grad_norm": 0.5509768724441528, "learning_rate": 5.535679138754897e-05, "loss": 1.3703, "step": 1106 }, { "epoch": 5.181977764774722, "grad_norm": 0.5098535418510437, "learning_rate": 5.5275315506552906e-05, "loss": 1.3798, "step": 1107 }, { "epoch": 5.186658864833236, "grad_norm": 0.5017245411872864, "learning_rate": 5.5193825457709745e-05, "loss": 1.3775, "step": 1108 }, { "epoch": 5.19133996489175, "grad_norm": 0.523624062538147, "learning_rate": 5.511232145987626e-05, "loss": 1.3714, "step": 1109 }, { "epoch": 5.196021064950263, "grad_norm": 0.5008803009986877, "learning_rate": 5.5030803731946665e-05, "loss": 1.3593, "step": 1110 }, { "epoch": 5.200702165008777, "grad_norm": 0.5039325952529907, "learning_rate": 5.494927249285211e-05, "loss": 1.3629, "step": 1111 }, { "epoch": 5.205383265067291, "grad_norm": 0.4887968897819519, "learning_rate": 5.486772796155999e-05, "loss": 1.3852, "step": 1112 }, { "epoch": 5.210064365125804, "grad_norm": 0.5224624276161194, "learning_rate": 5.478617035707337e-05, "loss": 1.3957, "step": 1113 }, { "epoch": 5.214745465184318, "grad_norm": 0.5084558129310608, "learning_rate": 5.470459989843052e-05, "loss": 1.3731, "step": 1114 }, { "epoch": 5.219426565242832, "grad_norm": 0.48302987217903137, "learning_rate": 5.462301680470412e-05, "loss": 1.3728, "step": 1115 }, { "epoch": 5.224107665301346, "grad_norm": 0.5186018943786621, "learning_rate": 5.4541421295000864e-05, "loss": 1.3822, "step": 1116 }, { "epoch": 5.22878876535986, "grad_norm": 0.5260018110275269, "learning_rate": 5.445981358846077e-05, "loss": 1.3599, "step": 1117 }, { "epoch": 5.233469865418373, "grad_norm": 0.5030722618103027, "learning_rate": 5.437819390425659e-05, "loss": 1.3905, "step": 1118 }, { "epoch": 5.238150965476887, "grad_norm": 0.5315456986427307, "learning_rate": 5.429656246159327e-05, "loss": 1.3853, "step": 1119 }, { "epoch": 5.242832065535401, "grad_norm": 0.49964120984077454, "learning_rate": 5.4214919479707325e-05, "loss": 1.3709, "step": 1120 }, { "epoch": 5.247513165593914, "grad_norm": 0.5210468769073486, "learning_rate": 5.4133265177866256e-05, "loss": 1.3792, "step": 1121 }, { "epoch": 5.252194265652428, "grad_norm": 0.5345668792724609, "learning_rate": 5.4051599775368004e-05, "loss": 1.3763, "step": 1122 }, { "epoch": 5.256875365710942, "grad_norm": 0.5057262778282166, "learning_rate": 5.396992349154024e-05, "loss": 1.355, "step": 1123 }, { "epoch": 5.261556465769456, "grad_norm": 0.512408971786499, "learning_rate": 5.3888236545739955e-05, "loss": 1.3738, "step": 1124 }, { "epoch": 5.26623756582797, "grad_norm": 0.5097825527191162, "learning_rate": 5.380653915735272e-05, "loss": 1.388, "step": 1125 }, { "epoch": 5.270918665886484, "grad_norm": 0.5049353241920471, "learning_rate": 5.372483154579213e-05, "loss": 1.361, "step": 1126 }, { "epoch": 5.275599765944997, "grad_norm": 0.5072355270385742, "learning_rate": 5.364311393049931e-05, "loss": 1.3837, "step": 1127 }, { "epoch": 5.280280866003511, "grad_norm": 0.5134193897247314, "learning_rate": 5.356138653094219e-05, "loss": 1.3833, "step": 1128 }, { "epoch": 5.284961966062024, "grad_norm": 0.48993170261383057, "learning_rate": 5.347964956661502e-05, "loss": 1.399, "step": 1129 }, { "epoch": 5.289643066120538, "grad_norm": 0.5153975486755371, "learning_rate": 5.3397903257037685e-05, "loss": 1.3627, "step": 1130 }, { "epoch": 5.294324166179052, "grad_norm": 0.504741370677948, "learning_rate": 5.3316147821755205e-05, "loss": 1.3756, "step": 1131 }, { "epoch": 5.2990052662375655, "grad_norm": 0.502116322517395, "learning_rate": 5.3234383480337104e-05, "loss": 1.3584, "step": 1132 }, { "epoch": 5.30368636629608, "grad_norm": 0.5001406669616699, "learning_rate": 5.315261045237684e-05, "loss": 1.3828, "step": 1133 }, { "epoch": 5.308367466354594, "grad_norm": 0.49901527166366577, "learning_rate": 5.3070828957491156e-05, "loss": 1.3853, "step": 1134 }, { "epoch": 5.313048566413107, "grad_norm": 0.5057284832000732, "learning_rate": 5.29890392153196e-05, "loss": 1.3496, "step": 1135 }, { "epoch": 5.317729666471621, "grad_norm": 0.5117406845092773, "learning_rate": 5.290724144552379e-05, "loss": 1.3709, "step": 1136 }, { "epoch": 5.322410766530135, "grad_norm": 0.5053237080574036, "learning_rate": 5.2825435867786975e-05, "loss": 1.366, "step": 1137 }, { "epoch": 5.327091866588648, "grad_norm": 0.5771435499191284, "learning_rate": 5.2743622701813344e-05, "loss": 1.3787, "step": 1138 }, { "epoch": 5.331772966647162, "grad_norm": 0.5015900135040283, "learning_rate": 5.2661802167327445e-05, "loss": 1.3677, "step": 1139 }, { "epoch": 5.3364540667056755, "grad_norm": 0.4910103380680084, "learning_rate": 5.2579974484073655e-05, "loss": 1.357, "step": 1140 }, { "epoch": 5.3411351667641895, "grad_norm": 0.5642368793487549, "learning_rate": 5.249813987181553e-05, "loss": 1.3786, "step": 1141 }, { "epoch": 5.345816266822704, "grad_norm": 0.519067645072937, "learning_rate": 5.241629855033523e-05, "loss": 1.3667, "step": 1142 }, { "epoch": 5.350497366881217, "grad_norm": 0.483230859041214, "learning_rate": 5.233445073943295e-05, "loss": 1.366, "step": 1143 }, { "epoch": 5.355178466939731, "grad_norm": 0.494340717792511, "learning_rate": 5.225259665892629e-05, "loss": 1.3661, "step": 1144 }, { "epoch": 5.359859566998245, "grad_norm": 0.5203878879547119, "learning_rate": 5.217073652864972e-05, "loss": 1.3753, "step": 1145 }, { "epoch": 5.364540667056758, "grad_norm": 0.4912766218185425, "learning_rate": 5.208887056845394e-05, "loss": 1.3673, "step": 1146 }, { "epoch": 5.369221767115272, "grad_norm": 0.5085307359695435, "learning_rate": 5.200699899820527e-05, "loss": 1.3987, "step": 1147 }, { "epoch": 5.373902867173786, "grad_norm": 0.5408493876457214, "learning_rate": 5.192512203778519e-05, "loss": 1.362, "step": 1148 }, { "epoch": 5.3785839672322995, "grad_norm": 0.5014744997024536, "learning_rate": 5.184323990708958e-05, "loss": 1.3545, "step": 1149 }, { "epoch": 5.3832650672908136, "grad_norm": 0.5128848552703857, "learning_rate": 5.176135282602821e-05, "loss": 1.3659, "step": 1150 }, { "epoch": 5.387946167349327, "grad_norm": 0.49814894795417786, "learning_rate": 5.167946101452419e-05, "loss": 1.3796, "step": 1151 }, { "epoch": 5.392627267407841, "grad_norm": 0.4915131628513336, "learning_rate": 5.159756469251327e-05, "loss": 1.3845, "step": 1152 }, { "epoch": 5.397308367466355, "grad_norm": 0.48490697145462036, "learning_rate": 5.151566407994339e-05, "loss": 1.3686, "step": 1153 }, { "epoch": 5.401989467524868, "grad_norm": 0.5091562867164612, "learning_rate": 5.143375939677396e-05, "loss": 1.365, "step": 1154 }, { "epoch": 5.406670567583382, "grad_norm": 0.5359136462211609, "learning_rate": 5.1351850862975315e-05, "loss": 1.363, "step": 1155 }, { "epoch": 5.411351667641896, "grad_norm": 0.4946351945400238, "learning_rate": 5.126993869852819e-05, "loss": 1.3613, "step": 1156 }, { "epoch": 5.416032767700409, "grad_norm": 0.5049864649772644, "learning_rate": 5.118802312342299e-05, "loss": 1.3847, "step": 1157 }, { "epoch": 5.4207138677589235, "grad_norm": 0.506483256816864, "learning_rate": 5.110610435765934e-05, "loss": 1.3732, "step": 1158 }, { "epoch": 5.425394967817437, "grad_norm": 0.5119485855102539, "learning_rate": 5.1024182621245455e-05, "loss": 1.3457, "step": 1159 }, { "epoch": 5.430076067875951, "grad_norm": 0.507080614566803, "learning_rate": 5.094225813419743e-05, "loss": 1.3595, "step": 1160 }, { "epoch": 5.434757167934465, "grad_norm": 0.49319717288017273, "learning_rate": 5.086033111653884e-05, "loss": 1.3653, "step": 1161 }, { "epoch": 5.439438267992978, "grad_norm": 0.5073138475418091, "learning_rate": 5.0778401788300026e-05, "loss": 1.3739, "step": 1162 }, { "epoch": 5.444119368051492, "grad_norm": 0.4962432086467743, "learning_rate": 5.069647036951751e-05, "loss": 1.3528, "step": 1163 }, { "epoch": 5.448800468110006, "grad_norm": 0.48607367277145386, "learning_rate": 5.0614537080233484e-05, "loss": 1.346, "step": 1164 }, { "epoch": 5.453481568168519, "grad_norm": 0.5067170858383179, "learning_rate": 5.05326021404951e-05, "loss": 1.3729, "step": 1165 }, { "epoch": 5.458162668227033, "grad_norm": 0.5128630995750427, "learning_rate": 5.045066577035401e-05, "loss": 1.3561, "step": 1166 }, { "epoch": 5.4628437682855475, "grad_norm": 0.508293867111206, "learning_rate": 5.036872818986562e-05, "loss": 1.3688, "step": 1167 }, { "epoch": 5.467524868344061, "grad_norm": 0.5079975128173828, "learning_rate": 5.028678961908868e-05, "loss": 1.3581, "step": 1168 }, { "epoch": 5.472205968402575, "grad_norm": 0.517815113067627, "learning_rate": 5.020485027808455e-05, "loss": 1.37, "step": 1169 }, { "epoch": 5.476887068461088, "grad_norm": 0.5143315196037292, "learning_rate": 5.0122910386916656e-05, "loss": 1.3897, "step": 1170 }, { "epoch": 5.481568168519602, "grad_norm": 0.5109765529632568, "learning_rate": 5.0040970165649906e-05, "loss": 1.3633, "step": 1171 }, { "epoch": 5.486249268578116, "grad_norm": 0.5113378167152405, "learning_rate": 4.995902983435012e-05, "loss": 1.3613, "step": 1172 }, { "epoch": 5.490930368636629, "grad_norm": 0.4970690608024597, "learning_rate": 4.9877089613083356e-05, "loss": 1.3629, "step": 1173 }, { "epoch": 5.495611468695143, "grad_norm": 0.5230845212936401, "learning_rate": 4.9795149721915476e-05, "loss": 1.383, "step": 1174 }, { "epoch": 5.5002925687536575, "grad_norm": 0.5186367034912109, "learning_rate": 4.971321038091133e-05, "loss": 1.3685, "step": 1175 }, { "epoch": 5.504973668812171, "grad_norm": 0.5045866966247559, "learning_rate": 4.963127181013438e-05, "loss": 1.3856, "step": 1176 }, { "epoch": 5.509654768870685, "grad_norm": 0.49818751215934753, "learning_rate": 4.954933422964602e-05, "loss": 1.3566, "step": 1177 }, { "epoch": 5.514335868929198, "grad_norm": 0.5398290753364563, "learning_rate": 4.94673978595049e-05, "loss": 1.376, "step": 1178 }, { "epoch": 5.519016968987712, "grad_norm": 0.5045533180236816, "learning_rate": 4.938546291976654e-05, "loss": 1.3568, "step": 1179 }, { "epoch": 5.523698069046226, "grad_norm": 0.5707494020462036, "learning_rate": 4.9303529630482495e-05, "loss": 1.3664, "step": 1180 }, { "epoch": 5.528379169104739, "grad_norm": 0.505876362323761, "learning_rate": 4.922159821169999e-05, "loss": 1.3668, "step": 1181 }, { "epoch": 5.533060269163253, "grad_norm": 0.5047862529754639, "learning_rate": 4.913966888346118e-05, "loss": 1.3598, "step": 1182 }, { "epoch": 5.537741369221767, "grad_norm": 0.4833430349826813, "learning_rate": 4.905774186580259e-05, "loss": 1.3625, "step": 1183 }, { "epoch": 5.542422469280281, "grad_norm": 0.5005712509155273, "learning_rate": 4.8975817378754576e-05, "loss": 1.3525, "step": 1184 }, { "epoch": 5.547103569338795, "grad_norm": 0.4941693842411041, "learning_rate": 4.889389564234066e-05, "loss": 1.3613, "step": 1185 }, { "epoch": 5.551784669397309, "grad_norm": 0.5212202072143555, "learning_rate": 4.881197687657702e-05, "loss": 1.38, "step": 1186 }, { "epoch": 5.556465769455822, "grad_norm": 0.4857902526855469, "learning_rate": 4.8730061301471836e-05, "loss": 1.378, "step": 1187 }, { "epoch": 5.561146869514336, "grad_norm": 0.4982302486896515, "learning_rate": 4.864814913702469e-05, "loss": 1.3625, "step": 1188 }, { "epoch": 5.565827969572849, "grad_norm": 0.5094555020332336, "learning_rate": 4.856624060322606e-05, "loss": 1.3751, "step": 1189 }, { "epoch": 5.570509069631363, "grad_norm": 0.4969262480735779, "learning_rate": 4.8484335920056615e-05, "loss": 1.346, "step": 1190 }, { "epoch": 5.575190169689877, "grad_norm": 0.5096610188484192, "learning_rate": 4.8402435307486734e-05, "loss": 1.3752, "step": 1191 }, { "epoch": 5.5798712697483905, "grad_norm": 0.5154902338981628, "learning_rate": 4.832053898547583e-05, "loss": 1.3772, "step": 1192 }, { "epoch": 5.584552369806905, "grad_norm": 0.5006143450737, "learning_rate": 4.82386471739718e-05, "loss": 1.3897, "step": 1193 }, { "epoch": 5.589233469865419, "grad_norm": 0.5159085392951965, "learning_rate": 4.8156760092910444e-05, "loss": 1.387, "step": 1194 }, { "epoch": 5.593914569923932, "grad_norm": 0.4952409565448761, "learning_rate": 4.807487796221482e-05, "loss": 1.3462, "step": 1195 }, { "epoch": 5.598595669982446, "grad_norm": 0.49595585465431213, "learning_rate": 4.799300100179473e-05, "loss": 1.3811, "step": 1196 }, { "epoch": 5.60327677004096, "grad_norm": 0.49851763248443604, "learning_rate": 4.7911129431546084e-05, "loss": 1.3673, "step": 1197 }, { "epoch": 5.607957870099473, "grad_norm": 0.5071251392364502, "learning_rate": 4.782926347135029e-05, "loss": 1.3756, "step": 1198 }, { "epoch": 5.612638970157987, "grad_norm": 0.49200981855392456, "learning_rate": 4.7747403341073725e-05, "loss": 1.3631, "step": 1199 }, { "epoch": 5.6173200702165005, "grad_norm": 0.47780096530914307, "learning_rate": 4.766554926056707e-05, "loss": 1.3533, "step": 1200 }, { "epoch": 5.622001170275015, "grad_norm": 0.5060575008392334, "learning_rate": 4.758370144966477e-05, "loss": 1.4004, "step": 1201 }, { "epoch": 5.626682270333529, "grad_norm": 0.5059685707092285, "learning_rate": 4.750186012818448e-05, "loss": 1.3682, "step": 1202 }, { "epoch": 5.631363370392042, "grad_norm": 0.4853475093841553, "learning_rate": 4.742002551592635e-05, "loss": 1.3636, "step": 1203 }, { "epoch": 5.636044470450556, "grad_norm": 0.5116673707962036, "learning_rate": 4.7338197832672566e-05, "loss": 1.3723, "step": 1204 }, { "epoch": 5.64072557050907, "grad_norm": 0.5065430998802185, "learning_rate": 4.725637729818667e-05, "loss": 1.3578, "step": 1205 }, { "epoch": 5.645406670567583, "grad_norm": 0.4856509864330292, "learning_rate": 4.717456413221302e-05, "loss": 1.3414, "step": 1206 }, { "epoch": 5.650087770626097, "grad_norm": 0.49742671847343445, "learning_rate": 4.709275855447621e-05, "loss": 1.3629, "step": 1207 }, { "epoch": 5.654768870684611, "grad_norm": 0.5032638311386108, "learning_rate": 4.701096078468041e-05, "loss": 1.354, "step": 1208 }, { "epoch": 5.6594499707431245, "grad_norm": 0.490828275680542, "learning_rate": 4.6929171042508855e-05, "loss": 1.3569, "step": 1209 }, { "epoch": 5.664131070801639, "grad_norm": 0.5146189332008362, "learning_rate": 4.684738954762316e-05, "loss": 1.3846, "step": 1210 }, { "epoch": 5.668812170860152, "grad_norm": 0.4796738028526306, "learning_rate": 4.6765616519662894e-05, "loss": 1.3576, "step": 1211 }, { "epoch": 5.673493270918666, "grad_norm": 0.5143107771873474, "learning_rate": 4.668385217824482e-05, "loss": 1.365, "step": 1212 }, { "epoch": 5.67817437097718, "grad_norm": 0.5042698979377747, "learning_rate": 4.660209674296233e-05, "loss": 1.3561, "step": 1213 }, { "epoch": 5.682855471035693, "grad_norm": 0.4811539351940155, "learning_rate": 4.652035043338501e-05, "loss": 1.3464, "step": 1214 }, { "epoch": 5.687536571094207, "grad_norm": 0.5001482963562012, "learning_rate": 4.643861346905781e-05, "loss": 1.3742, "step": 1215 }, { "epoch": 5.692217671152721, "grad_norm": 0.5193770527839661, "learning_rate": 4.6356886069500685e-05, "loss": 1.3502, "step": 1216 }, { "epoch": 5.6968987712112344, "grad_norm": 0.5073791742324829, "learning_rate": 4.6275168454207885e-05, "loss": 1.3572, "step": 1217 }, { "epoch": 5.7015798712697485, "grad_norm": 0.4926114082336426, "learning_rate": 4.619346084264729e-05, "loss": 1.3439, "step": 1218 }, { "epoch": 5.706260971328263, "grad_norm": 0.5074840784072876, "learning_rate": 4.611176345426006e-05, "loss": 1.386, "step": 1219 }, { "epoch": 5.710942071386776, "grad_norm": 0.5240839719772339, "learning_rate": 4.6030076508459776e-05, "loss": 1.3681, "step": 1220 }, { "epoch": 5.71562317144529, "grad_norm": 0.46889132261276245, "learning_rate": 4.5948400224632e-05, "loss": 1.3777, "step": 1221 }, { "epoch": 5.720304271503803, "grad_norm": 0.5044253468513489, "learning_rate": 4.5866734822133755e-05, "loss": 1.3609, "step": 1222 }, { "epoch": 5.724985371562317, "grad_norm": 0.5392898321151733, "learning_rate": 4.5785080520292687e-05, "loss": 1.3505, "step": 1223 }, { "epoch": 5.729666471620831, "grad_norm": 0.4860037863254547, "learning_rate": 4.570343753840675e-05, "loss": 1.3765, "step": 1224 }, { "epoch": 5.734347571679344, "grad_norm": 0.49485546350479126, "learning_rate": 4.562180609574343e-05, "loss": 1.378, "step": 1225 }, { "epoch": 5.7390286717378585, "grad_norm": 0.4981268048286438, "learning_rate": 4.5540186411539234e-05, "loss": 1.3644, "step": 1226 }, { "epoch": 5.743709771796372, "grad_norm": 0.5137907862663269, "learning_rate": 4.545857870499914e-05, "loss": 1.3722, "step": 1227 }, { "epoch": 5.748390871854886, "grad_norm": 0.5029009580612183, "learning_rate": 4.537698319529588e-05, "loss": 1.351, "step": 1228 }, { "epoch": 5.7530719719134, "grad_norm": 0.4825573265552521, "learning_rate": 4.52954001015695e-05, "loss": 1.3512, "step": 1229 }, { "epoch": 5.757753071971914, "grad_norm": 0.5327299237251282, "learning_rate": 4.521382964292663e-05, "loss": 1.3836, "step": 1230 }, { "epoch": 5.762434172030427, "grad_norm": 0.49493491649627686, "learning_rate": 4.513227203844003e-05, "loss": 1.353, "step": 1231 }, { "epoch": 5.767115272088941, "grad_norm": 0.4785345196723938, "learning_rate": 4.5050727507147905e-05, "loss": 1.362, "step": 1232 }, { "epoch": 5.771796372147454, "grad_norm": 0.5051293969154358, "learning_rate": 4.496919626805334e-05, "loss": 1.3773, "step": 1233 }, { "epoch": 5.776477472205968, "grad_norm": 0.48809775710105896, "learning_rate": 4.488767854012376e-05, "loss": 1.3907, "step": 1234 }, { "epoch": 5.7811585722644825, "grad_norm": 0.5047741532325745, "learning_rate": 4.4806174542290266e-05, "loss": 1.3612, "step": 1235 }, { "epoch": 5.785839672322996, "grad_norm": 0.48427894711494446, "learning_rate": 4.472468449344709e-05, "loss": 1.3631, "step": 1236 }, { "epoch": 5.79052077238151, "grad_norm": 0.49840694665908813, "learning_rate": 4.4643208612451043e-05, "loss": 1.3762, "step": 1237 }, { "epoch": 5.795201872440023, "grad_norm": 0.5097300410270691, "learning_rate": 4.456174711812082e-05, "loss": 1.3545, "step": 1238 }, { "epoch": 5.799882972498537, "grad_norm": 0.5060076117515564, "learning_rate": 4.448030022923652e-05, "loss": 1.3726, "step": 1239 }, { "epoch": 5.804564072557051, "grad_norm": 0.4817342460155487, "learning_rate": 4.4398868164539e-05, "loss": 1.3556, "step": 1240 }, { "epoch": 5.809245172615564, "grad_norm": 0.48569947481155396, "learning_rate": 4.4317451142729324e-05, "loss": 1.3296, "step": 1241 }, { "epoch": 5.813926272674078, "grad_norm": 0.5207067131996155, "learning_rate": 4.423604938246815e-05, "loss": 1.3719, "step": 1242 }, { "epoch": 5.818607372732592, "grad_norm": 0.496518075466156, "learning_rate": 4.4154663102375104e-05, "loss": 1.3624, "step": 1243 }, { "epoch": 5.823288472791106, "grad_norm": 0.49017754197120667, "learning_rate": 4.407329252102831e-05, "loss": 1.3624, "step": 1244 }, { "epoch": 5.82796957284962, "grad_norm": 0.491558700799942, "learning_rate": 4.399193785696366e-05, "loss": 1.3518, "step": 1245 }, { "epoch": 5.832650672908134, "grad_norm": 0.48818331956863403, "learning_rate": 4.3910599328674354e-05, "loss": 1.3409, "step": 1246 }, { "epoch": 5.837331772966647, "grad_norm": 0.4837343096733093, "learning_rate": 4.3829277154610246e-05, "loss": 1.3607, "step": 1247 }, { "epoch": 5.842012873025161, "grad_norm": 0.49654173851013184, "learning_rate": 4.374797155317722e-05, "loss": 1.35, "step": 1248 }, { "epoch": 5.846693973083674, "grad_norm": 0.49282020330429077, "learning_rate": 4.36666827427367e-05, "loss": 1.3615, "step": 1249 }, { "epoch": 5.851375073142188, "grad_norm": 0.48395588994026184, "learning_rate": 4.3585410941605e-05, "loss": 1.364, "step": 1250 }, { "epoch": 5.856056173200702, "grad_norm": 0.4863456189632416, "learning_rate": 4.3504156368052765e-05, "loss": 1.3643, "step": 1251 }, { "epoch": 5.860737273259216, "grad_norm": 0.4858078062534332, "learning_rate": 4.3422919240304344e-05, "loss": 1.3664, "step": 1252 }, { "epoch": 5.86541837331773, "grad_norm": 0.49061718583106995, "learning_rate": 4.334169977653725e-05, "loss": 1.3704, "step": 1253 }, { "epoch": 5.870099473376244, "grad_norm": 0.4844505488872528, "learning_rate": 4.326049819488157e-05, "loss": 1.3413, "step": 1254 }, { "epoch": 5.874780573434757, "grad_norm": 0.4844004213809967, "learning_rate": 4.3179314713419314e-05, "loss": 1.356, "step": 1255 }, { "epoch": 5.879461673493271, "grad_norm": 0.48914822936058044, "learning_rate": 4.3098149550183936e-05, "loss": 1.3519, "step": 1256 }, { "epoch": 5.884142773551785, "grad_norm": 0.4801008105278015, "learning_rate": 4.30170029231597e-05, "loss": 1.3625, "step": 1257 }, { "epoch": 5.888823873610298, "grad_norm": 0.4835020899772644, "learning_rate": 4.2935875050281e-05, "loss": 1.3461, "step": 1258 }, { "epoch": 5.893504973668812, "grad_norm": 0.5038294196128845, "learning_rate": 4.285476614943199e-05, "loss": 1.3209, "step": 1259 }, { "epoch": 5.8981860737273255, "grad_norm": 0.4965617060661316, "learning_rate": 4.277367643844574e-05, "loss": 1.3918, "step": 1260 }, { "epoch": 5.90286717378584, "grad_norm": 0.4906851351261139, "learning_rate": 4.2692606135103884e-05, "loss": 1.3669, "step": 1261 }, { "epoch": 5.907548273844354, "grad_norm": 0.48122575879096985, "learning_rate": 4.26115554571359e-05, "loss": 1.3443, "step": 1262 }, { "epoch": 5.912229373902867, "grad_norm": 0.48655739426612854, "learning_rate": 4.253052462221851e-05, "loss": 1.3694, "step": 1263 }, { "epoch": 5.916910473961381, "grad_norm": 0.4788222312927246, "learning_rate": 4.244951384797525e-05, "loss": 1.3731, "step": 1264 }, { "epoch": 5.921591574019895, "grad_norm": 0.5071664452552795, "learning_rate": 4.236852335197566e-05, "loss": 1.3895, "step": 1265 }, { "epoch": 5.926272674078408, "grad_norm": 0.49297115206718445, "learning_rate": 4.228755335173488e-05, "loss": 1.362, "step": 1266 }, { "epoch": 5.930953774136922, "grad_norm": 0.49331197142601013, "learning_rate": 4.2206604064713054e-05, "loss": 1.3823, "step": 1267 }, { "epoch": 5.935634874195436, "grad_norm": 0.4724307060241699, "learning_rate": 4.212567570831457e-05, "loss": 1.3621, "step": 1268 }, { "epoch": 5.9403159742539495, "grad_norm": 0.4933937191963196, "learning_rate": 4.204476849988773e-05, "loss": 1.3606, "step": 1269 }, { "epoch": 5.944997074312464, "grad_norm": 0.4997156858444214, "learning_rate": 4.1963882656723954e-05, "loss": 1.3673, "step": 1270 }, { "epoch": 5.949678174370977, "grad_norm": 0.49473080039024353, "learning_rate": 4.1883018396057307e-05, "loss": 1.3366, "step": 1271 }, { "epoch": 5.954359274429491, "grad_norm": 0.5106769800186157, "learning_rate": 4.180217593506394e-05, "loss": 1.3577, "step": 1272 }, { "epoch": 5.959040374488005, "grad_norm": 0.4847675561904907, "learning_rate": 4.1721355490861367e-05, "loss": 1.3707, "step": 1273 }, { "epoch": 5.963721474546518, "grad_norm": 0.5004474520683289, "learning_rate": 4.1640557280508044e-05, "loss": 1.3667, "step": 1274 }, { "epoch": 5.968402574605032, "grad_norm": 0.5049918293952942, "learning_rate": 4.1559781521002664e-05, "loss": 1.3726, "step": 1275 }, { "epoch": 5.973083674663546, "grad_norm": 0.49814268946647644, "learning_rate": 4.147902842928366e-05, "loss": 1.3621, "step": 1276 }, { "epoch": 5.9777647747220595, "grad_norm": 0.5119578838348389, "learning_rate": 4.139829822222858e-05, "loss": 1.3846, "step": 1277 }, { "epoch": 5.9824458747805735, "grad_norm": 0.4856437146663666, "learning_rate": 4.131759111665349e-05, "loss": 1.341, "step": 1278 }, { "epoch": 5.987126974839088, "grad_norm": 0.5015390515327454, "learning_rate": 4.1236907329312435e-05, "loss": 1.3391, "step": 1279 }, { "epoch": 5.991808074897601, "grad_norm": 0.5185198187828064, "learning_rate": 4.115624707689683e-05, "loss": 1.3637, "step": 1280 }, { "epoch": 5.996489174956115, "grad_norm": 0.5117372274398804, "learning_rate": 4.107561057603486e-05, "loss": 1.3667, "step": 1281 }, { "epoch": 6.001170275014628, "grad_norm": 1.6847134828567505, "learning_rate": 4.0994998043290964e-05, "loss": 1.3563, "step": 1282 }, { "epoch": 6.005851375073142, "grad_norm": 0.5279262065887451, "learning_rate": 4.0914409695165165e-05, "loss": 1.3405, "step": 1283 }, { "epoch": 6.010532475131656, "grad_norm": 0.5033664703369141, "learning_rate": 4.0833845748092586e-05, "loss": 1.3468, "step": 1284 }, { "epoch": 6.015213575190169, "grad_norm": 0.5114166736602783, "learning_rate": 4.075330641844275e-05, "loss": 1.3727, "step": 1285 }, { "epoch": 6.0198946752486835, "grad_norm": 0.5349748134613037, "learning_rate": 4.067279192251911e-05, "loss": 1.3489, "step": 1286 }, { "epoch": 6.024575775307198, "grad_norm": 0.5186452269554138, "learning_rate": 4.059230247655843e-05, "loss": 1.3473, "step": 1287 }, { "epoch": 6.029256875365711, "grad_norm": 0.5082773566246033, "learning_rate": 4.051183829673016e-05, "loss": 1.3607, "step": 1288 }, { "epoch": 6.033937975424225, "grad_norm": 0.5214784741401672, "learning_rate": 4.043139959913593e-05, "loss": 1.3429, "step": 1289 }, { "epoch": 6.038619075482738, "grad_norm": 0.5377727746963501, "learning_rate": 4.035098659980891e-05, "loss": 1.3523, "step": 1290 }, { "epoch": 6.043300175541252, "grad_norm": 0.48703083395957947, "learning_rate": 4.0270599514713256e-05, "loss": 1.3616, "step": 1291 }, { "epoch": 6.047981275599766, "grad_norm": 0.5291474461555481, "learning_rate": 4.019023855974354e-05, "loss": 1.3612, "step": 1292 }, { "epoch": 6.052662375658279, "grad_norm": 0.541254460811615, "learning_rate": 4.0109903950724134e-05, "loss": 1.3449, "step": 1293 }, { "epoch": 6.057343475716793, "grad_norm": 0.49508151412010193, "learning_rate": 4.002959590340869e-05, "loss": 1.3784, "step": 1294 }, { "epoch": 6.0620245757753075, "grad_norm": 0.4925527572631836, "learning_rate": 3.994931463347946e-05, "loss": 1.3626, "step": 1295 }, { "epoch": 6.066705675833821, "grad_norm": 0.522050142288208, "learning_rate": 3.9869060356546855e-05, "loss": 1.3445, "step": 1296 }, { "epoch": 6.071386775892335, "grad_norm": 0.498424232006073, "learning_rate": 3.978883328814875e-05, "loss": 1.3489, "step": 1297 }, { "epoch": 6.076067875950849, "grad_norm": 0.4769817590713501, "learning_rate": 3.9708633643749924e-05, "loss": 1.354, "step": 1298 }, { "epoch": 6.080748976009362, "grad_norm": 0.4856712818145752, "learning_rate": 3.9628461638741584e-05, "loss": 1.3503, "step": 1299 }, { "epoch": 6.085430076067876, "grad_norm": 0.48528093099594116, "learning_rate": 3.954831748844059e-05, "loss": 1.3485, "step": 1300 }, { "epoch": 6.090111176126389, "grad_norm": 0.489759236574173, "learning_rate": 3.94682014080891e-05, "loss": 1.3547, "step": 1301 }, { "epoch": 6.094792276184903, "grad_norm": 0.5019160509109497, "learning_rate": 3.9388113612853855e-05, "loss": 1.3499, "step": 1302 }, { "epoch": 6.0994733762434175, "grad_norm": 0.49624067544937134, "learning_rate": 3.9308054317825564e-05, "loss": 1.3506, "step": 1303 }, { "epoch": 6.104154476301931, "grad_norm": 0.5070812702178955, "learning_rate": 3.9228023738018513e-05, "loss": 1.3611, "step": 1304 }, { "epoch": 6.108835576360445, "grad_norm": 0.4832218885421753, "learning_rate": 3.914802208836973e-05, "loss": 1.3443, "step": 1305 }, { "epoch": 6.113516676418959, "grad_norm": 0.4949527382850647, "learning_rate": 3.906804958373868e-05, "loss": 1.3631, "step": 1306 }, { "epoch": 6.118197776477472, "grad_norm": 0.4915694296360016, "learning_rate": 3.898810643890648e-05, "loss": 1.351, "step": 1307 }, { "epoch": 6.122878876535986, "grad_norm": 0.5031571984291077, "learning_rate": 3.8908192868575366e-05, "loss": 1.347, "step": 1308 }, { "epoch": 6.1275599765945, "grad_norm": 0.4988706707954407, "learning_rate": 3.882830908736825e-05, "loss": 1.3601, "step": 1309 }, { "epoch": 6.132241076653013, "grad_norm": 0.49566569924354553, "learning_rate": 3.874845530982792e-05, "loss": 1.3587, "step": 1310 }, { "epoch": 6.136922176711527, "grad_norm": 0.4733045995235443, "learning_rate": 3.866863175041666e-05, "loss": 1.3419, "step": 1311 }, { "epoch": 6.141603276770041, "grad_norm": 0.48297199606895447, "learning_rate": 3.858883862351561e-05, "loss": 1.3464, "step": 1312 }, { "epoch": 6.146284376828555, "grad_norm": 0.4759235084056854, "learning_rate": 3.85090761434241e-05, "loss": 1.3634, "step": 1313 }, { "epoch": 6.150965476887069, "grad_norm": 0.4884612560272217, "learning_rate": 3.842934452435923e-05, "loss": 1.3608, "step": 1314 }, { "epoch": 6.155646576945582, "grad_norm": 0.48609334230422974, "learning_rate": 3.8349643980455165e-05, "loss": 1.3485, "step": 1315 }, { "epoch": 6.160327677004096, "grad_norm": 0.48751768469810486, "learning_rate": 3.8269974725762624e-05, "loss": 1.3482, "step": 1316 }, { "epoch": 6.16500877706261, "grad_norm": 0.49484318494796753, "learning_rate": 3.8190336974248356e-05, "loss": 1.3699, "step": 1317 }, { "epoch": 6.169689877121123, "grad_norm": 0.5006123781204224, "learning_rate": 3.811073093979439e-05, "loss": 1.3687, "step": 1318 }, { "epoch": 6.174370977179637, "grad_norm": 0.4835350215435028, "learning_rate": 3.803115683619766e-05, "loss": 1.3669, "step": 1319 }, { "epoch": 6.1790520772381505, "grad_norm": 0.482292115688324, "learning_rate": 3.7951614877169284e-05, "loss": 1.3527, "step": 1320 }, { "epoch": 6.183733177296665, "grad_norm": 0.5049312114715576, "learning_rate": 3.787210527633411e-05, "loss": 1.3698, "step": 1321 }, { "epoch": 6.188414277355179, "grad_norm": 0.5008333921432495, "learning_rate": 3.779262824723005e-05, "loss": 1.3593, "step": 1322 }, { "epoch": 6.193095377413692, "grad_norm": 0.4848458170890808, "learning_rate": 3.771318400330752e-05, "loss": 1.3598, "step": 1323 }, { "epoch": 6.197776477472206, "grad_norm": 0.4954104721546173, "learning_rate": 3.763377275792893e-05, "loss": 1.3414, "step": 1324 }, { "epoch": 6.20245757753072, "grad_norm": 0.47016122937202454, "learning_rate": 3.755439472436801e-05, "loss": 1.3665, "step": 1325 }, { "epoch": 6.207138677589233, "grad_norm": 0.5091321468353271, "learning_rate": 3.747505011580936e-05, "loss": 1.3651, "step": 1326 }, { "epoch": 6.211819777647747, "grad_norm": 0.4931752383708954, "learning_rate": 3.739573914534776e-05, "loss": 1.3484, "step": 1327 }, { "epoch": 6.216500877706261, "grad_norm": 0.49978458881378174, "learning_rate": 3.7316462025987666e-05, "loss": 1.3557, "step": 1328 }, { "epoch": 6.2211819777647746, "grad_norm": 0.5076282024383545, "learning_rate": 3.7237218970642626e-05, "loss": 1.368, "step": 1329 }, { "epoch": 6.225863077823289, "grad_norm": 0.5032605528831482, "learning_rate": 3.715801019213467e-05, "loss": 1.3485, "step": 1330 }, { "epoch": 6.230544177881802, "grad_norm": 0.515126645565033, "learning_rate": 3.7078835903193824e-05, "loss": 1.3915, "step": 1331 }, { "epoch": 6.235225277940316, "grad_norm": 0.5021912455558777, "learning_rate": 3.6999696316457465e-05, "loss": 1.366, "step": 1332 }, { "epoch": 6.23990637799883, "grad_norm": 0.4850209057331085, "learning_rate": 3.6920591644469736e-05, "loss": 1.3302, "step": 1333 }, { "epoch": 6.244587478057343, "grad_norm": 0.4773012697696686, "learning_rate": 3.684152209968107e-05, "loss": 1.3373, "step": 1334 }, { "epoch": 6.249268578115857, "grad_norm": 0.486314058303833, "learning_rate": 3.67624878944475e-05, "loss": 1.3466, "step": 1335 }, { "epoch": 6.253949678174371, "grad_norm": 0.500917375087738, "learning_rate": 3.6683489241030176e-05, "loss": 1.3574, "step": 1336 }, { "epoch": 6.2586307782328845, "grad_norm": 0.4872746765613556, "learning_rate": 3.66045263515948e-05, "loss": 1.3672, "step": 1337 }, { "epoch": 6.263311878291399, "grad_norm": 0.48777884244918823, "learning_rate": 3.6525599438210956e-05, "loss": 1.3494, "step": 1338 }, { "epoch": 6.267992978349913, "grad_norm": 0.48457857966423035, "learning_rate": 3.6446708712851666e-05, "loss": 1.3252, "step": 1339 }, { "epoch": 6.272674078408426, "grad_norm": 0.49213770031929016, "learning_rate": 3.6367854387392716e-05, "loss": 1.3592, "step": 1340 }, { "epoch": 6.27735517846694, "grad_norm": 0.4870537221431732, "learning_rate": 3.628903667361216e-05, "loss": 1.3518, "step": 1341 }, { "epoch": 6.282036278525453, "grad_norm": 0.48960310220718384, "learning_rate": 3.621025578318975e-05, "loss": 1.3404, "step": 1342 }, { "epoch": 6.286717378583967, "grad_norm": 0.48424863815307617, "learning_rate": 3.6131511927706264e-05, "loss": 1.3386, "step": 1343 }, { "epoch": 6.291398478642481, "grad_norm": 0.5022472739219666, "learning_rate": 3.605280531864311e-05, "loss": 1.3462, "step": 1344 }, { "epoch": 6.296079578700994, "grad_norm": 0.483844131231308, "learning_rate": 3.597413616738157e-05, "loss": 1.3318, "step": 1345 }, { "epoch": 6.3007606787595085, "grad_norm": 0.48008736968040466, "learning_rate": 3.589550468520242e-05, "loss": 1.3731, "step": 1346 }, { "epoch": 6.305441778818023, "grad_norm": 0.48773452639579773, "learning_rate": 3.581691108328517e-05, "loss": 1.3305, "step": 1347 }, { "epoch": 6.310122878876536, "grad_norm": 0.4856274724006653, "learning_rate": 3.573835557270768e-05, "loss": 1.3369, "step": 1348 }, { "epoch": 6.31480397893505, "grad_norm": 0.5108739137649536, "learning_rate": 3.5659838364445505e-05, "loss": 1.3618, "step": 1349 }, { "epoch": 6.319485078993564, "grad_norm": 0.47577667236328125, "learning_rate": 3.558135966937123e-05, "loss": 1.3557, "step": 1350 }, { "epoch": 6.324166179052077, "grad_norm": 0.49220508337020874, "learning_rate": 3.550291969825415e-05, "loss": 1.3505, "step": 1351 }, { "epoch": 6.328847279110591, "grad_norm": 0.47751569747924805, "learning_rate": 3.542451866175944e-05, "loss": 1.3495, "step": 1352 }, { "epoch": 6.333528379169104, "grad_norm": 0.47069454193115234, "learning_rate": 3.5346156770447755e-05, "loss": 1.3436, "step": 1353 }, { "epoch": 6.3382094792276185, "grad_norm": 0.49512651562690735, "learning_rate": 3.526783423477468e-05, "loss": 1.3386, "step": 1354 }, { "epoch": 6.3428905792861325, "grad_norm": 0.48573729395866394, "learning_rate": 3.518955126508997e-05, "loss": 1.3515, "step": 1355 }, { "epoch": 6.347571679344646, "grad_norm": 0.478813111782074, "learning_rate": 3.511130807163724e-05, "loss": 1.3693, "step": 1356 }, { "epoch": 6.35225277940316, "grad_norm": 0.48378655314445496, "learning_rate": 3.503310486455319e-05, "loss": 1.3762, "step": 1357 }, { "epoch": 6.356933879461674, "grad_norm": 0.4784218966960907, "learning_rate": 3.495494185386721e-05, "loss": 1.3584, "step": 1358 }, { "epoch": 6.361614979520187, "grad_norm": 0.4655744135379791, "learning_rate": 3.487681924950067e-05, "loss": 1.3628, "step": 1359 }, { "epoch": 6.366296079578701, "grad_norm": 0.4663156270980835, "learning_rate": 3.4798737261266456e-05, "loss": 1.363, "step": 1360 }, { "epoch": 6.370977179637215, "grad_norm": 0.48525822162628174, "learning_rate": 3.472069609886835e-05, "loss": 1.3482, "step": 1361 }, { "epoch": 6.375658279695728, "grad_norm": 0.4797033667564392, "learning_rate": 3.464269597190051e-05, "loss": 1.3476, "step": 1362 }, { "epoch": 6.3803393797542425, "grad_norm": 0.48956650495529175, "learning_rate": 3.4564737089846864e-05, "loss": 1.3509, "step": 1363 }, { "epoch": 6.385020479812756, "grad_norm": 0.48538145422935486, "learning_rate": 3.4486819662080595e-05, "loss": 1.3626, "step": 1364 }, { "epoch": 6.38970157987127, "grad_norm": 0.4890855550765991, "learning_rate": 3.440894389786352e-05, "loss": 1.3428, "step": 1365 }, { "epoch": 6.394382679929784, "grad_norm": 0.4814460873603821, "learning_rate": 3.43311100063456e-05, "loss": 1.3474, "step": 1366 }, { "epoch": 6.399063779988297, "grad_norm": 0.48000338673591614, "learning_rate": 3.42533181965643e-05, "loss": 1.3375, "step": 1367 }, { "epoch": 6.403744880046811, "grad_norm": 0.46856164932250977, "learning_rate": 3.4175568677444094e-05, "loss": 1.3479, "step": 1368 }, { "epoch": 6.408425980105325, "grad_norm": 0.4865092933177948, "learning_rate": 3.4097861657795864e-05, "loss": 1.3561, "step": 1369 }, { "epoch": 6.413107080163838, "grad_norm": 0.4856918752193451, "learning_rate": 3.4020197346316354e-05, "loss": 1.3588, "step": 1370 }, { "epoch": 6.417788180222352, "grad_norm": 0.4821678400039673, "learning_rate": 3.394257595158761e-05, "loss": 1.3443, "step": 1371 }, { "epoch": 6.422469280280866, "grad_norm": 0.4700441360473633, "learning_rate": 3.386499768207642e-05, "loss": 1.3538, "step": 1372 }, { "epoch": 6.42715038033938, "grad_norm": 0.48163434863090515, "learning_rate": 3.378746274613372e-05, "loss": 1.3491, "step": 1373 }, { "epoch": 6.431831480397894, "grad_norm": 0.47138711810112, "learning_rate": 3.370997135199413e-05, "loss": 1.3282, "step": 1374 }, { "epoch": 6.436512580456407, "grad_norm": 0.4872971475124359, "learning_rate": 3.3632523707775266e-05, "loss": 1.3651, "step": 1375 }, { "epoch": 6.441193680514921, "grad_norm": 0.4702310860157013, "learning_rate": 3.355512002147729e-05, "loss": 1.3585, "step": 1376 }, { "epoch": 6.445874780573435, "grad_norm": 0.47843244671821594, "learning_rate": 3.347776050098226e-05, "loss": 1.3806, "step": 1377 }, { "epoch": 6.450555880631948, "grad_norm": 0.47986504435539246, "learning_rate": 3.340044535405367e-05, "loss": 1.3363, "step": 1378 }, { "epoch": 6.455236980690462, "grad_norm": 0.4765663743019104, "learning_rate": 3.332317478833582e-05, "loss": 1.3649, "step": 1379 }, { "epoch": 6.4599180807489756, "grad_norm": 0.47521644830703735, "learning_rate": 3.3245949011353264e-05, "loss": 1.341, "step": 1380 }, { "epoch": 6.46459918080749, "grad_norm": 0.48216161131858826, "learning_rate": 3.3168768230510285e-05, "loss": 1.3613, "step": 1381 }, { "epoch": 6.469280280866004, "grad_norm": 0.47645995020866394, "learning_rate": 3.30916326530903e-05, "loss": 1.3461, "step": 1382 }, { "epoch": 6.473961380924517, "grad_norm": 0.4928266406059265, "learning_rate": 3.301454248625536e-05, "loss": 1.3645, "step": 1383 }, { "epoch": 6.478642480983031, "grad_norm": 0.48274198174476624, "learning_rate": 3.293749793704554e-05, "loss": 1.3732, "step": 1384 }, { "epoch": 6.483323581041545, "grad_norm": 0.48078644275665283, "learning_rate": 3.286049921237836e-05, "loss": 1.3342, "step": 1385 }, { "epoch": 6.488004681100058, "grad_norm": 0.4662472903728485, "learning_rate": 3.278354651904836e-05, "loss": 1.3518, "step": 1386 }, { "epoch": 6.492685781158572, "grad_norm": 0.4929116368293762, "learning_rate": 3.270664006372633e-05, "loss": 1.3618, "step": 1387 }, { "epoch": 6.497366881217086, "grad_norm": 0.4652104079723358, "learning_rate": 3.262978005295901e-05, "loss": 1.3246, "step": 1388 }, { "epoch": 6.5020479812756, "grad_norm": 0.4658075273036957, "learning_rate": 3.255296669316834e-05, "loss": 1.3454, "step": 1389 }, { "epoch": 6.506729081334114, "grad_norm": 0.4817008674144745, "learning_rate": 3.247620019065092e-05, "loss": 1.3584, "step": 1390 }, { "epoch": 6.511410181392627, "grad_norm": 0.4935586154460907, "learning_rate": 3.239948075157764e-05, "loss": 1.3621, "step": 1391 }, { "epoch": 6.516091281451141, "grad_norm": 0.4779127240180969, "learning_rate": 3.2322808581992825e-05, "loss": 1.3482, "step": 1392 }, { "epoch": 6.520772381509655, "grad_norm": 0.48167383670806885, "learning_rate": 3.2246183887813995e-05, "loss": 1.3475, "step": 1393 }, { "epoch": 6.525453481568168, "grad_norm": 0.4878900647163391, "learning_rate": 3.216960687483112e-05, "loss": 1.345, "step": 1394 }, { "epoch": 6.530134581626682, "grad_norm": 0.4895574748516083, "learning_rate": 3.209307774870603e-05, "loss": 1.3432, "step": 1395 }, { "epoch": 6.534815681685196, "grad_norm": 0.4896577298641205, "learning_rate": 3.2016596714972105e-05, "loss": 1.3488, "step": 1396 }, { "epoch": 6.5394967817437095, "grad_norm": 0.4758218824863434, "learning_rate": 3.19401639790334e-05, "loss": 1.3542, "step": 1397 }, { "epoch": 6.544177881802224, "grad_norm": 0.48330479860305786, "learning_rate": 3.1863779746164355e-05, "loss": 1.3713, "step": 1398 }, { "epoch": 6.548858981860738, "grad_norm": 0.4900497794151306, "learning_rate": 3.178744422150915e-05, "loss": 1.3462, "step": 1399 }, { "epoch": 6.553540081919251, "grad_norm": 0.4635152518749237, "learning_rate": 3.1711157610081096e-05, "loss": 1.3494, "step": 1400 }, { "epoch": 6.558221181977765, "grad_norm": 0.481869101524353, "learning_rate": 3.1634920116762176e-05, "loss": 1.362, "step": 1401 }, { "epoch": 6.562902282036278, "grad_norm": 0.49324488639831543, "learning_rate": 3.1558731946302435e-05, "loss": 1.3497, "step": 1402 }, { "epoch": 6.567583382094792, "grad_norm": 0.48508989810943604, "learning_rate": 3.148259330331945e-05, "loss": 1.3583, "step": 1403 }, { "epoch": 6.572264482153306, "grad_norm": 0.48708096146583557, "learning_rate": 3.1406504392297864e-05, "loss": 1.3572, "step": 1404 }, { "epoch": 6.5769455822118195, "grad_norm": 0.5007489919662476, "learning_rate": 3.133046541758862e-05, "loss": 1.353, "step": 1405 }, { "epoch": 6.5816266822703335, "grad_norm": 0.46692708134651184, "learning_rate": 3.1254476583408645e-05, "loss": 1.3486, "step": 1406 }, { "epoch": 6.586307782328848, "grad_norm": 0.45835116505622864, "learning_rate": 3.117853809384016e-05, "loss": 1.324, "step": 1407 }, { "epoch": 6.590988882387361, "grad_norm": 0.49309778213500977, "learning_rate": 3.11026501528302e-05, "loss": 1.3584, "step": 1408 }, { "epoch": 6.595669982445875, "grad_norm": 0.48946860432624817, "learning_rate": 3.1026812964190044e-05, "loss": 1.3495, "step": 1409 }, { "epoch": 6.600351082504389, "grad_norm": 0.501943051815033, "learning_rate": 3.0951026731594635e-05, "loss": 1.3578, "step": 1410 }, { "epoch": 6.605032182562902, "grad_norm": 0.48318126797676086, "learning_rate": 3.0875291658582104e-05, "loss": 1.3469, "step": 1411 }, { "epoch": 6.609713282621416, "grad_norm": 0.4796597957611084, "learning_rate": 3.079960794855315e-05, "loss": 1.3554, "step": 1412 }, { "epoch": 6.614394382679929, "grad_norm": 0.48338282108306885, "learning_rate": 3.072397580477054e-05, "loss": 1.3621, "step": 1413 }, { "epoch": 6.6190754827384435, "grad_norm": 0.47976669669151306, "learning_rate": 3.0648395430358564e-05, "loss": 1.3513, "step": 1414 }, { "epoch": 6.623756582796958, "grad_norm": 0.4931601881980896, "learning_rate": 3.057286702830246e-05, "loss": 1.3551, "step": 1415 }, { "epoch": 6.628437682855471, "grad_norm": 0.48094189167022705, "learning_rate": 3.0497390801447883e-05, "loss": 1.3491, "step": 1416 }, { "epoch": 6.633118782913985, "grad_norm": 0.48256900906562805, "learning_rate": 3.0421966952500358e-05, "loss": 1.3491, "step": 1417 }, { "epoch": 6.637799882972499, "grad_norm": 0.5100786685943604, "learning_rate": 3.0346595684024755e-05, "loss": 1.3603, "step": 1418 }, { "epoch": 6.642480983031012, "grad_norm": 0.492291122674942, "learning_rate": 3.0271277198444735e-05, "loss": 1.3553, "step": 1419 }, { "epoch": 6.647162083089526, "grad_norm": 0.47800853848457336, "learning_rate": 3.019601169804216e-05, "loss": 1.3443, "step": 1420 }, { "epoch": 6.65184318314804, "grad_norm": 0.4766044020652771, "learning_rate": 3.0120799384956642e-05, "loss": 1.3411, "step": 1421 }, { "epoch": 6.656524283206553, "grad_norm": 0.46603795886039734, "learning_rate": 3.0045640461184915e-05, "loss": 1.337, "step": 1422 }, { "epoch": 6.6612053832650675, "grad_norm": 0.5020335912704468, "learning_rate": 2.9970535128580336e-05, "loss": 1.3384, "step": 1423 }, { "epoch": 6.665886483323581, "grad_norm": 0.4919477701187134, "learning_rate": 2.9895483588852337e-05, "loss": 1.347, "step": 1424 }, { "epoch": 6.670567583382095, "grad_norm": 0.48846378922462463, "learning_rate": 2.9820486043565854e-05, "loss": 1.3286, "step": 1425 }, { "epoch": 6.675248683440609, "grad_norm": 0.4913786053657532, "learning_rate": 2.974554269414086e-05, "loss": 1.356, "step": 1426 }, { "epoch": 6.679929783499122, "grad_norm": 0.47736066579818726, "learning_rate": 2.9670653741851728e-05, "loss": 1.3642, "step": 1427 }, { "epoch": 6.684610883557636, "grad_norm": 0.4830758273601532, "learning_rate": 2.959581938782675e-05, "loss": 1.345, "step": 1428 }, { "epoch": 6.68929198361615, "grad_norm": 0.4828513562679291, "learning_rate": 2.9521039833047614e-05, "loss": 1.3532, "step": 1429 }, { "epoch": 6.693973083674663, "grad_norm": 0.4844878017902374, "learning_rate": 2.944631527834879e-05, "loss": 1.3532, "step": 1430 }, { "epoch": 6.6986541837331774, "grad_norm": 0.4727339744567871, "learning_rate": 2.937164592441708e-05, "loss": 1.3393, "step": 1431 }, { "epoch": 6.7033352837916915, "grad_norm": 0.4912458062171936, "learning_rate": 2.9297031971790955e-05, "loss": 1.3598, "step": 1432 }, { "epoch": 6.708016383850205, "grad_norm": 0.48164111375808716, "learning_rate": 2.9222473620860198e-05, "loss": 1.3473, "step": 1433 }, { "epoch": 6.712697483908719, "grad_norm": 0.48862725496292114, "learning_rate": 2.9147971071865216e-05, "loss": 1.3608, "step": 1434 }, { "epoch": 6.717378583967232, "grad_norm": 0.49266523122787476, "learning_rate": 2.9073524524896513e-05, "loss": 1.3581, "step": 1435 }, { "epoch": 6.722059684025746, "grad_norm": 0.4824602007865906, "learning_rate": 2.8999134179894244e-05, "loss": 1.3519, "step": 1436 }, { "epoch": 6.72674078408426, "grad_norm": 0.48006853461265564, "learning_rate": 2.8924800236647597e-05, "loss": 1.3383, "step": 1437 }, { "epoch": 6.731421884142773, "grad_norm": 0.48835617303848267, "learning_rate": 2.8850522894794307e-05, "loss": 1.3685, "step": 1438 }, { "epoch": 6.736102984201287, "grad_norm": 0.49173474311828613, "learning_rate": 2.877630235382007e-05, "loss": 1.3541, "step": 1439 }, { "epoch": 6.740784084259801, "grad_norm": 0.4922891855239868, "learning_rate": 2.870213881305802e-05, "loss": 1.3433, "step": 1440 }, { "epoch": 6.745465184318315, "grad_norm": 0.4828687012195587, "learning_rate": 2.862803247168825e-05, "loss": 1.3671, "step": 1441 }, { "epoch": 6.750146284376829, "grad_norm": 0.4786439836025238, "learning_rate": 2.855398352873718e-05, "loss": 1.337, "step": 1442 }, { "epoch": 6.754827384435342, "grad_norm": 0.5012813806533813, "learning_rate": 2.8479992183077142e-05, "loss": 1.3564, "step": 1443 }, { "epoch": 6.759508484493856, "grad_norm": 0.49249881505966187, "learning_rate": 2.8406058633425737e-05, "loss": 1.3373, "step": 1444 }, { "epoch": 6.76418958455237, "grad_norm": 0.47396138310432434, "learning_rate": 2.833218307834532e-05, "loss": 1.3354, "step": 1445 }, { "epoch": 6.768870684610883, "grad_norm": 0.47091707587242126, "learning_rate": 2.825836571624254e-05, "loss": 1.3607, "step": 1446 }, { "epoch": 6.773551784669397, "grad_norm": 0.47440093755722046, "learning_rate": 2.818460674536773e-05, "loss": 1.3427, "step": 1447 }, { "epoch": 6.778232884727911, "grad_norm": 0.4885600507259369, "learning_rate": 2.8110906363814426e-05, "loss": 1.3415, "step": 1448 }, { "epoch": 6.782913984786425, "grad_norm": 0.4825218915939331, "learning_rate": 2.8037264769518802e-05, "loss": 1.3577, "step": 1449 }, { "epoch": 6.787595084844939, "grad_norm": 0.5013914704322815, "learning_rate": 2.7963682160259107e-05, "loss": 1.3404, "step": 1450 }, { "epoch": 6.792276184903452, "grad_norm": 0.4755021035671234, "learning_rate": 2.7890158733655237e-05, "loss": 1.3505, "step": 1451 }, { "epoch": 6.796957284961966, "grad_norm": 0.48797109723091125, "learning_rate": 2.781669468716811e-05, "loss": 1.328, "step": 1452 }, { "epoch": 6.80163838502048, "grad_norm": 0.48389190435409546, "learning_rate": 2.774329021809916e-05, "loss": 1.3503, "step": 1453 }, { "epoch": 6.806319485078993, "grad_norm": 0.48728588223457336, "learning_rate": 2.766994552358987e-05, "loss": 1.3429, "step": 1454 }, { "epoch": 6.811000585137507, "grad_norm": 0.48784175515174866, "learning_rate": 2.7596660800621078e-05, "loss": 1.3491, "step": 1455 }, { "epoch": 6.815681685196021, "grad_norm": 0.48684337735176086, "learning_rate": 2.7523436246012686e-05, "loss": 1.3521, "step": 1456 }, { "epoch": 6.8203627852545345, "grad_norm": 0.4770592749118805, "learning_rate": 2.7450272056422864e-05, "loss": 1.357, "step": 1457 }, { "epoch": 6.825043885313049, "grad_norm": 0.47391945123672485, "learning_rate": 2.737716842834781e-05, "loss": 1.3396, "step": 1458 }, { "epoch": 6.829724985371563, "grad_norm": 0.5123258829116821, "learning_rate": 2.730412555812099e-05, "loss": 1.3551, "step": 1459 }, { "epoch": 6.834406085430076, "grad_norm": 0.47993412613868713, "learning_rate": 2.7231143641912672e-05, "loss": 1.3518, "step": 1460 }, { "epoch": 6.83908718548859, "grad_norm": 0.4489990472793579, "learning_rate": 2.7158222875729472e-05, "loss": 1.3236, "step": 1461 }, { "epoch": 6.843768285547103, "grad_norm": 0.4816996455192566, "learning_rate": 2.708536345541371e-05, "loss": 1.335, "step": 1462 }, { "epoch": 6.848449385605617, "grad_norm": 0.48490139842033386, "learning_rate": 2.7012565576643056e-05, "loss": 1.3511, "step": 1463 }, { "epoch": 6.853130485664131, "grad_norm": 0.47340846061706543, "learning_rate": 2.693982943492983e-05, "loss": 1.3428, "step": 1464 }, { "epoch": 6.8578115857226445, "grad_norm": 0.4921325743198395, "learning_rate": 2.6867155225620523e-05, "loss": 1.3538, "step": 1465 }, { "epoch": 6.862492685781159, "grad_norm": 0.4751735329627991, "learning_rate": 2.6794543143895356e-05, "loss": 1.3534, "step": 1466 }, { "epoch": 6.867173785839673, "grad_norm": 0.4749985635280609, "learning_rate": 2.672199338476763e-05, "loss": 1.3496, "step": 1467 }, { "epoch": 6.871854885898186, "grad_norm": 0.4956899881362915, "learning_rate": 2.6649506143083302e-05, "loss": 1.355, "step": 1468 }, { "epoch": 6.8765359859567, "grad_norm": 0.48751288652420044, "learning_rate": 2.6577081613520487e-05, "loss": 1.3493, "step": 1469 }, { "epoch": 6.881217086015214, "grad_norm": 0.4699368476867676, "learning_rate": 2.650471999058875e-05, "loss": 1.3395, "step": 1470 }, { "epoch": 6.885898186073727, "grad_norm": 0.484819233417511, "learning_rate": 2.643242146862881e-05, "loss": 1.3556, "step": 1471 }, { "epoch": 6.890579286132241, "grad_norm": 0.48918503522872925, "learning_rate": 2.6360186241811847e-05, "loss": 1.3558, "step": 1472 }, { "epoch": 6.895260386190754, "grad_norm": 0.4830945134162903, "learning_rate": 2.6288014504139104e-05, "loss": 1.3523, "step": 1473 }, { "epoch": 6.8999414862492685, "grad_norm": 0.46876898407936096, "learning_rate": 2.6215906449441273e-05, "loss": 1.3617, "step": 1474 }, { "epoch": 6.904622586307783, "grad_norm": 0.49341315031051636, "learning_rate": 2.6143862271378052e-05, "loss": 1.3386, "step": 1475 }, { "epoch": 6.909303686366296, "grad_norm": 0.4862746298313141, "learning_rate": 2.607188216343756e-05, "loss": 1.3538, "step": 1476 }, { "epoch": 6.91398478642481, "grad_norm": 0.4845553934574127, "learning_rate": 2.5999966318935832e-05, "loss": 1.3513, "step": 1477 }, { "epoch": 6.918665886483324, "grad_norm": 0.472523033618927, "learning_rate": 2.5928114931016333e-05, "loss": 1.3533, "step": 1478 }, { "epoch": 6.923346986541837, "grad_norm": 0.4786662757396698, "learning_rate": 2.5856328192649426e-05, "loss": 1.3594, "step": 1479 }, { "epoch": 6.928028086600351, "grad_norm": 0.4927366077899933, "learning_rate": 2.578460629663182e-05, "loss": 1.3549, "step": 1480 }, { "epoch": 6.932709186658865, "grad_norm": 0.4627937972545624, "learning_rate": 2.571294943558612e-05, "loss": 1.3319, "step": 1481 }, { "epoch": 6.9373902867173785, "grad_norm": 0.4778825044631958, "learning_rate": 2.5641357801960187e-05, "loss": 1.3407, "step": 1482 }, { "epoch": 6.9420713867758925, "grad_norm": 0.4876656234264374, "learning_rate": 2.556983158802677e-05, "loss": 1.3611, "step": 1483 }, { "epoch": 6.946752486834406, "grad_norm": 0.5010808110237122, "learning_rate": 2.5498370985882926e-05, "loss": 1.3416, "step": 1484 }, { "epoch": 6.95143358689292, "grad_norm": 0.47430136799812317, "learning_rate": 2.542697618744945e-05, "loss": 1.3295, "step": 1485 }, { "epoch": 6.956114686951434, "grad_norm": 0.49215957522392273, "learning_rate": 2.535564738447047e-05, "loss": 1.3425, "step": 1486 }, { "epoch": 6.960795787009947, "grad_norm": 0.4640463590621948, "learning_rate": 2.528438476851279e-05, "loss": 1.3567, "step": 1487 }, { "epoch": 6.965476887068461, "grad_norm": 0.4978911578655243, "learning_rate": 2.5213188530965525e-05, "loss": 1.3512, "step": 1488 }, { "epoch": 6.970157987126975, "grad_norm": 0.4663708806037903, "learning_rate": 2.5142058863039498e-05, "loss": 1.3495, "step": 1489 }, { "epoch": 6.974839087185488, "grad_norm": 0.48394128680229187, "learning_rate": 2.507099595576674e-05, "loss": 1.3511, "step": 1490 }, { "epoch": 6.9795201872440025, "grad_norm": 0.49097976088523865, "learning_rate": 2.500000000000001e-05, "loss": 1.33, "step": 1491 }, { "epoch": 6.9842012873025165, "grad_norm": 0.47492849826812744, "learning_rate": 2.4929071186412194e-05, "loss": 1.3337, "step": 1492 }, { "epoch": 6.98888238736103, "grad_norm": 0.48000726103782654, "learning_rate": 2.485820970549592e-05, "loss": 1.3532, "step": 1493 }, { "epoch": 6.993563487419544, "grad_norm": 0.4719773232936859, "learning_rate": 2.4787415747562958e-05, "loss": 1.3438, "step": 1494 }, { "epoch": 6.998244587478057, "grad_norm": 0.4728144407272339, "learning_rate": 2.4716689502743724e-05, "loss": 1.3398, "step": 1495 }, { "epoch": 7.002925687536571, "grad_norm": 1.8404959440231323, "learning_rate": 2.4646031160986803e-05, "loss": 1.2867, "step": 1496 }, { "epoch": 7.007606787595085, "grad_norm": 0.4895849823951721, "learning_rate": 2.4575440912058374e-05, "loss": 1.3655, "step": 1497 }, { "epoch": 7.012287887653598, "grad_norm": 0.5031997561454773, "learning_rate": 2.450491894554176e-05, "loss": 1.3445, "step": 1498 }, { "epoch": 7.016968987712112, "grad_norm": 0.4948151111602783, "learning_rate": 2.4434465450836917e-05, "loss": 1.3692, "step": 1499 }, { "epoch": 7.0216500877706265, "grad_norm": 0.47969523072242737, "learning_rate": 2.4364080617159886e-05, "loss": 1.3499, "step": 1500 }, { "epoch": 7.02633118782914, "grad_norm": 0.4878917932510376, "learning_rate": 2.429376463354232e-05, "loss": 1.3335, "step": 1501 }, { "epoch": 7.031012287887654, "grad_norm": 0.4990362524986267, "learning_rate": 2.4223517688830933e-05, "loss": 1.3581, "step": 1502 }, { "epoch": 7.035693387946167, "grad_norm": 0.4753541648387909, "learning_rate": 2.415333997168705e-05, "loss": 1.3447, "step": 1503 }, { "epoch": 7.040374488004681, "grad_norm": 0.4919619858264923, "learning_rate": 2.4083231670586082e-05, "loss": 1.3306, "step": 1504 }, { "epoch": 7.045055588063195, "grad_norm": 0.46794456243515015, "learning_rate": 2.4013192973816944e-05, "loss": 1.3356, "step": 1505 }, { "epoch": 7.049736688121708, "grad_norm": 0.46644270420074463, "learning_rate": 2.3943224069481734e-05, "loss": 1.3382, "step": 1506 }, { "epoch": 7.054417788180222, "grad_norm": 0.4738415479660034, "learning_rate": 2.387332514549499e-05, "loss": 1.3437, "step": 1507 }, { "epoch": 7.059098888238736, "grad_norm": 0.4804893136024475, "learning_rate": 2.3803496389583364e-05, "loss": 1.364, "step": 1508 }, { "epoch": 7.06377998829725, "grad_norm": 0.4757007360458374, "learning_rate": 2.373373798928507e-05, "loss": 1.3311, "step": 1509 }, { "epoch": 7.068461088355764, "grad_norm": 0.4703696370124817, "learning_rate": 2.36640501319493e-05, "loss": 1.3387, "step": 1510 }, { "epoch": 7.073142188414278, "grad_norm": 0.4751465916633606, "learning_rate": 2.3594433004735906e-05, "loss": 1.3535, "step": 1511 }, { "epoch": 7.077823288472791, "grad_norm": 0.4735172986984253, "learning_rate": 2.3524886794614653e-05, "loss": 1.3424, "step": 1512 }, { "epoch": 7.082504388531305, "grad_norm": 0.48852401971817017, "learning_rate": 2.3455411688364925e-05, "loss": 1.339, "step": 1513 }, { "epoch": 7.087185488589818, "grad_norm": 0.47311729192733765, "learning_rate": 2.3386007872575143e-05, "loss": 1.342, "step": 1514 }, { "epoch": 7.091866588648332, "grad_norm": 0.466234028339386, "learning_rate": 2.3316675533642214e-05, "loss": 1.3322, "step": 1515 }, { "epoch": 7.096547688706846, "grad_norm": 0.46330374479293823, "learning_rate": 2.3247414857771115e-05, "loss": 1.3402, "step": 1516 }, { "epoch": 7.10122878876536, "grad_norm": 0.48585692048072815, "learning_rate": 2.317822603097436e-05, "loss": 1.3419, "step": 1517 }, { "epoch": 7.105909888823874, "grad_norm": 0.4767219126224518, "learning_rate": 2.3109109239071492e-05, "loss": 1.355, "step": 1518 }, { "epoch": 7.110590988882388, "grad_norm": 0.4704568088054657, "learning_rate": 2.304006466768861e-05, "loss": 1.3285, "step": 1519 }, { "epoch": 7.115272088940901, "grad_norm": 0.4667569100856781, "learning_rate": 2.29710925022578e-05, "loss": 1.3553, "step": 1520 }, { "epoch": 7.119953188999415, "grad_norm": 0.4830743074417114, "learning_rate": 2.2902192928016734e-05, "loss": 1.3438, "step": 1521 }, { "epoch": 7.124634289057928, "grad_norm": 0.4988841712474823, "learning_rate": 2.2833366130008117e-05, "loss": 1.3273, "step": 1522 }, { "epoch": 7.129315389116442, "grad_norm": 0.4610385596752167, "learning_rate": 2.276461229307921e-05, "loss": 1.3582, "step": 1523 }, { "epoch": 7.133996489174956, "grad_norm": 0.4775535762310028, "learning_rate": 2.2695931601881276e-05, "loss": 1.3579, "step": 1524 }, { "epoch": 7.1386775892334695, "grad_norm": 0.4786890745162964, "learning_rate": 2.262732424086918e-05, "loss": 1.3439, "step": 1525 }, { "epoch": 7.143358689291984, "grad_norm": 0.45839619636535645, "learning_rate": 2.2558790394300838e-05, "loss": 1.3358, "step": 1526 }, { "epoch": 7.148039789350498, "grad_norm": 0.47785043716430664, "learning_rate": 2.249033024623672e-05, "loss": 1.3269, "step": 1527 }, { "epoch": 7.152720889409011, "grad_norm": 0.4758484363555908, "learning_rate": 2.2421943980539384e-05, "loss": 1.3318, "step": 1528 }, { "epoch": 7.157401989467525, "grad_norm": 0.48065611720085144, "learning_rate": 2.23536317808729e-05, "loss": 1.3469, "step": 1529 }, { "epoch": 7.162083089526039, "grad_norm": 0.45679306983947754, "learning_rate": 2.22853938307025e-05, "loss": 1.3517, "step": 1530 }, { "epoch": 7.166764189584552, "grad_norm": 0.4631989002227783, "learning_rate": 2.2217230313293953e-05, "loss": 1.3512, "step": 1531 }, { "epoch": 7.171445289643066, "grad_norm": 0.47275522351264954, "learning_rate": 2.2149141411713154e-05, "loss": 1.3471, "step": 1532 }, { "epoch": 7.1761263897015795, "grad_norm": 0.48173314332962036, "learning_rate": 2.2081127308825612e-05, "loss": 1.3281, "step": 1533 }, { "epoch": 7.1808074897600935, "grad_norm": 0.47281208634376526, "learning_rate": 2.201318818729588e-05, "loss": 1.3564, "step": 1534 }, { "epoch": 7.185488589818608, "grad_norm": 0.45339658856391907, "learning_rate": 2.1945324229587217e-05, "loss": 1.3358, "step": 1535 }, { "epoch": 7.190169689877121, "grad_norm": 0.4726531505584717, "learning_rate": 2.187753561796097e-05, "loss": 1.3333, "step": 1536 }, { "epoch": 7.194850789935635, "grad_norm": 0.46895188093185425, "learning_rate": 2.180982253447615e-05, "loss": 1.3366, "step": 1537 }, { "epoch": 7.199531889994149, "grad_norm": 0.4762912094593048, "learning_rate": 2.174218516098893e-05, "loss": 1.3633, "step": 1538 }, { "epoch": 7.204212990052662, "grad_norm": 0.48002806305885315, "learning_rate": 2.1674623679152105e-05, "loss": 1.3578, "step": 1539 }, { "epoch": 7.208894090111176, "grad_norm": 0.47279906272888184, "learning_rate": 2.1607138270414695e-05, "loss": 1.3504, "step": 1540 }, { "epoch": 7.21357519016969, "grad_norm": 0.45677801966667175, "learning_rate": 2.153972911602139e-05, "loss": 1.3378, "step": 1541 }, { "epoch": 7.2182562902282035, "grad_norm": 0.4792294502258301, "learning_rate": 2.14723963970121e-05, "loss": 1.3568, "step": 1542 }, { "epoch": 7.2229373902867176, "grad_norm": 0.49012744426727295, "learning_rate": 2.1405140294221453e-05, "loss": 1.3102, "step": 1543 }, { "epoch": 7.227618490345231, "grad_norm": 0.47090527415275574, "learning_rate": 2.1337960988278265e-05, "loss": 1.3445, "step": 1544 }, { "epoch": 7.232299590403745, "grad_norm": 0.46279802918434143, "learning_rate": 2.1270858659605158e-05, "loss": 1.3388, "step": 1545 }, { "epoch": 7.236980690462259, "grad_norm": 0.4582454562187195, "learning_rate": 2.1203833488418014e-05, "loss": 1.3283, "step": 1546 }, { "epoch": 7.241661790520772, "grad_norm": 0.46114638447761536, "learning_rate": 2.1136885654725412e-05, "loss": 1.3387, "step": 1547 }, { "epoch": 7.246342890579286, "grad_norm": 0.47302740812301636, "learning_rate": 2.107001533832837e-05, "loss": 1.3528, "step": 1548 }, { "epoch": 7.2510239906378, "grad_norm": 0.45719102025032043, "learning_rate": 2.1003222718819575e-05, "loss": 1.3551, "step": 1549 }, { "epoch": 7.255705090696313, "grad_norm": 0.4898391366004944, "learning_rate": 2.093650797558313e-05, "loss": 1.342, "step": 1550 }, { "epoch": 7.2603861907548275, "grad_norm": 0.4674202799797058, "learning_rate": 2.0869871287793985e-05, "loss": 1.3365, "step": 1551 }, { "epoch": 7.265067290813342, "grad_norm": 0.45634809136390686, "learning_rate": 2.080331283441738e-05, "loss": 1.3345, "step": 1552 }, { "epoch": 7.269748390871855, "grad_norm": 0.4628802239894867, "learning_rate": 2.0736832794208573e-05, "loss": 1.3492, "step": 1553 }, { "epoch": 7.274429490930369, "grad_norm": 0.45460382103919983, "learning_rate": 2.0670431345712092e-05, "loss": 1.3191, "step": 1554 }, { "epoch": 7.279110590988882, "grad_norm": 0.4698864817619324, "learning_rate": 2.0604108667261483e-05, "loss": 1.3391, "step": 1555 }, { "epoch": 7.283791691047396, "grad_norm": 0.4649961292743683, "learning_rate": 2.0537864936978722e-05, "loss": 1.3407, "step": 1556 }, { "epoch": 7.28847279110591, "grad_norm": 0.4733186960220337, "learning_rate": 2.0471700332773715e-05, "loss": 1.3436, "step": 1557 }, { "epoch": 7.293153891164423, "grad_norm": 0.4652688503265381, "learning_rate": 2.04056150323439e-05, "loss": 1.3421, "step": 1558 }, { "epoch": 7.297834991222937, "grad_norm": 0.47101065516471863, "learning_rate": 2.0339609213173726e-05, "loss": 1.3424, "step": 1559 }, { "epoch": 7.3025160912814515, "grad_norm": 0.47479957342147827, "learning_rate": 2.0273683052534175e-05, "loss": 1.3319, "step": 1560 }, { "epoch": 7.307197191339965, "grad_norm": 0.4701590836048126, "learning_rate": 2.02078367274823e-05, "loss": 1.3557, "step": 1561 }, { "epoch": 7.311878291398479, "grad_norm": 0.46992096304893494, "learning_rate": 2.0142070414860704e-05, "loss": 1.3161, "step": 1562 }, { "epoch": 7.316559391456993, "grad_norm": 0.4736483693122864, "learning_rate": 2.0076384291297134e-05, "loss": 1.3502, "step": 1563 }, { "epoch": 7.321240491515506, "grad_norm": 0.478202760219574, "learning_rate": 2.0010778533203982e-05, "loss": 1.3343, "step": 1564 }, { "epoch": 7.32592159157402, "grad_norm": 0.48485836386680603, "learning_rate": 1.9945253316777785e-05, "loss": 1.3362, "step": 1565 }, { "epoch": 7.330602691632533, "grad_norm": 0.47226443886756897, "learning_rate": 1.987980881799879e-05, "loss": 1.3349, "step": 1566 }, { "epoch": 7.335283791691047, "grad_norm": 0.47480282187461853, "learning_rate": 1.981444521263041e-05, "loss": 1.3419, "step": 1567 }, { "epoch": 7.3399648917495615, "grad_norm": 0.4802916944026947, "learning_rate": 1.974916267621887e-05, "loss": 1.343, "step": 1568 }, { "epoch": 7.344645991808075, "grad_norm": 0.4808948338031769, "learning_rate": 1.9683961384092626e-05, "loss": 1.3161, "step": 1569 }, { "epoch": 7.349327091866589, "grad_norm": 0.47402530908584595, "learning_rate": 1.9618841511361957e-05, "loss": 1.3419, "step": 1570 }, { "epoch": 7.354008191925103, "grad_norm": 0.46208834648132324, "learning_rate": 1.9553803232918482e-05, "loss": 1.333, "step": 1571 }, { "epoch": 7.358689291983616, "grad_norm": 0.4791894257068634, "learning_rate": 1.9488846723434646e-05, "loss": 1.3353, "step": 1572 }, { "epoch": 7.36337039204213, "grad_norm": 0.4807760417461395, "learning_rate": 1.9423972157363318e-05, "loss": 1.3629, "step": 1573 }, { "epoch": 7.368051492100643, "grad_norm": 0.49156269431114197, "learning_rate": 1.935917970893729e-05, "loss": 1.3448, "step": 1574 }, { "epoch": 7.372732592159157, "grad_norm": 0.46572548151016235, "learning_rate": 1.9294469552168813e-05, "loss": 1.3447, "step": 1575 }, { "epoch": 7.377413692217671, "grad_norm": 0.4842897057533264, "learning_rate": 1.9229841860849134e-05, "loss": 1.326, "step": 1576 }, { "epoch": 7.382094792276185, "grad_norm": 0.46771901845932007, "learning_rate": 1.916529680854799e-05, "loss": 1.3459, "step": 1577 }, { "epoch": 7.386775892334699, "grad_norm": 0.4856923520565033, "learning_rate": 1.910083456861321e-05, "loss": 1.3448, "step": 1578 }, { "epoch": 7.391456992393213, "grad_norm": 0.4657568335533142, "learning_rate": 1.90364553141702e-05, "loss": 1.3459, "step": 1579 }, { "epoch": 7.396138092451726, "grad_norm": 0.4637135863304138, "learning_rate": 1.89721592181215e-05, "loss": 1.3061, "step": 1580 }, { "epoch": 7.40081919251024, "grad_norm": 0.4781709909439087, "learning_rate": 1.890794645314633e-05, "loss": 1.3539, "step": 1581 }, { "epoch": 7.405500292568753, "grad_norm": 0.460147500038147, "learning_rate": 1.8843817191700047e-05, "loss": 1.3321, "step": 1582 }, { "epoch": 7.410181392627267, "grad_norm": 0.4599703252315521, "learning_rate": 1.8779771606013808e-05, "loss": 1.3311, "step": 1583 }, { "epoch": 7.414862492685781, "grad_norm": 0.46007290482521057, "learning_rate": 1.871580986809402e-05, "loss": 1.339, "step": 1584 }, { "epoch": 7.4195435927442945, "grad_norm": 0.4541688561439514, "learning_rate": 1.8651932149721897e-05, "loss": 1.3403, "step": 1585 }, { "epoch": 7.424224692802809, "grad_norm": 0.4677172899246216, "learning_rate": 1.8588138622453027e-05, "loss": 1.3507, "step": 1586 }, { "epoch": 7.428905792861323, "grad_norm": 0.472500741481781, "learning_rate": 1.8524429457616832e-05, "loss": 1.3553, "step": 1587 }, { "epoch": 7.433586892919836, "grad_norm": 0.4556261897087097, "learning_rate": 1.846080482631623e-05, "loss": 1.3387, "step": 1588 }, { "epoch": 7.43826799297835, "grad_norm": 0.4568939208984375, "learning_rate": 1.839726489942703e-05, "loss": 1.3512, "step": 1589 }, { "epoch": 7.442949093036864, "grad_norm": 0.46659862995147705, "learning_rate": 1.8333809847597642e-05, "loss": 1.3337, "step": 1590 }, { "epoch": 7.447630193095377, "grad_norm": 0.4842297434806824, "learning_rate": 1.8270439841248492e-05, "loss": 1.3537, "step": 1591 }, { "epoch": 7.452311293153891, "grad_norm": 0.4623686373233795, "learning_rate": 1.820715505057155e-05, "loss": 1.3189, "step": 1592 }, { "epoch": 7.4569923932124045, "grad_norm": 0.45595666766166687, "learning_rate": 1.8143955645530007e-05, "loss": 1.3282, "step": 1593 }, { "epoch": 7.4616734932709186, "grad_norm": 0.4729546904563904, "learning_rate": 1.8080841795857633e-05, "loss": 1.3528, "step": 1594 }, { "epoch": 7.466354593329433, "grad_norm": 0.47479796409606934, "learning_rate": 1.8017813671058547e-05, "loss": 1.3553, "step": 1595 }, { "epoch": 7.471035693387946, "grad_norm": 0.48458370566368103, "learning_rate": 1.7954871440406562e-05, "loss": 1.3396, "step": 1596 }, { "epoch": 7.47571679344646, "grad_norm": 0.4696284830570221, "learning_rate": 1.789201527294479e-05, "loss": 1.3267, "step": 1597 }, { "epoch": 7.480397893504974, "grad_norm": 0.45866280794143677, "learning_rate": 1.7829245337485267e-05, "loss": 1.3247, "step": 1598 }, { "epoch": 7.485078993563487, "grad_norm": 0.46521177887916565, "learning_rate": 1.7766561802608373e-05, "loss": 1.3272, "step": 1599 }, { "epoch": 7.489760093622001, "grad_norm": 0.4628099799156189, "learning_rate": 1.7703964836662462e-05, "loss": 1.3311, "step": 1600 }, { "epoch": 7.494441193680515, "grad_norm": 0.47476711869239807, "learning_rate": 1.7641454607763464e-05, "loss": 1.3509, "step": 1601 }, { "epoch": 7.4991222937390285, "grad_norm": 0.45366212725639343, "learning_rate": 1.7579031283794236e-05, "loss": 1.3437, "step": 1602 }, { "epoch": 7.503803393797543, "grad_norm": 0.45894238352775574, "learning_rate": 1.751669503240434e-05, "loss": 1.3451, "step": 1603 }, { "epoch": 7.508484493856056, "grad_norm": 0.49631863832473755, "learning_rate": 1.7454446021009408e-05, "loss": 1.3531, "step": 1604 }, { "epoch": 7.51316559391457, "grad_norm": 0.4589841067790985, "learning_rate": 1.739228441679081e-05, "loss": 1.3305, "step": 1605 }, { "epoch": 7.517846693973084, "grad_norm": 0.4836769104003906, "learning_rate": 1.7330210386695216e-05, "loss": 1.3564, "step": 1606 }, { "epoch": 7.522527794031597, "grad_norm": 0.46482783555984497, "learning_rate": 1.726822409743401e-05, "loss": 1.3357, "step": 1607 }, { "epoch": 7.527208894090111, "grad_norm": 0.4549084007740021, "learning_rate": 1.7206325715483e-05, "loss": 1.3272, "step": 1608 }, { "epoch": 7.531889994148625, "grad_norm": 0.45632073283195496, "learning_rate": 1.7144515407081845e-05, "loss": 1.3506, "step": 1609 }, { "epoch": 7.536571094207138, "grad_norm": 0.4481995105743408, "learning_rate": 1.7082793338233716e-05, "loss": 1.3287, "step": 1610 }, { "epoch": 7.5412521942656525, "grad_norm": 0.46668148040771484, "learning_rate": 1.702115967470477e-05, "loss": 1.3391, "step": 1611 }, { "epoch": 7.545933294324167, "grad_norm": 0.4636210501194, "learning_rate": 1.6959614582023752e-05, "loss": 1.3302, "step": 1612 }, { "epoch": 7.55061439438268, "grad_norm": 0.4625440239906311, "learning_rate": 1.6898158225481548e-05, "loss": 1.3625, "step": 1613 }, { "epoch": 7.555295494441194, "grad_norm": 0.46369972825050354, "learning_rate": 1.683679077013066e-05, "loss": 1.3406, "step": 1614 }, { "epoch": 7.559976594499707, "grad_norm": 0.46081992983818054, "learning_rate": 1.6775512380784903e-05, "loss": 1.3516, "step": 1615 }, { "epoch": 7.564657694558221, "grad_norm": 0.4579129219055176, "learning_rate": 1.6714323222018858e-05, "loss": 1.335, "step": 1616 }, { "epoch": 7.569338794616735, "grad_norm": 0.47029703855514526, "learning_rate": 1.665322345816746e-05, "loss": 1.345, "step": 1617 }, { "epoch": 7.574019894675248, "grad_norm": 0.45141252875328064, "learning_rate": 1.6592213253325578e-05, "loss": 1.3341, "step": 1618 }, { "epoch": 7.5787009947337625, "grad_norm": 0.44940832257270813, "learning_rate": 1.65312927713475e-05, "loss": 1.3287, "step": 1619 }, { "epoch": 7.5833820947922765, "grad_norm": 0.45708319544792175, "learning_rate": 1.647046217584661e-05, "loss": 1.3354, "step": 1620 }, { "epoch": 7.58806319485079, "grad_norm": 0.4675891399383545, "learning_rate": 1.640972163019484e-05, "loss": 1.3383, "step": 1621 }, { "epoch": 7.592744294909304, "grad_norm": 0.4550844132900238, "learning_rate": 1.63490712975223e-05, "loss": 1.3535, "step": 1622 }, { "epoch": 7.597425394967818, "grad_norm": 0.46031132340431213, "learning_rate": 1.6288511340716833e-05, "loss": 1.3197, "step": 1623 }, { "epoch": 7.602106495026331, "grad_norm": 0.46827811002731323, "learning_rate": 1.6228041922423498e-05, "loss": 1.3385, "step": 1624 }, { "epoch": 7.606787595084845, "grad_norm": 0.4582699239253998, "learning_rate": 1.6167663205044247e-05, "loss": 1.341, "step": 1625 }, { "epoch": 7.611468695143358, "grad_norm": 0.46605154871940613, "learning_rate": 1.6107375350737437e-05, "loss": 1.342, "step": 1626 }, { "epoch": 7.616149795201872, "grad_norm": 0.46738141775131226, "learning_rate": 1.6047178521417376e-05, "loss": 1.3368, "step": 1627 }, { "epoch": 7.6208308952603865, "grad_norm": 0.47278207540512085, "learning_rate": 1.5987072878753933e-05, "loss": 1.3454, "step": 1628 }, { "epoch": 7.6255119953189, "grad_norm": 0.46338996291160583, "learning_rate": 1.5927058584172034e-05, "loss": 1.3399, "step": 1629 }, { "epoch": 7.630193095377414, "grad_norm": 0.4683593511581421, "learning_rate": 1.5867135798851306e-05, "loss": 1.341, "step": 1630 }, { "epoch": 7.634874195435928, "grad_norm": 0.46051323413848877, "learning_rate": 1.580730468372561e-05, "loss": 1.3181, "step": 1631 }, { "epoch": 7.639555295494441, "grad_norm": 0.4732109010219574, "learning_rate": 1.5747565399482604e-05, "loss": 1.3449, "step": 1632 }, { "epoch": 7.644236395552955, "grad_norm": 0.4626827836036682, "learning_rate": 1.5687918106563326e-05, "loss": 1.3269, "step": 1633 }, { "epoch": 7.648917495611469, "grad_norm": 0.4615435004234314, "learning_rate": 1.5628362965161724e-05, "loss": 1.3449, "step": 1634 }, { "epoch": 7.653598595669982, "grad_norm": 0.46386954188346863, "learning_rate": 1.556890013522428e-05, "loss": 1.3274, "step": 1635 }, { "epoch": 7.658279695728496, "grad_norm": 0.4549250304698944, "learning_rate": 1.5509529776449577e-05, "loss": 1.3291, "step": 1636 }, { "epoch": 7.66296079578701, "grad_norm": 0.4708329737186432, "learning_rate": 1.545025204828777e-05, "loss": 1.3441, "step": 1637 }, { "epoch": 7.667641895845524, "grad_norm": 0.4606434404850006, "learning_rate": 1.5391067109940366e-05, "loss": 1.3346, "step": 1638 }, { "epoch": 7.672322995904038, "grad_norm": 0.4673614501953125, "learning_rate": 1.5331975120359536e-05, "loss": 1.3247, "step": 1639 }, { "epoch": 7.677004095962551, "grad_norm": 0.45525097846984863, "learning_rate": 1.5272976238247903e-05, "loss": 1.3584, "step": 1640 }, { "epoch": 7.681685196021065, "grad_norm": 0.4707421660423279, "learning_rate": 1.5214070622058013e-05, "loss": 1.3507, "step": 1641 }, { "epoch": 7.686366296079578, "grad_norm": 0.46960994601249695, "learning_rate": 1.5155258429991882e-05, "loss": 1.342, "step": 1642 }, { "epoch": 7.691047396138092, "grad_norm": 0.46895959973335266, "learning_rate": 1.5096539820000721e-05, "loss": 1.3518, "step": 1643 }, { "epoch": 7.695728496196606, "grad_norm": 0.47289028763771057, "learning_rate": 1.5037914949784299e-05, "loss": 1.3365, "step": 1644 }, { "epoch": 7.7004095962551204, "grad_norm": 0.46617165207862854, "learning_rate": 1.4979383976790695e-05, "loss": 1.34, "step": 1645 }, { "epoch": 7.705090696313634, "grad_norm": 0.45962589979171753, "learning_rate": 1.49209470582158e-05, "loss": 1.3341, "step": 1646 }, { "epoch": 7.709771796372148, "grad_norm": 0.4578133523464203, "learning_rate": 1.4862604351002867e-05, "loss": 1.3283, "step": 1647 }, { "epoch": 7.714452896430661, "grad_norm": 0.4768761098384857, "learning_rate": 1.480435601184218e-05, "loss": 1.3328, "step": 1648 }, { "epoch": 7.719133996489175, "grad_norm": 0.45739585161209106, "learning_rate": 1.4746202197170544e-05, "loss": 1.319, "step": 1649 }, { "epoch": 7.723815096547689, "grad_norm": 0.4618678092956543, "learning_rate": 1.4688143063170923e-05, "loss": 1.3188, "step": 1650 }, { "epoch": 7.728496196606202, "grad_norm": 0.4686064124107361, "learning_rate": 1.463017876577199e-05, "loss": 1.3439, "step": 1651 }, { "epoch": 7.733177296664716, "grad_norm": 0.47376856207847595, "learning_rate": 1.4572309460647693e-05, "loss": 1.3228, "step": 1652 }, { "epoch": 7.7378583967232295, "grad_norm": 0.4711912274360657, "learning_rate": 1.4514535303216892e-05, "loss": 1.3317, "step": 1653 }, { "epoch": 7.742539496781744, "grad_norm": 0.45707976818084717, "learning_rate": 1.4456856448642903e-05, "loss": 1.3257, "step": 1654 }, { "epoch": 7.747220596840258, "grad_norm": 0.4613284170627594, "learning_rate": 1.4399273051833084e-05, "loss": 1.3464, "step": 1655 }, { "epoch": 7.751901696898771, "grad_norm": 0.4727132022380829, "learning_rate": 1.434178526743844e-05, "loss": 1.3484, "step": 1656 }, { "epoch": 7.756582796957285, "grad_norm": 0.4638311564922333, "learning_rate": 1.4284393249853145e-05, "loss": 1.3378, "step": 1657 }, { "epoch": 7.761263897015799, "grad_norm": 0.4656262695789337, "learning_rate": 1.422709715321422e-05, "loss": 1.3348, "step": 1658 }, { "epoch": 7.765944997074312, "grad_norm": 0.47884249687194824, "learning_rate": 1.416989713140105e-05, "loss": 1.3641, "step": 1659 }, { "epoch": 7.770626097132826, "grad_norm": 0.4640868604183197, "learning_rate": 1.4112793338035002e-05, "loss": 1.2996, "step": 1660 }, { "epoch": 7.77530719719134, "grad_norm": 0.4635297656059265, "learning_rate": 1.4055785926479021e-05, "loss": 1.3456, "step": 1661 }, { "epoch": 7.7799882972498535, "grad_norm": 0.4484289586544037, "learning_rate": 1.399887504983714e-05, "loss": 1.333, "step": 1662 }, { "epoch": 7.784669397308368, "grad_norm": 0.4604371190071106, "learning_rate": 1.3942060860954192e-05, "loss": 1.3154, "step": 1663 }, { "epoch": 7.789350497366881, "grad_norm": 0.45666757225990295, "learning_rate": 1.388534351241531e-05, "loss": 1.3377, "step": 1664 }, { "epoch": 7.794031597425395, "grad_norm": 0.4564230442047119, "learning_rate": 1.3828723156545553e-05, "loss": 1.3194, "step": 1665 }, { "epoch": 7.798712697483909, "grad_norm": 0.4641699492931366, "learning_rate": 1.3772199945409497e-05, "loss": 1.348, "step": 1666 }, { "epoch": 7.803393797542422, "grad_norm": 0.4646790027618408, "learning_rate": 1.3715774030810773e-05, "loss": 1.3318, "step": 1667 }, { "epoch": 7.808074897600936, "grad_norm": 0.46432945132255554, "learning_rate": 1.3659445564291757e-05, "loss": 1.3358, "step": 1668 }, { "epoch": 7.81275599765945, "grad_norm": 0.46811240911483765, "learning_rate": 1.3603214697133071e-05, "loss": 1.3521, "step": 1669 }, { "epoch": 7.8174370977179635, "grad_norm": 0.4617379307746887, "learning_rate": 1.3547081580353249e-05, "loss": 1.3268, "step": 1670 }, { "epoch": 7.8221181977764775, "grad_norm": 0.45402026176452637, "learning_rate": 1.3491046364708293e-05, "loss": 1.3137, "step": 1671 }, { "epoch": 7.826799297834992, "grad_norm": 0.45435112714767456, "learning_rate": 1.3435109200691232e-05, "loss": 1.3199, "step": 1672 }, { "epoch": 7.831480397893505, "grad_norm": 0.4637871980667114, "learning_rate": 1.3379270238531798e-05, "loss": 1.3306, "step": 1673 }, { "epoch": 7.836161497952019, "grad_norm": 0.4626833498477936, "learning_rate": 1.3323529628195985e-05, "loss": 1.3166, "step": 1674 }, { "epoch": 7.840842598010532, "grad_norm": 0.46381741762161255, "learning_rate": 1.3267887519385624e-05, "loss": 1.3437, "step": 1675 }, { "epoch": 7.845523698069046, "grad_norm": 0.4669346511363983, "learning_rate": 1.3212344061538034e-05, "loss": 1.3285, "step": 1676 }, { "epoch": 7.85020479812756, "grad_norm": 0.45426100492477417, "learning_rate": 1.3156899403825534e-05, "loss": 1.3491, "step": 1677 }, { "epoch": 7.854885898186073, "grad_norm": 0.476627916097641, "learning_rate": 1.3101553695155156e-05, "loss": 1.3265, "step": 1678 }, { "epoch": 7.8595669982445875, "grad_norm": 0.4632202386856079, "learning_rate": 1.3046307084168119e-05, "loss": 1.3334, "step": 1679 }, { "epoch": 7.864248098303102, "grad_norm": 0.4600007236003876, "learning_rate": 1.2991159719239582e-05, "loss": 1.3446, "step": 1680 }, { "epoch": 7.868929198361615, "grad_norm": 0.4647494852542877, "learning_rate": 1.2936111748478114e-05, "loss": 1.3327, "step": 1681 }, { "epoch": 7.873610298420129, "grad_norm": 0.4659038484096527, "learning_rate": 1.2881163319725303e-05, "loss": 1.3418, "step": 1682 }, { "epoch": 7.878291398478643, "grad_norm": 0.4638218879699707, "learning_rate": 1.2826314580555487e-05, "loss": 1.3293, "step": 1683 }, { "epoch": 7.882972498537156, "grad_norm": 0.467208594083786, "learning_rate": 1.277156567827516e-05, "loss": 1.3503, "step": 1684 }, { "epoch": 7.88765359859567, "grad_norm": 0.45110321044921875, "learning_rate": 1.2716916759922797e-05, "loss": 1.3304, "step": 1685 }, { "epoch": 7.892334698654183, "grad_norm": 0.45999690890312195, "learning_rate": 1.2662367972268297e-05, "loss": 1.3333, "step": 1686 }, { "epoch": 7.897015798712697, "grad_norm": 0.46465185284614563, "learning_rate": 1.2607919461812606e-05, "loss": 1.3489, "step": 1687 }, { "epoch": 7.9016968987712115, "grad_norm": 0.47059544920921326, "learning_rate": 1.255357137478742e-05, "loss": 1.3456, "step": 1688 }, { "epoch": 7.906377998829725, "grad_norm": 0.4738016724586487, "learning_rate": 1.2499323857154671e-05, "loss": 1.3287, "step": 1689 }, { "epoch": 7.911059098888239, "grad_norm": 0.47977060079574585, "learning_rate": 1.2445177054606216e-05, "loss": 1.3477, "step": 1690 }, { "epoch": 7.915740198946753, "grad_norm": 0.47547101974487305, "learning_rate": 1.239113111256347e-05, "loss": 1.3249, "step": 1691 }, { "epoch": 7.920421299005266, "grad_norm": 0.45657679438591003, "learning_rate": 1.2337186176176891e-05, "loss": 1.3387, "step": 1692 }, { "epoch": 7.92510239906378, "grad_norm": 0.4775516390800476, "learning_rate": 1.2283342390325724e-05, "loss": 1.344, "step": 1693 }, { "epoch": 7.929783499122294, "grad_norm": 0.46691226959228516, "learning_rate": 1.22295998996175e-05, "loss": 1.3408, "step": 1694 }, { "epoch": 7.934464599180807, "grad_norm": 0.4656122624874115, "learning_rate": 1.2175958848387765e-05, "loss": 1.3272, "step": 1695 }, { "epoch": 7.9391456992393215, "grad_norm": 0.4623662531375885, "learning_rate": 1.2122419380699585e-05, "loss": 1.3214, "step": 1696 }, { "epoch": 7.943826799297835, "grad_norm": 0.46682363748550415, "learning_rate": 1.2068981640343229e-05, "loss": 1.3343, "step": 1697 }, { "epoch": 7.948507899356349, "grad_norm": 0.4599725902080536, "learning_rate": 1.2015645770835764e-05, "loss": 1.3421, "step": 1698 }, { "epoch": 7.953188999414863, "grad_norm": 0.46313902735710144, "learning_rate": 1.1962411915420618e-05, "loss": 1.322, "step": 1699 }, { "epoch": 7.957870099473376, "grad_norm": 0.44506651163101196, "learning_rate": 1.1909280217067292e-05, "loss": 1.328, "step": 1700 }, { "epoch": 7.96255119953189, "grad_norm": 0.4529295563697815, "learning_rate": 1.1856250818470905e-05, "loss": 1.317, "step": 1701 }, { "epoch": 7.967232299590404, "grad_norm": 0.4700178802013397, "learning_rate": 1.180332386205183e-05, "loss": 1.3333, "step": 1702 }, { "epoch": 7.971913399648917, "grad_norm": 0.4718801975250244, "learning_rate": 1.1750499489955325e-05, "loss": 1.3283, "step": 1703 }, { "epoch": 7.976594499707431, "grad_norm": 0.4550124406814575, "learning_rate": 1.1697777844051105e-05, "loss": 1.3333, "step": 1704 }, { "epoch": 7.9812755997659455, "grad_norm": 0.4479660391807556, "learning_rate": 1.164515906593303e-05, "loss": 1.3239, "step": 1705 }, { "epoch": 7.985956699824459, "grad_norm": 0.45768308639526367, "learning_rate": 1.1592643296918676e-05, "loss": 1.336, "step": 1706 }, { "epoch": 7.990637799882973, "grad_norm": 0.463931679725647, "learning_rate": 1.1540230678048968e-05, "loss": 1.3118, "step": 1707 }, { "epoch": 7.995318899941486, "grad_norm": 0.4714531898498535, "learning_rate": 1.148792135008782e-05, "loss": 1.3385, "step": 1708 }, { "epoch": 8.0, "grad_norm": 1.5786850452423096, "learning_rate": 1.1435715453521695e-05, "loss": 1.2702, "step": 1709 }, { "epoch": 8.004681100058514, "grad_norm": 0.4669012129306793, "learning_rate": 1.1383613128559306e-05, "loss": 1.3242, "step": 1710 }, { "epoch": 8.009362200117028, "grad_norm": 0.47372370958328247, "learning_rate": 1.1331614515131207e-05, "loss": 1.3339, "step": 1711 }, { "epoch": 8.01404330017554, "grad_norm": 0.45693066716194153, "learning_rate": 1.1279719752889401e-05, "loss": 1.3129, "step": 1712 }, { "epoch": 8.018724400234055, "grad_norm": 0.47375747561454773, "learning_rate": 1.1227928981207003e-05, "loss": 1.3499, "step": 1713 }, { "epoch": 8.023405500292569, "grad_norm": 0.4803107976913452, "learning_rate": 1.1176242339177789e-05, "loss": 1.3435, "step": 1714 }, { "epoch": 8.028086600351083, "grad_norm": 0.4778917133808136, "learning_rate": 1.1124659965615925e-05, "loss": 1.3598, "step": 1715 }, { "epoch": 8.032767700409597, "grad_norm": 0.4473143219947815, "learning_rate": 1.1073181999055538e-05, "loss": 1.3386, "step": 1716 }, { "epoch": 8.03744880046811, "grad_norm": 0.46125826239585876, "learning_rate": 1.1021808577750332e-05, "loss": 1.3061, "step": 1717 }, { "epoch": 8.042129900526623, "grad_norm": 0.46183252334594727, "learning_rate": 1.0970539839673255e-05, "loss": 1.3459, "step": 1718 }, { "epoch": 8.046811000585137, "grad_norm": 0.4538736343383789, "learning_rate": 1.0919375922516084e-05, "loss": 1.327, "step": 1719 }, { "epoch": 8.051492100643651, "grad_norm": 0.4843650460243225, "learning_rate": 1.0868316963689113e-05, "loss": 1.3228, "step": 1720 }, { "epoch": 8.056173200702165, "grad_norm": 0.4713257849216461, "learning_rate": 1.0817363100320693e-05, "loss": 1.3251, "step": 1721 }, { "epoch": 8.06085430076068, "grad_norm": 0.46485134959220886, "learning_rate": 1.0766514469257006e-05, "loss": 1.3323, "step": 1722 }, { "epoch": 8.065535400819192, "grad_norm": 0.4672873914241791, "learning_rate": 1.0715771207061558e-05, "loss": 1.3446, "step": 1723 }, { "epoch": 8.070216500877706, "grad_norm": 0.46149742603302, "learning_rate": 1.0665133450014858e-05, "loss": 1.3405, "step": 1724 }, { "epoch": 8.07489760093622, "grad_norm": 0.4756418764591217, "learning_rate": 1.0614601334114099e-05, "loss": 1.3565, "step": 1725 }, { "epoch": 8.079578700994734, "grad_norm": 0.4548105299472809, "learning_rate": 1.05641749950727e-05, "loss": 1.3311, "step": 1726 }, { "epoch": 8.084259801053248, "grad_norm": 0.4672495126724243, "learning_rate": 1.0513854568320075e-05, "loss": 1.3208, "step": 1727 }, { "epoch": 8.088940901111762, "grad_norm": 0.4649643003940582, "learning_rate": 1.0463640189001145e-05, "loss": 1.3411, "step": 1728 }, { "epoch": 8.093622001170274, "grad_norm": 0.4581691324710846, "learning_rate": 1.0413531991975995e-05, "loss": 1.3272, "step": 1729 }, { "epoch": 8.098303101228788, "grad_norm": 0.4784697890281677, "learning_rate": 1.0363530111819591e-05, "loss": 1.3027, "step": 1730 }, { "epoch": 8.102984201287303, "grad_norm": 0.4707012474536896, "learning_rate": 1.0313634682821321e-05, "loss": 1.3103, "step": 1731 }, { "epoch": 8.107665301345817, "grad_norm": 0.4484752416610718, "learning_rate": 1.0263845838984692e-05, "loss": 1.346, "step": 1732 }, { "epoch": 8.11234640140433, "grad_norm": 0.458938330411911, "learning_rate": 1.021416371402701e-05, "loss": 1.3482, "step": 1733 }, { "epoch": 8.117027501462843, "grad_norm": 0.4532400071620941, "learning_rate": 1.016458844137887e-05, "loss": 1.3382, "step": 1734 }, { "epoch": 8.121708601521357, "grad_norm": 0.4518270194530487, "learning_rate": 1.0115120154183976e-05, "loss": 1.3185, "step": 1735 }, { "epoch": 8.126389701579871, "grad_norm": 0.45866602659225464, "learning_rate": 1.0065758985298651e-05, "loss": 1.355, "step": 1736 }, { "epoch": 8.131070801638385, "grad_norm": 0.45490261912345886, "learning_rate": 1.0016505067291543e-05, "loss": 1.3502, "step": 1737 }, { "epoch": 8.1357519016969, "grad_norm": 0.45964813232421875, "learning_rate": 9.967358532443305e-06, "loss": 1.3151, "step": 1738 }, { "epoch": 8.140433001755412, "grad_norm": 0.4557470679283142, "learning_rate": 9.918319512746128e-06, "loss": 1.308, "step": 1739 }, { "epoch": 8.145114101813926, "grad_norm": 0.45845234394073486, "learning_rate": 9.869388139903496e-06, "loss": 1.3277, "step": 1740 }, { "epoch": 8.14979520187244, "grad_norm": 0.4491667151451111, "learning_rate": 9.820564545329736e-06, "loss": 1.3318, "step": 1741 }, { "epoch": 8.154476301930954, "grad_norm": 0.4657641053199768, "learning_rate": 9.771848860149774e-06, "loss": 1.3297, "step": 1742 }, { "epoch": 8.159157401989468, "grad_norm": 0.4560374915599823, "learning_rate": 9.723241215198692e-06, "loss": 1.3404, "step": 1743 }, { "epoch": 8.163838502047982, "grad_norm": 0.4573279619216919, "learning_rate": 9.674741741021425e-06, "loss": 1.3341, "step": 1744 }, { "epoch": 8.168519602106494, "grad_norm": 0.4648374915122986, "learning_rate": 9.626350567872389e-06, "loss": 1.337, "step": 1745 }, { "epoch": 8.173200702165008, "grad_norm": 0.45301535725593567, "learning_rate": 9.578067825715126e-06, "loss": 1.3112, "step": 1746 }, { "epoch": 8.177881802223522, "grad_norm": 0.46872368454933167, "learning_rate": 9.529893644221976e-06, "loss": 1.3534, "step": 1747 }, { "epoch": 8.182562902282037, "grad_norm": 0.44515344500541687, "learning_rate": 9.481828152773725e-06, "loss": 1.3264, "step": 1748 }, { "epoch": 8.18724400234055, "grad_norm": 0.46138012409210205, "learning_rate": 9.433871480459245e-06, "loss": 1.3399, "step": 1749 }, { "epoch": 8.191925102399063, "grad_norm": 0.448217511177063, "learning_rate": 9.386023756075169e-06, "loss": 1.3195, "step": 1750 }, { "epoch": 8.196606202457577, "grad_norm": 0.4604373872280121, "learning_rate": 9.338285108125488e-06, "loss": 1.3468, "step": 1751 }, { "epoch": 8.201287302516091, "grad_norm": 0.45444729924201965, "learning_rate": 9.290655664821296e-06, "loss": 1.3239, "step": 1752 }, { "epoch": 8.205968402574605, "grad_norm": 0.46766898036003113, "learning_rate": 9.243135554080367e-06, "loss": 1.308, "step": 1753 }, { "epoch": 8.21064950263312, "grad_norm": 0.462932288646698, "learning_rate": 9.19572490352686e-06, "loss": 1.3439, "step": 1754 }, { "epoch": 8.215330602691633, "grad_norm": 0.4593763053417206, "learning_rate": 9.148423840490954e-06, "loss": 1.315, "step": 1755 }, { "epoch": 8.220011702750146, "grad_norm": 0.4483340084552765, "learning_rate": 9.101232492008487e-06, "loss": 1.3394, "step": 1756 }, { "epoch": 8.22469280280866, "grad_norm": 0.4449338912963867, "learning_rate": 9.054150984820675e-06, "loss": 1.3198, "step": 1757 }, { "epoch": 8.229373902867174, "grad_norm": 0.4492490887641907, "learning_rate": 9.007179445373715e-06, "loss": 1.3308, "step": 1758 }, { "epoch": 8.234055002925688, "grad_norm": 0.45008864998817444, "learning_rate": 8.960317999818458e-06, "loss": 1.3189, "step": 1759 }, { "epoch": 8.238736102984202, "grad_norm": 0.45013290643692017, "learning_rate": 8.913566774010107e-06, "loss": 1.3102, "step": 1760 }, { "epoch": 8.243417203042714, "grad_norm": 0.44994527101516724, "learning_rate": 8.866925893507805e-06, "loss": 1.3277, "step": 1761 }, { "epoch": 8.248098303101228, "grad_norm": 0.46309390664100647, "learning_rate": 8.820395483574368e-06, "loss": 1.3304, "step": 1762 }, { "epoch": 8.252779403159742, "grad_norm": 0.4659731388092041, "learning_rate": 8.773975669175938e-06, "loss": 1.3284, "step": 1763 }, { "epoch": 8.257460503218256, "grad_norm": 0.4545668959617615, "learning_rate": 8.727666574981597e-06, "loss": 1.3347, "step": 1764 }, { "epoch": 8.26214160327677, "grad_norm": 0.4606211483478546, "learning_rate": 8.681468325363106e-06, "loss": 1.3235, "step": 1765 }, { "epoch": 8.266822703335285, "grad_norm": 0.4521343410015106, "learning_rate": 8.635381044394486e-06, "loss": 1.3262, "step": 1766 }, { "epoch": 8.271503803393797, "grad_norm": 0.45838385820388794, "learning_rate": 8.589404855851762e-06, "loss": 1.3327, "step": 1767 }, { "epoch": 8.276184903452311, "grad_norm": 0.45257940888404846, "learning_rate": 8.543539883212615e-06, "loss": 1.3334, "step": 1768 }, { "epoch": 8.280866003510825, "grad_norm": 0.4488631784915924, "learning_rate": 8.497786249655965e-06, "loss": 1.3299, "step": 1769 }, { "epoch": 8.285547103569339, "grad_norm": 0.45357194542884827, "learning_rate": 8.452144078061818e-06, "loss": 1.3388, "step": 1770 }, { "epoch": 8.290228203627853, "grad_norm": 0.4556363821029663, "learning_rate": 8.406613491010723e-06, "loss": 1.3151, "step": 1771 }, { "epoch": 8.294909303686365, "grad_norm": 0.44982144236564636, "learning_rate": 8.361194610783612e-06, "loss": 1.3407, "step": 1772 }, { "epoch": 8.29959040374488, "grad_norm": 0.4660349488258362, "learning_rate": 8.31588755936139e-06, "loss": 1.3342, "step": 1773 }, { "epoch": 8.304271503803394, "grad_norm": 0.45148947834968567, "learning_rate": 8.2706924584246e-06, "loss": 1.3419, "step": 1774 }, { "epoch": 8.308952603861908, "grad_norm": 0.4520352780818939, "learning_rate": 8.225609429353187e-06, "loss": 1.3123, "step": 1775 }, { "epoch": 8.313633703920422, "grad_norm": 0.4512474238872528, "learning_rate": 8.18063859322602e-06, "loss": 1.3502, "step": 1776 }, { "epoch": 8.318314803978936, "grad_norm": 0.4652462899684906, "learning_rate": 8.13578007082072e-06, "loss": 1.3465, "step": 1777 }, { "epoch": 8.322995904037448, "grad_norm": 0.44895434379577637, "learning_rate": 8.091033982613245e-06, "loss": 1.3161, "step": 1778 }, { "epoch": 8.327677004095962, "grad_norm": 0.453987717628479, "learning_rate": 8.046400448777574e-06, "loss": 1.3343, "step": 1779 }, { "epoch": 8.332358104154476, "grad_norm": 0.4530025124549866, "learning_rate": 8.001879589185423e-06, "loss": 1.3368, "step": 1780 }, { "epoch": 8.33703920421299, "grad_norm": 0.45067164301872253, "learning_rate": 7.957471523405895e-06, "loss": 1.3291, "step": 1781 }, { "epoch": 8.341720304271504, "grad_norm": 0.44522637128829956, "learning_rate": 7.913176370705167e-06, "loss": 1.3405, "step": 1782 }, { "epoch": 8.346401404330017, "grad_norm": 0.4582405388355255, "learning_rate": 7.868994250046163e-06, "loss": 1.3564, "step": 1783 }, { "epoch": 8.35108250438853, "grad_norm": 0.455962210893631, "learning_rate": 7.824925280088219e-06, "loss": 1.3292, "step": 1784 }, { "epoch": 8.355763604447045, "grad_norm": 0.45344966650009155, "learning_rate": 7.780969579186814e-06, "loss": 1.3404, "step": 1785 }, { "epoch": 8.360444704505559, "grad_norm": 0.451633483171463, "learning_rate": 7.737127265393206e-06, "loss": 1.3244, "step": 1786 }, { "epoch": 8.365125804564073, "grad_norm": 0.46836763620376587, "learning_rate": 7.693398456454142e-06, "loss": 1.3558, "step": 1787 }, { "epoch": 8.369806904622587, "grad_norm": 0.4535449147224426, "learning_rate": 7.649783269811523e-06, "loss": 1.3454, "step": 1788 }, { "epoch": 8.3744880046811, "grad_norm": 0.44840291142463684, "learning_rate": 7.606281822602079e-06, "loss": 1.3136, "step": 1789 }, { "epoch": 8.379169104739614, "grad_norm": 0.4514668583869934, "learning_rate": 7.562894231657103e-06, "loss": 1.3278, "step": 1790 }, { "epoch": 8.383850204798128, "grad_norm": 0.4571569561958313, "learning_rate": 7.519620613502082e-06, "loss": 1.3273, "step": 1791 }, { "epoch": 8.388531304856642, "grad_norm": 0.45401352643966675, "learning_rate": 7.476461084356423e-06, "loss": 1.3273, "step": 1792 }, { "epoch": 8.393212404915156, "grad_norm": 0.45479846000671387, "learning_rate": 7.433415760133128e-06, "loss": 1.3231, "step": 1793 }, { "epoch": 8.397893504973668, "grad_norm": 0.45161619782447815, "learning_rate": 7.390484756438448e-06, "loss": 1.3104, "step": 1794 }, { "epoch": 8.402574605032182, "grad_norm": 0.4587078392505646, "learning_rate": 7.347668188571643e-06, "loss": 1.3049, "step": 1795 }, { "epoch": 8.407255705090696, "grad_norm": 0.4491417407989502, "learning_rate": 7.304966171524619e-06, "loss": 1.3255, "step": 1796 }, { "epoch": 8.41193680514921, "grad_norm": 0.46726784110069275, "learning_rate": 7.26237881998163e-06, "loss": 1.3332, "step": 1797 }, { "epoch": 8.416617905207724, "grad_norm": 0.45452451705932617, "learning_rate": 7.219906248318998e-06, "loss": 1.3464, "step": 1798 }, { "epoch": 8.421299005266238, "grad_norm": 0.44723427295684814, "learning_rate": 7.177548570604731e-06, "loss": 1.3075, "step": 1799 }, { "epoch": 8.42598010532475, "grad_norm": 0.4556967616081238, "learning_rate": 7.135305900598321e-06, "loss": 1.3382, "step": 1800 }, { "epoch": 8.430661205383265, "grad_norm": 0.4641600549221039, "learning_rate": 7.093178351750357e-06, "loss": 1.3465, "step": 1801 }, { "epoch": 8.435342305441779, "grad_norm": 0.4607059955596924, "learning_rate": 7.051166037202261e-06, "loss": 1.3453, "step": 1802 }, { "epoch": 8.440023405500293, "grad_norm": 0.4600765109062195, "learning_rate": 7.009269069785973e-06, "loss": 1.3375, "step": 1803 }, { "epoch": 8.444704505558807, "grad_norm": 0.467617928981781, "learning_rate": 6.967487562023622e-06, "loss": 1.3491, "step": 1804 }, { "epoch": 8.44938560561732, "grad_norm": 0.4564569592475891, "learning_rate": 6.925821626127282e-06, "loss": 1.3369, "step": 1805 }, { "epoch": 8.454066705675833, "grad_norm": 0.44401881098747253, "learning_rate": 6.8842713739986075e-06, "loss": 1.3317, "step": 1806 }, { "epoch": 8.458747805734347, "grad_norm": 0.4608115255832672, "learning_rate": 6.84283691722859e-06, "loss": 1.3376, "step": 1807 }, { "epoch": 8.463428905792862, "grad_norm": 0.44086068868637085, "learning_rate": 6.801518367097226e-06, "loss": 1.3354, "step": 1808 }, { "epoch": 8.468110005851376, "grad_norm": 0.45129072666168213, "learning_rate": 6.760315834573194e-06, "loss": 1.3327, "step": 1809 }, { "epoch": 8.472791105909888, "grad_norm": 0.4539918303489685, "learning_rate": 6.719229430313623e-06, "loss": 1.3101, "step": 1810 }, { "epoch": 8.477472205968402, "grad_norm": 0.45194685459136963, "learning_rate": 6.678259264663711e-06, "loss": 1.3517, "step": 1811 }, { "epoch": 8.482153306026916, "grad_norm": 0.45311862230300903, "learning_rate": 6.637405447656541e-06, "loss": 1.3246, "step": 1812 }, { "epoch": 8.48683440608543, "grad_norm": 0.44423723220825195, "learning_rate": 6.5966680890126865e-06, "loss": 1.3205, "step": 1813 }, { "epoch": 8.491515506143944, "grad_norm": 0.45881083607673645, "learning_rate": 6.556047298139917e-06, "loss": 1.3379, "step": 1814 }, { "epoch": 8.496196606202458, "grad_norm": 0.4573395550251007, "learning_rate": 6.515543184132999e-06, "loss": 1.3407, "step": 1815 }, { "epoch": 8.50087770626097, "grad_norm": 0.4606288969516754, "learning_rate": 6.475155855773279e-06, "loss": 1.3165, "step": 1816 }, { "epoch": 8.505558806319485, "grad_norm": 0.464618444442749, "learning_rate": 6.434885421528513e-06, "loss": 1.3339, "step": 1817 }, { "epoch": 8.510239906377999, "grad_norm": 0.45734116435050964, "learning_rate": 6.394731989552494e-06, "loss": 1.3336, "step": 1818 }, { "epoch": 8.514921006436513, "grad_norm": 0.44989585876464844, "learning_rate": 6.354695667684757e-06, "loss": 1.3429, "step": 1819 }, { "epoch": 8.519602106495027, "grad_norm": 0.44147491455078125, "learning_rate": 6.314776563450353e-06, "loss": 1.3215, "step": 1820 }, { "epoch": 8.524283206553541, "grad_norm": 0.448146790266037, "learning_rate": 6.274974784059495e-06, "loss": 1.3327, "step": 1821 }, { "epoch": 8.528964306612053, "grad_norm": 0.4638636112213135, "learning_rate": 6.235290436407304e-06, "loss": 1.3297, "step": 1822 }, { "epoch": 8.533645406670567, "grad_norm": 0.4473375082015991, "learning_rate": 6.195723627073552e-06, "loss": 1.3237, "step": 1823 }, { "epoch": 8.538326506729081, "grad_norm": 0.44932791590690613, "learning_rate": 6.156274462322292e-06, "loss": 1.3407, "step": 1824 }, { "epoch": 8.543007606787596, "grad_norm": 0.47016915678977966, "learning_rate": 6.1169430481016475e-06, "loss": 1.3339, "step": 1825 }, { "epoch": 8.54768870684611, "grad_norm": 0.44749605655670166, "learning_rate": 6.077729490043477e-06, "loss": 1.3432, "step": 1826 }, { "epoch": 8.552369806904622, "grad_norm": 0.4496007263660431, "learning_rate": 6.038633893463125e-06, "loss": 1.3372, "step": 1827 }, { "epoch": 8.557050906963136, "grad_norm": 0.4475160539150238, "learning_rate": 5.999656363359174e-06, "loss": 1.3329, "step": 1828 }, { "epoch": 8.56173200702165, "grad_norm": 0.4565197825431824, "learning_rate": 5.960797004413038e-06, "loss": 1.3349, "step": 1829 }, { "epoch": 8.566413107080164, "grad_norm": 0.4637001156806946, "learning_rate": 5.922055920988817e-06, "loss": 1.3355, "step": 1830 }, { "epoch": 8.571094207138678, "grad_norm": 0.4525499939918518, "learning_rate": 5.883433217132927e-06, "loss": 1.3355, "step": 1831 }, { "epoch": 8.57577530719719, "grad_norm": 0.45388099551200867, "learning_rate": 5.844928996573873e-06, "loss": 1.3177, "step": 1832 }, { "epoch": 8.580456407255705, "grad_norm": 0.45121920108795166, "learning_rate": 5.806543362721945e-06, "loss": 1.3092, "step": 1833 }, { "epoch": 8.585137507314219, "grad_norm": 0.4436412453651428, "learning_rate": 5.768276418668955e-06, "loss": 1.3184, "step": 1834 }, { "epoch": 8.589818607372733, "grad_norm": 0.44664260745048523, "learning_rate": 5.730128267187939e-06, "loss": 1.3334, "step": 1835 }, { "epoch": 8.594499707431247, "grad_norm": 0.4499116539955139, "learning_rate": 5.692099010732893e-06, "loss": 1.3212, "step": 1836 }, { "epoch": 8.59918080748976, "grad_norm": 0.4458361566066742, "learning_rate": 5.654188751438505e-06, "loss": 1.3272, "step": 1837 }, { "epoch": 8.603861907548273, "grad_norm": 0.4464578330516815, "learning_rate": 5.616397591119871e-06, "loss": 1.3164, "step": 1838 }, { "epoch": 8.608543007606787, "grad_norm": 0.4481458067893982, "learning_rate": 5.5787256312722245e-06, "loss": 1.3402, "step": 1839 }, { "epoch": 8.613224107665301, "grad_norm": 0.4532814025878906, "learning_rate": 5.5411729730706795e-06, "loss": 1.3215, "step": 1840 }, { "epoch": 8.617905207723815, "grad_norm": 0.44967561960220337, "learning_rate": 5.503739717369899e-06, "loss": 1.3291, "step": 1841 }, { "epoch": 8.62258630778233, "grad_norm": 0.4423333406448364, "learning_rate": 5.466425964703914e-06, "loss": 1.3176, "step": 1842 }, { "epoch": 8.627267407840842, "grad_norm": 0.45483851432800293, "learning_rate": 5.42923181528579e-06, "loss": 1.337, "step": 1843 }, { "epoch": 8.631948507899356, "grad_norm": 0.44831711053848267, "learning_rate": 5.392157369007378e-06, "loss": 1.3405, "step": 1844 }, { "epoch": 8.63662960795787, "grad_norm": 0.4486755430698395, "learning_rate": 5.355202725439046e-06, "loss": 1.3194, "step": 1845 }, { "epoch": 8.641310708016384, "grad_norm": 0.44130879640579224, "learning_rate": 5.318367983829392e-06, "loss": 1.3231, "step": 1846 }, { "epoch": 8.645991808074898, "grad_norm": 0.4567999839782715, "learning_rate": 5.281653243105011e-06, "loss": 1.351, "step": 1847 }, { "epoch": 8.650672908133412, "grad_norm": 0.4551226496696472, "learning_rate": 5.245058601870223e-06, "loss": 1.333, "step": 1848 }, { "epoch": 8.655354008191924, "grad_norm": 0.4408561885356903, "learning_rate": 5.2085841584067765e-06, "loss": 1.3289, "step": 1849 }, { "epoch": 8.660035108250439, "grad_norm": 0.4413094222545624, "learning_rate": 5.172230010673623e-06, "loss": 1.3271, "step": 1850 }, { "epoch": 8.664716208308953, "grad_norm": 0.44439423084259033, "learning_rate": 5.135996256306619e-06, "loss": 1.335, "step": 1851 }, { "epoch": 8.669397308367467, "grad_norm": 0.44356393814086914, "learning_rate": 5.099882992618299e-06, "loss": 1.3437, "step": 1852 }, { "epoch": 8.67407840842598, "grad_norm": 0.46065422892570496, "learning_rate": 5.063890316597592e-06, "loss": 1.3389, "step": 1853 }, { "epoch": 8.678759508484493, "grad_norm": 0.4485087990760803, "learning_rate": 5.028018324909562e-06, "loss": 1.3382, "step": 1854 }, { "epoch": 8.683440608543007, "grad_norm": 0.44895005226135254, "learning_rate": 4.992267113895172e-06, "loss": 1.3381, "step": 1855 }, { "epoch": 8.688121708601521, "grad_norm": 0.4537106454372406, "learning_rate": 4.956636779570972e-06, "loss": 1.3266, "step": 1856 }, { "epoch": 8.692802808660035, "grad_norm": 0.46100160479545593, "learning_rate": 4.921127417628896e-06, "loss": 1.3495, "step": 1857 }, { "epoch": 8.69748390871855, "grad_norm": 0.4382075071334839, "learning_rate": 4.885739123435984e-06, "loss": 1.3377, "step": 1858 }, { "epoch": 8.702165008777062, "grad_norm": 0.4552956223487854, "learning_rate": 4.850471992034128e-06, "loss": 1.3324, "step": 1859 }, { "epoch": 8.706846108835576, "grad_norm": 0.4699152112007141, "learning_rate": 4.8153261181398125e-06, "loss": 1.3317, "step": 1860 }, { "epoch": 8.71152720889409, "grad_norm": 0.46199172735214233, "learning_rate": 4.780301596143843e-06, "loss": 1.3389, "step": 1861 }, { "epoch": 8.716208308952604, "grad_norm": 0.44489288330078125, "learning_rate": 4.745398520111138e-06, "loss": 1.3299, "step": 1862 }, { "epoch": 8.720889409011118, "grad_norm": 0.4437996745109558, "learning_rate": 4.7106169837804525e-06, "loss": 1.3249, "step": 1863 }, { "epoch": 8.725570509069632, "grad_norm": 0.44859573245048523, "learning_rate": 4.675957080564087e-06, "loss": 1.3419, "step": 1864 }, { "epoch": 8.730251609128144, "grad_norm": 0.44944876432418823, "learning_rate": 4.641418903547723e-06, "loss": 1.3171, "step": 1865 }, { "epoch": 8.734932709186658, "grad_norm": 0.4558664858341217, "learning_rate": 4.60700254549008e-06, "loss": 1.3359, "step": 1866 }, { "epoch": 8.739613809245173, "grad_norm": 0.4526291787624359, "learning_rate": 4.5727080988227365e-06, "loss": 1.3089, "step": 1867 }, { "epoch": 8.744294909303687, "grad_norm": 0.4543936550617218, "learning_rate": 4.5385356556498616e-06, "loss": 1.3268, "step": 1868 }, { "epoch": 8.7489760093622, "grad_norm": 0.44007402658462524, "learning_rate": 4.504485307747913e-06, "loss": 1.3264, "step": 1869 }, { "epoch": 8.753657109420715, "grad_norm": 0.4477139115333557, "learning_rate": 4.470557146565507e-06, "loss": 1.3395, "step": 1870 }, { "epoch": 8.758338209479227, "grad_norm": 0.4843866527080536, "learning_rate": 4.436751263223055e-06, "loss": 1.3241, "step": 1871 }, { "epoch": 8.763019309537741, "grad_norm": 0.45234960317611694, "learning_rate": 4.40306774851259e-06, "loss": 1.3434, "step": 1872 }, { "epoch": 8.767700409596255, "grad_norm": 0.4426249563694, "learning_rate": 4.369506692897479e-06, "loss": 1.3105, "step": 1873 }, { "epoch": 8.77238150965477, "grad_norm": 0.44732552766799927, "learning_rate": 4.3360681865122325e-06, "loss": 1.3296, "step": 1874 }, { "epoch": 8.777062609713283, "grad_norm": 0.4477425813674927, "learning_rate": 4.302752319162212e-06, "loss": 1.364, "step": 1875 }, { "epoch": 8.781743709771796, "grad_norm": 0.4506896734237671, "learning_rate": 4.269559180323418e-06, "loss": 1.3366, "step": 1876 }, { "epoch": 8.78642480983031, "grad_norm": 0.44847944378852844, "learning_rate": 4.23648885914224e-06, "loss": 1.3518, "step": 1877 }, { "epoch": 8.791105909888824, "grad_norm": 0.442094087600708, "learning_rate": 4.20354144443521e-06, "loss": 1.3154, "step": 1878 }, { "epoch": 8.795787009947338, "grad_norm": 0.4490923285484314, "learning_rate": 4.170717024688775e-06, "loss": 1.3304, "step": 1879 }, { "epoch": 8.800468110005852, "grad_norm": 0.4516635537147522, "learning_rate": 4.1380156880590545e-06, "loss": 1.325, "step": 1880 }, { "epoch": 8.805149210064364, "grad_norm": 0.4438028931617737, "learning_rate": 4.105437522371619e-06, "loss": 1.3166, "step": 1881 }, { "epoch": 8.809830310122878, "grad_norm": 0.45484694838523865, "learning_rate": 4.072982615121235e-06, "loss": 1.3366, "step": 1882 }, { "epoch": 8.814511410181392, "grad_norm": 0.4515603482723236, "learning_rate": 4.040651053471605e-06, "loss": 1.337, "step": 1883 }, { "epoch": 8.819192510239906, "grad_norm": 0.44962701201438904, "learning_rate": 4.008442924255207e-06, "loss": 1.3385, "step": 1884 }, { "epoch": 8.82387361029842, "grad_norm": 0.4372497797012329, "learning_rate": 3.976358313973e-06, "loss": 1.3294, "step": 1885 }, { "epoch": 8.828554710356935, "grad_norm": 0.4433598518371582, "learning_rate": 3.944397308794201e-06, "loss": 1.3325, "step": 1886 }, { "epoch": 8.833235810415447, "grad_norm": 0.4615612030029297, "learning_rate": 3.912559994556086e-06, "loss": 1.3363, "step": 1887 }, { "epoch": 8.837916910473961, "grad_norm": 0.4531205892562866, "learning_rate": 3.880846456763704e-06, "loss": 1.3281, "step": 1888 }, { "epoch": 8.842598010532475, "grad_norm": 0.453538715839386, "learning_rate": 3.849256780589705e-06, "loss": 1.3542, "step": 1889 }, { "epoch": 8.84727911059099, "grad_norm": 0.44648537039756775, "learning_rate": 3.81779105087407e-06, "loss": 1.3061, "step": 1890 }, { "epoch": 8.851960210649503, "grad_norm": 0.4507054090499878, "learning_rate": 3.7864493521239164e-06, "loss": 1.3451, "step": 1891 }, { "epoch": 8.856641310708017, "grad_norm": 0.4552081823348999, "learning_rate": 3.75523176851324e-06, "loss": 1.3292, "step": 1892 }, { "epoch": 8.86132241076653, "grad_norm": 0.4661564826965332, "learning_rate": 3.724138383882697e-06, "loss": 1.3446, "step": 1893 }, { "epoch": 8.866003510825044, "grad_norm": 0.4496918022632599, "learning_rate": 3.6931692817393894e-06, "loss": 1.3254, "step": 1894 }, { "epoch": 8.870684610883558, "grad_norm": 0.45760682225227356, "learning_rate": 3.6623245452566455e-06, "loss": 1.3331, "step": 1895 }, { "epoch": 8.875365710942072, "grad_norm": 0.4535819888114929, "learning_rate": 3.631604257273774e-06, "loss": 1.3239, "step": 1896 }, { "epoch": 8.880046811000586, "grad_norm": 0.44883638620376587, "learning_rate": 3.601008500295866e-06, "loss": 1.346, "step": 1897 }, { "epoch": 8.884727911059098, "grad_norm": 0.4418095350265503, "learning_rate": 3.5705373564935364e-06, "loss": 1.3128, "step": 1898 }, { "epoch": 8.889409011117612, "grad_norm": 0.4573911428451538, "learning_rate": 3.5401909077027508e-06, "loss": 1.32, "step": 1899 }, { "epoch": 8.894090111176126, "grad_norm": 0.44246190786361694, "learning_rate": 3.5099692354245818e-06, "loss": 1.3316, "step": 1900 }, { "epoch": 8.89877121123464, "grad_norm": 0.4418052136898041, "learning_rate": 3.4798724208249866e-06, "loss": 1.3308, "step": 1901 }, { "epoch": 8.903452311293155, "grad_norm": 0.44418641924858093, "learning_rate": 3.449900544734602e-06, "loss": 1.338, "step": 1902 }, { "epoch": 8.908133411351667, "grad_norm": 0.4427943825721741, "learning_rate": 3.420053687648489e-06, "loss": 1.3242, "step": 1903 }, { "epoch": 8.91281451141018, "grad_norm": 0.44212570786476135, "learning_rate": 3.390331929725982e-06, "loss": 1.3209, "step": 1904 }, { "epoch": 8.917495611468695, "grad_norm": 0.4524332284927368, "learning_rate": 3.3607353507904283e-06, "loss": 1.3122, "step": 1905 }, { "epoch": 8.922176711527209, "grad_norm": 0.4490658640861511, "learning_rate": 3.3312640303289566e-06, "loss": 1.3247, "step": 1906 }, { "epoch": 8.926857811585723, "grad_norm": 0.44071638584136963, "learning_rate": 3.3019180474923527e-06, "loss": 1.336, "step": 1907 }, { "epoch": 8.931538911644237, "grad_norm": 0.4507541060447693, "learning_rate": 3.2726974810947107e-06, "loss": 1.3551, "step": 1908 }, { "epoch": 8.93622001170275, "grad_norm": 0.443243145942688, "learning_rate": 3.24360240961335e-06, "loss": 1.3267, "step": 1909 }, { "epoch": 8.940901111761264, "grad_norm": 0.46107861399650574, "learning_rate": 3.2146329111885256e-06, "loss": 1.3407, "step": 1910 }, { "epoch": 8.945582211819778, "grad_norm": 0.44938021898269653, "learning_rate": 3.18578906362324e-06, "loss": 1.3299, "step": 1911 }, { "epoch": 8.950263311878292, "grad_norm": 0.4389936923980713, "learning_rate": 3.157070944383056e-06, "loss": 1.3161, "step": 1912 }, { "epoch": 8.954944411936806, "grad_norm": 0.45983871817588806, "learning_rate": 3.1284786305958436e-06, "loss": 1.3426, "step": 1913 }, { "epoch": 8.959625511995318, "grad_norm": 0.4543863832950592, "learning_rate": 3.1000121990516274e-06, "loss": 1.3435, "step": 1914 }, { "epoch": 8.964306612053832, "grad_norm": 0.4446238577365875, "learning_rate": 3.0716717262023353e-06, "loss": 1.3197, "step": 1915 }, { "epoch": 8.968987712112346, "grad_norm": 0.4496222138404846, "learning_rate": 3.0434572881615997e-06, "loss": 1.3434, "step": 1916 }, { "epoch": 8.97366881217086, "grad_norm": 0.44096076488494873, "learning_rate": 3.0153689607045845e-06, "loss": 1.3389, "step": 1917 }, { "epoch": 8.978349912229374, "grad_norm": 0.44681215286254883, "learning_rate": 2.987406819267746e-06, "loss": 1.3135, "step": 1918 }, { "epoch": 8.983031012287888, "grad_norm": 0.45022034645080566, "learning_rate": 2.959570938948647e-06, "loss": 1.3247, "step": 1919 }, { "epoch": 8.9877121123464, "grad_norm": 0.4587979316711426, "learning_rate": 2.931861394505764e-06, "loss": 1.3403, "step": 1920 }, { "epoch": 8.992393212404915, "grad_norm": 0.44257092475891113, "learning_rate": 2.9042782603582507e-06, "loss": 1.3504, "step": 1921 }, { "epoch": 8.997074312463429, "grad_norm": 0.4537746012210846, "learning_rate": 2.876821610585784e-06, "loss": 1.3282, "step": 1922 }, { "epoch": 9.001755412521943, "grad_norm": 1.9876453876495361, "learning_rate": 2.8494915189283324e-06, "loss": 1.3033, "step": 1923 }, { "epoch": 9.006436512580457, "grad_norm": 0.44968175888061523, "learning_rate": 2.8222880587859713e-06, "loss": 1.3326, "step": 1924 }, { "epoch": 9.01111761263897, "grad_norm": 0.4442802667617798, "learning_rate": 2.795211303218698e-06, "loss": 1.3317, "step": 1925 }, { "epoch": 9.015798712697483, "grad_norm": 0.45361271500587463, "learning_rate": 2.7682613249461863e-06, "loss": 1.3406, "step": 1926 }, { "epoch": 9.020479812755998, "grad_norm": 0.4399416744709015, "learning_rate": 2.7414381963476564e-06, "loss": 1.316, "step": 1927 }, { "epoch": 9.025160912814512, "grad_norm": 0.4478228986263275, "learning_rate": 2.7147419894616387e-06, "loss": 1.3169, "step": 1928 }, { "epoch": 9.029842012873026, "grad_norm": 0.4422329366207123, "learning_rate": 2.688172775985792e-06, "loss": 1.3357, "step": 1929 }, { "epoch": 9.03452311293154, "grad_norm": 0.44650009274482727, "learning_rate": 2.661730627276726e-06, "loss": 1.3251, "step": 1930 }, { "epoch": 9.039204212990052, "grad_norm": 0.444793164730072, "learning_rate": 2.6354156143497622e-06, "loss": 1.3069, "step": 1931 }, { "epoch": 9.043885313048566, "grad_norm": 0.4586767852306366, "learning_rate": 2.6092278078788e-06, "loss": 1.3298, "step": 1932 }, { "epoch": 9.04856641310708, "grad_norm": 0.43380340933799744, "learning_rate": 2.5831672781960913e-06, "loss": 1.3245, "step": 1933 }, { "epoch": 9.053247513165594, "grad_norm": 0.4407908320426941, "learning_rate": 2.557234095292066e-06, "loss": 1.329, "step": 1934 }, { "epoch": 9.057928613224108, "grad_norm": 0.4531652629375458, "learning_rate": 2.531428328815155e-06, "loss": 1.3414, "step": 1935 }, { "epoch": 9.06260971328262, "grad_norm": 0.44452014565467834, "learning_rate": 2.505750048071548e-06, "loss": 1.333, "step": 1936 }, { "epoch": 9.067290813341135, "grad_norm": 0.445748507976532, "learning_rate": 2.4801993220250953e-06, "loss": 1.314, "step": 1937 }, { "epoch": 9.071971913399649, "grad_norm": 0.4494490921497345, "learning_rate": 2.4547762192970392e-06, "loss": 1.3561, "step": 1938 }, { "epoch": 9.076653013458163, "grad_norm": 0.4584897756576538, "learning_rate": 2.4294808081658906e-06, "loss": 1.333, "step": 1939 }, { "epoch": 9.081334113516677, "grad_norm": 0.45643746852874756, "learning_rate": 2.404313156567217e-06, "loss": 1.3133, "step": 1940 }, { "epoch": 9.086015213575191, "grad_norm": 0.4478752017021179, "learning_rate": 2.3792733320934346e-06, "loss": 1.3533, "step": 1941 }, { "epoch": 9.090696313633703, "grad_norm": 0.44859951734542847, "learning_rate": 2.3543614019936987e-06, "loss": 1.324, "step": 1942 }, { "epoch": 9.095377413692217, "grad_norm": 0.4443413317203522, "learning_rate": 2.3295774331736408e-06, "loss": 1.3408, "step": 1943 }, { "epoch": 9.100058513750731, "grad_norm": 0.45216187834739685, "learning_rate": 2.304921492195261e-06, "loss": 1.3462, "step": 1944 }, { "epoch": 9.104739613809246, "grad_norm": 0.45232582092285156, "learning_rate": 2.280393645276713e-06, "loss": 1.3386, "step": 1945 }, { "epoch": 9.10942071386776, "grad_norm": 0.4486156702041626, "learning_rate": 2.2559939582920963e-06, "loss": 1.3437, "step": 1946 }, { "epoch": 9.114101813926272, "grad_norm": 0.44333913922309875, "learning_rate": 2.2317224967713603e-06, "loss": 1.3391, "step": 1947 }, { "epoch": 9.118782913984786, "grad_norm": 0.44496697187423706, "learning_rate": 2.207579325900033e-06, "loss": 1.2984, "step": 1948 }, { "epoch": 9.1234640140433, "grad_norm": 0.44474706053733826, "learning_rate": 2.183564510519137e-06, "loss": 1.3285, "step": 1949 }, { "epoch": 9.128145114101814, "grad_norm": 0.44082364439964294, "learning_rate": 2.1596781151249524e-06, "loss": 1.3319, "step": 1950 }, { "epoch": 9.132826214160328, "grad_norm": 0.4394497275352478, "learning_rate": 2.1359202038688575e-06, "loss": 1.31, "step": 1951 }, { "epoch": 9.13750731421884, "grad_norm": 0.44066622853279114, "learning_rate": 2.112290840557174e-06, "loss": 1.3268, "step": 1952 }, { "epoch": 9.142188414277355, "grad_norm": 0.4529281258583069, "learning_rate": 2.088790088650977e-06, "loss": 1.3423, "step": 1953 }, { "epoch": 9.146869514335869, "grad_norm": 0.4531400799751282, "learning_rate": 2.065418011265924e-06, "loss": 1.3273, "step": 1954 }, { "epoch": 9.151550614394383, "grad_norm": 0.4527547359466553, "learning_rate": 2.042174671172126e-06, "loss": 1.3125, "step": 1955 }, { "epoch": 9.156231714452897, "grad_norm": 0.4387628138065338, "learning_rate": 2.0190601307939094e-06, "loss": 1.3394, "step": 1956 }, { "epoch": 9.160912814511411, "grad_norm": 0.4485406279563904, "learning_rate": 1.9960744522097007e-06, "loss": 1.3379, "step": 1957 }, { "epoch": 9.165593914569923, "grad_norm": 0.4442361891269684, "learning_rate": 1.973217697151836e-06, "loss": 1.3386, "step": 1958 }, { "epoch": 9.170275014628437, "grad_norm": 0.4519590735435486, "learning_rate": 1.95048992700641e-06, "loss": 1.3382, "step": 1959 }, { "epoch": 9.174956114686951, "grad_norm": 0.4366214871406555, "learning_rate": 1.92789120281312e-06, "loss": 1.3113, "step": 1960 }, { "epoch": 9.179637214745465, "grad_norm": 0.44564807415008545, "learning_rate": 1.9054215852650437e-06, "loss": 1.3216, "step": 1961 }, { "epoch": 9.18431831480398, "grad_norm": 0.45024698972702026, "learning_rate": 1.8830811347085697e-06, "loss": 1.3193, "step": 1962 }, { "epoch": 9.188999414862492, "grad_norm": 0.4466053247451782, "learning_rate": 1.8608699111431372e-06, "loss": 1.3144, "step": 1963 }, { "epoch": 9.193680514921006, "grad_norm": 0.44546517729759216, "learning_rate": 1.838787974221151e-06, "loss": 1.3189, "step": 1964 }, { "epoch": 9.19836161497952, "grad_norm": 0.4485017657279968, "learning_rate": 1.8168353832477947e-06, "loss": 1.3376, "step": 1965 }, { "epoch": 9.203042715038034, "grad_norm": 0.4487907290458679, "learning_rate": 1.7950121971808454e-06, "loss": 1.3314, "step": 1966 }, { "epoch": 9.207723815096548, "grad_norm": 0.44457539916038513, "learning_rate": 1.7733184746305698e-06, "loss": 1.3246, "step": 1967 }, { "epoch": 9.212404915155062, "grad_norm": 0.45234352350234985, "learning_rate": 1.7517542738595071e-06, "loss": 1.3324, "step": 1968 }, { "epoch": 9.217086015213575, "grad_norm": 0.44961783289909363, "learning_rate": 1.7303196527823585e-06, "loss": 1.3519, "step": 1969 }, { "epoch": 9.221767115272089, "grad_norm": 0.45592254400253296, "learning_rate": 1.7090146689658083e-06, "loss": 1.3364, "step": 1970 }, { "epoch": 9.226448215330603, "grad_norm": 0.460465669631958, "learning_rate": 1.687839379628381e-06, "loss": 1.3465, "step": 1971 }, { "epoch": 9.231129315389117, "grad_norm": 0.4440554678440094, "learning_rate": 1.666793841640285e-06, "loss": 1.3189, "step": 1972 }, { "epoch": 9.23581041544763, "grad_norm": 0.4493771195411682, "learning_rate": 1.6458781115232359e-06, "loss": 1.3234, "step": 1973 }, { "epoch": 9.240491515506143, "grad_norm": 0.4530016779899597, "learning_rate": 1.6250922454503548e-06, "loss": 1.3211, "step": 1974 }, { "epoch": 9.245172615564657, "grad_norm": 0.4415639638900757, "learning_rate": 1.604436299245965e-06, "loss": 1.3204, "step": 1975 }, { "epoch": 9.249853715623171, "grad_norm": 0.4418812394142151, "learning_rate": 1.5839103283854906e-06, "loss": 1.3197, "step": 1976 }, { "epoch": 9.254534815681685, "grad_norm": 0.4474630653858185, "learning_rate": 1.5635143879952575e-06, "loss": 1.3566, "step": 1977 }, { "epoch": 9.2592159157402, "grad_norm": 0.4447353184223175, "learning_rate": 1.5432485328523815e-06, "loss": 1.3272, "step": 1978 }, { "epoch": 9.263897015798713, "grad_norm": 0.44466260075569153, "learning_rate": 1.5231128173846087e-06, "loss": 1.3395, "step": 1979 }, { "epoch": 9.268578115857226, "grad_norm": 0.44978243112564087, "learning_rate": 1.5031072956701697e-06, "loss": 1.3233, "step": 1980 }, { "epoch": 9.27325921591574, "grad_norm": 0.449532687664032, "learning_rate": 1.4832320214376472e-06, "loss": 1.3264, "step": 1981 }, { "epoch": 9.277940315974254, "grad_norm": 0.4369581341743469, "learning_rate": 1.4634870480658035e-06, "loss": 1.3071, "step": 1982 }, { "epoch": 9.282621416032768, "grad_norm": 0.4466133713722229, "learning_rate": 1.4438724285834594e-06, "loss": 1.3364, "step": 1983 }, { "epoch": 9.287302516091282, "grad_norm": 0.4498167037963867, "learning_rate": 1.4243882156693477e-06, "loss": 1.3335, "step": 1984 }, { "epoch": 9.291983616149794, "grad_norm": 0.4429709017276764, "learning_rate": 1.4050344616519662e-06, "loss": 1.33, "step": 1985 }, { "epoch": 9.296664716208308, "grad_norm": 0.439984530210495, "learning_rate": 1.3858112185094418e-06, "loss": 1.3171, "step": 1986 }, { "epoch": 9.301345816266823, "grad_norm": 0.43798401951789856, "learning_rate": 1.3667185378694048e-06, "loss": 1.3388, "step": 1987 }, { "epoch": 9.306026916325337, "grad_norm": 0.4463445544242859, "learning_rate": 1.3477564710088098e-06, "loss": 1.3281, "step": 1988 }, { "epoch": 9.31070801638385, "grad_norm": 0.43593892455101013, "learning_rate": 1.328925068853837e-06, "loss": 1.3263, "step": 1989 }, { "epoch": 9.315389116442365, "grad_norm": 0.45093831419944763, "learning_rate": 1.3102243819797467e-06, "loss": 1.323, "step": 1990 }, { "epoch": 9.320070216500877, "grad_norm": 0.4551108181476593, "learning_rate": 1.2916544606107305e-06, "loss": 1.3109, "step": 1991 }, { "epoch": 9.324751316559391, "grad_norm": 0.44643521308898926, "learning_rate": 1.273215354619789e-06, "loss": 1.3328, "step": 1992 }, { "epoch": 9.329432416617905, "grad_norm": 0.4388735592365265, "learning_rate": 1.2549071135285917e-06, "loss": 1.3295, "step": 1993 }, { "epoch": 9.33411351667642, "grad_norm": 0.44772210717201233, "learning_rate": 1.2367297865073401e-06, "loss": 1.3407, "step": 1994 }, { "epoch": 9.338794616734933, "grad_norm": 0.44219115376472473, "learning_rate": 1.2186834223746612e-06, "loss": 1.332, "step": 1995 }, { "epoch": 9.343475716793446, "grad_norm": 0.4451064169406891, "learning_rate": 1.20076806959743e-06, "loss": 1.3138, "step": 1996 }, { "epoch": 9.34815681685196, "grad_norm": 0.4520408511161804, "learning_rate": 1.1829837762906915e-06, "loss": 1.3427, "step": 1997 }, { "epoch": 9.352837916910474, "grad_norm": 0.44814303517341614, "learning_rate": 1.1653305902174894e-06, "loss": 1.3238, "step": 1998 }, { "epoch": 9.357519016968988, "grad_norm": 0.44885024428367615, "learning_rate": 1.1478085587887488e-06, "loss": 1.3449, "step": 1999 }, { "epoch": 9.362200117027502, "grad_norm": 0.4502180814743042, "learning_rate": 1.1304177290631824e-06, "loss": 1.3253, "step": 2000 }, { "epoch": 9.366881217086016, "grad_norm": 0.44551151990890503, "learning_rate": 1.113158147747101e-06, "loss": 1.329, "step": 2001 }, { "epoch": 9.371562317144528, "grad_norm": 0.4621962010860443, "learning_rate": 1.0960298611943532e-06, "loss": 1.3406, "step": 2002 }, { "epoch": 9.376243417203042, "grad_norm": 0.4457390010356903, "learning_rate": 1.079032915406153e-06, "loss": 1.3477, "step": 2003 }, { "epoch": 9.380924517261557, "grad_norm": 0.43988901376724243, "learning_rate": 1.0621673560309797e-06, "loss": 1.3197, "step": 2004 }, { "epoch": 9.38560561732007, "grad_norm": 0.4548582136631012, "learning_rate": 1.0454332283644508e-06, "loss": 1.3194, "step": 2005 }, { "epoch": 9.390286717378585, "grad_norm": 0.4404996931552887, "learning_rate": 1.0288305773491936e-06, "loss": 1.3099, "step": 2006 }, { "epoch": 9.394967817437097, "grad_norm": 0.45231539011001587, "learning_rate": 1.01235944757474e-06, "loss": 1.3293, "step": 2007 }, { "epoch": 9.399648917495611, "grad_norm": 0.45580315589904785, "learning_rate": 9.960198832773881e-07, "loss": 1.3271, "step": 2008 }, { "epoch": 9.404330017554125, "grad_norm": 0.4584179222583771, "learning_rate": 9.79811928340102e-07, "loss": 1.3292, "step": 2009 }, { "epoch": 9.40901111761264, "grad_norm": 0.45283496379852295, "learning_rate": 9.637356262923725e-07, "loss": 1.3134, "step": 2010 }, { "epoch": 9.413692217671153, "grad_norm": 0.44626957178115845, "learning_rate": 9.477910203101181e-07, "loss": 1.3412, "step": 2011 }, { "epoch": 9.418373317729667, "grad_norm": 0.4469052255153656, "learning_rate": 9.31978153215557e-07, "loss": 1.341, "step": 2012 }, { "epoch": 9.42305441778818, "grad_norm": 0.44531938433647156, "learning_rate": 9.162970674771176e-07, "loss": 1.3298, "step": 2013 }, { "epoch": 9.427735517846694, "grad_norm": 0.44805341958999634, "learning_rate": 9.007478052092732e-07, "loss": 1.3138, "step": 2014 }, { "epoch": 9.432416617905208, "grad_norm": 0.4502425789833069, "learning_rate": 8.853304081724967e-07, "loss": 1.3216, "step": 2015 }, { "epoch": 9.437097717963722, "grad_norm": 0.45359623432159424, "learning_rate": 8.700449177730774e-07, "loss": 1.3235, "step": 2016 }, { "epoch": 9.441778818022236, "grad_norm": 0.43736910820007324, "learning_rate": 8.548913750630661e-07, "loss": 1.307, "step": 2017 }, { "epoch": 9.446459918080748, "grad_norm": 0.44915610551834106, "learning_rate": 8.398698207401412e-07, "loss": 1.3311, "step": 2018 }, { "epoch": 9.451141018139262, "grad_norm": 0.45041677355766296, "learning_rate": 8.249802951474927e-07, "loss": 1.3438, "step": 2019 }, { "epoch": 9.455822118197776, "grad_norm": 0.44452545046806335, "learning_rate": 8.102228382737331e-07, "loss": 1.3257, "step": 2020 }, { "epoch": 9.46050321825629, "grad_norm": 0.44057121872901917, "learning_rate": 7.955974897527752e-07, "loss": 1.332, "step": 2021 }, { "epoch": 9.465184318314805, "grad_norm": 0.44600802659988403, "learning_rate": 7.81104288863721e-07, "loss": 1.3383, "step": 2022 }, { "epoch": 9.469865418373317, "grad_norm": 0.45320701599121094, "learning_rate": 7.66743274530779e-07, "loss": 1.3159, "step": 2023 }, { "epoch": 9.474546518431831, "grad_norm": 0.44251635670661926, "learning_rate": 7.525144853231469e-07, "loss": 1.3332, "step": 2024 }, { "epoch": 9.479227618490345, "grad_norm": 0.4494597017765045, "learning_rate": 7.384179594548957e-07, "loss": 1.2961, "step": 2025 }, { "epoch": 9.483908718548859, "grad_norm": 0.4432707130908966, "learning_rate": 7.244537347848856e-07, "loss": 1.3427, "step": 2026 }, { "epoch": 9.488589818607373, "grad_norm": 0.44649192690849304, "learning_rate": 7.106218488166505e-07, "loss": 1.3363, "step": 2027 }, { "epoch": 9.493270918665887, "grad_norm": 0.4434208571910858, "learning_rate": 6.969223386983137e-07, "loss": 1.3351, "step": 2028 }, { "epoch": 9.4979520187244, "grad_norm": 0.4437309503555298, "learning_rate": 6.83355241222472e-07, "loss": 1.3356, "step": 2029 }, { "epoch": 9.502633118782914, "grad_norm": 0.4394586384296417, "learning_rate": 6.699205928261066e-07, "loss": 1.3243, "step": 2030 }, { "epoch": 9.507314218841428, "grad_norm": 0.4349606931209564, "learning_rate": 6.566184295904776e-07, "loss": 1.3393, "step": 2031 }, { "epoch": 9.511995318899942, "grad_norm": 0.4388129711151123, "learning_rate": 6.434487872410355e-07, "loss": 1.3338, "step": 2032 }, { "epoch": 9.516676418958456, "grad_norm": 0.44483059644699097, "learning_rate": 6.30411701147321e-07, "loss": 1.3174, "step": 2033 }, { "epoch": 9.52135751901697, "grad_norm": 0.44136685132980347, "learning_rate": 6.175072063228648e-07, "loss": 1.3547, "step": 2034 }, { "epoch": 9.526038619075482, "grad_norm": 0.450206995010376, "learning_rate": 6.047353374251108e-07, "loss": 1.3374, "step": 2035 }, { "epoch": 9.530719719133996, "grad_norm": 0.4519071877002716, "learning_rate": 5.920961287552929e-07, "loss": 1.3303, "step": 2036 }, { "epoch": 9.53540081919251, "grad_norm": 0.44036248326301575, "learning_rate": 5.795896142583801e-07, "loss": 1.3202, "step": 2037 }, { "epoch": 9.540081919251024, "grad_norm": 0.4422554671764374, "learning_rate": 5.672158275229489e-07, "loss": 1.3312, "step": 2038 }, { "epoch": 9.544763019309539, "grad_norm": 0.4471757709980011, "learning_rate": 5.549748017811274e-07, "loss": 1.3342, "step": 2039 }, { "epoch": 9.54944411936805, "grad_norm": 0.4468821883201599, "learning_rate": 5.428665699084789e-07, "loss": 1.3261, "step": 2040 }, { "epoch": 9.554125219426565, "grad_norm": 0.4374167025089264, "learning_rate": 5.308911644239245e-07, "loss": 1.3184, "step": 2041 }, { "epoch": 9.558806319485079, "grad_norm": 0.43856698274612427, "learning_rate": 5.190486174896648e-07, "loss": 1.3377, "step": 2042 }, { "epoch": 9.563487419543593, "grad_norm": 0.4368426501750946, "learning_rate": 5.073389609110579e-07, "loss": 1.3091, "step": 2043 }, { "epoch": 9.568168519602107, "grad_norm": 0.4541669189929962, "learning_rate": 4.95762226136598e-07, "loss": 1.3391, "step": 2044 }, { "epoch": 9.57284961966062, "grad_norm": 0.4393460750579834, "learning_rate": 4.843184442577641e-07, "loss": 1.3178, "step": 2045 }, { "epoch": 9.577530719719134, "grad_norm": 0.44958001375198364, "learning_rate": 4.730076460089716e-07, "loss": 1.3473, "step": 2046 }, { "epoch": 9.582211819777648, "grad_norm": 0.4427577257156372, "learning_rate": 4.618298617674932e-07, "loss": 1.3227, "step": 2047 }, { "epoch": 9.586892919836162, "grad_norm": 0.45502230525016785, "learning_rate": 4.507851215533543e-07, "loss": 1.3424, "step": 2048 }, { "epoch": 9.591574019894676, "grad_norm": 0.4533569812774658, "learning_rate": 4.3987345502927157e-07, "loss": 1.3534, "step": 2049 }, { "epoch": 9.59625511995319, "grad_norm": 0.44390031695365906, "learning_rate": 4.2909489150056994e-07, "loss": 1.3026, "step": 2050 }, { "epoch": 9.600936220011702, "grad_norm": 0.4406379461288452, "learning_rate": 4.184494599151045e-07, "loss": 1.3387, "step": 2051 }, { "epoch": 9.605617320070216, "grad_norm": 0.4497063159942627, "learning_rate": 4.0793718886316665e-07, "loss": 1.3011, "step": 2052 }, { "epoch": 9.61029842012873, "grad_norm": 0.4548267126083374, "learning_rate": 3.975581065774281e-07, "loss": 1.3051, "step": 2053 }, { "epoch": 9.614979520187244, "grad_norm": 0.446915864944458, "learning_rate": 3.873122409328578e-07, "loss": 1.3206, "step": 2054 }, { "epoch": 9.619660620245758, "grad_norm": 0.4462285041809082, "learning_rate": 3.7719961944664985e-07, "loss": 1.3205, "step": 2055 }, { "epoch": 9.62434172030427, "grad_norm": 0.4510735869407654, "learning_rate": 3.672202692781401e-07, "loss": 1.3192, "step": 2056 }, { "epoch": 9.629022820362785, "grad_norm": 0.4520784318447113, "learning_rate": 3.5737421722873423e-07, "loss": 1.3301, "step": 2057 }, { "epoch": 9.633703920421299, "grad_norm": 0.4418693780899048, "learning_rate": 3.476614897418573e-07, "loss": 1.325, "step": 2058 }, { "epoch": 9.638385020479813, "grad_norm": 0.43810001015663147, "learning_rate": 3.380821129028489e-07, "loss": 1.3326, "step": 2059 }, { "epoch": 9.643066120538327, "grad_norm": 0.44491061568260193, "learning_rate": 3.2863611243892923e-07, "loss": 1.3267, "step": 2060 }, { "epoch": 9.647747220596841, "grad_norm": 0.4490460753440857, "learning_rate": 3.193235137190831e-07, "loss": 1.3157, "step": 2061 }, { "epoch": 9.652428320655353, "grad_norm": 0.4372991919517517, "learning_rate": 3.101443417540595e-07, "loss": 1.3611, "step": 2062 }, { "epoch": 9.657109420713867, "grad_norm": 0.4481930434703827, "learning_rate": 3.010986211962219e-07, "loss": 1.3173, "step": 2063 }, { "epoch": 9.661790520772382, "grad_norm": 0.4486005902290344, "learning_rate": 2.921863763395538e-07, "loss": 1.3013, "step": 2064 }, { "epoch": 9.666471620830896, "grad_norm": 0.43497464060783386, "learning_rate": 2.8340763111956437e-07, "loss": 1.3223, "step": 2065 }, { "epoch": 9.67115272088941, "grad_norm": 0.4525913596153259, "learning_rate": 2.7476240911320506e-07, "loss": 1.3495, "step": 2066 }, { "epoch": 9.675833820947922, "grad_norm": 0.4425511360168457, "learning_rate": 2.6625073353884753e-07, "loss": 1.328, "step": 2067 }, { "epoch": 9.680514921006436, "grad_norm": 0.4437705874443054, "learning_rate": 2.578726272561782e-07, "loss": 1.3167, "step": 2068 }, { "epoch": 9.68519602106495, "grad_norm": 0.45476165413856506, "learning_rate": 2.4962811276618146e-07, "loss": 1.323, "step": 2069 }, { "epoch": 9.689877121123464, "grad_norm": 0.4444226324558258, "learning_rate": 2.415172122110343e-07, "loss": 1.3449, "step": 2070 }, { "epoch": 9.694558221181978, "grad_norm": 0.4542228877544403, "learning_rate": 2.3353994737408403e-07, "loss": 1.3285, "step": 2071 }, { "epoch": 9.69923932124049, "grad_norm": 0.4445129632949829, "learning_rate": 2.2569633967977067e-07, "loss": 1.3213, "step": 2072 }, { "epoch": 9.703920421299005, "grad_norm": 0.4466426372528076, "learning_rate": 2.1798641019356025e-07, "loss": 1.3333, "step": 2073 }, { "epoch": 9.708601521357519, "grad_norm": 0.43596237897872925, "learning_rate": 2.1041017962192267e-07, "loss": 1.3328, "step": 2074 }, { "epoch": 9.713282621416033, "grad_norm": 0.4403690695762634, "learning_rate": 2.0296766831223168e-07, "loss": 1.3189, "step": 2075 }, { "epoch": 9.717963721474547, "grad_norm": 0.4439951479434967, "learning_rate": 1.9565889625275946e-07, "loss": 1.322, "step": 2076 }, { "epoch": 9.722644821533061, "grad_norm": 0.44233083724975586, "learning_rate": 1.8848388307257104e-07, "loss": 1.3317, "step": 2077 }, { "epoch": 9.727325921591573, "grad_norm": 0.45015835762023926, "learning_rate": 1.8144264804150214e-07, "loss": 1.3242, "step": 2078 }, { "epoch": 9.732007021650087, "grad_norm": 0.44355273246765137, "learning_rate": 1.7453521007012031e-07, "loss": 1.3198, "step": 2079 }, { "epoch": 9.736688121708601, "grad_norm": 0.43759721517562866, "learning_rate": 1.6776158770963613e-07, "loss": 1.348, "step": 2080 }, { "epoch": 9.741369221767116, "grad_norm": 0.43967142701148987, "learning_rate": 1.611217991518754e-07, "loss": 1.3111, "step": 2081 }, { "epoch": 9.74605032182563, "grad_norm": 0.43936997652053833, "learning_rate": 1.5461586222924596e-07, "loss": 1.3315, "step": 2082 }, { "epoch": 9.750731421884144, "grad_norm": 0.444767028093338, "learning_rate": 1.4824379441464863e-07, "loss": 1.3306, "step": 2083 }, { "epoch": 9.755412521942656, "grad_norm": 0.44919633865356445, "learning_rate": 1.420056128214664e-07, "loss": 1.3198, "step": 2084 }, { "epoch": 9.76009362200117, "grad_norm": 0.4446408152580261, "learning_rate": 1.3590133420350315e-07, "loss": 1.3278, "step": 2085 }, { "epoch": 9.764774722059684, "grad_norm": 0.4420437812805176, "learning_rate": 1.2993097495493935e-07, "loss": 1.3339, "step": 2086 }, { "epoch": 9.769455822118198, "grad_norm": 0.4489443302154541, "learning_rate": 1.2409455111030422e-07, "loss": 1.3413, "step": 2087 }, { "epoch": 9.774136922176712, "grad_norm": 0.4377923607826233, "learning_rate": 1.1839207834439258e-07, "loss": 1.3148, "step": 2088 }, { "epoch": 9.778818022235225, "grad_norm": 0.4404031038284302, "learning_rate": 1.1282357197226478e-07, "loss": 1.3239, "step": 2089 }, { "epoch": 9.783499122293739, "grad_norm": 0.4418086111545563, "learning_rate": 1.0738904694919116e-07, "loss": 1.3247, "step": 2090 }, { "epoch": 9.788180222352253, "grad_norm": 0.4544632136821747, "learning_rate": 1.0208851787060215e-07, "loss": 1.3373, "step": 2091 }, { "epoch": 9.792861322410767, "grad_norm": 0.4450118839740753, "learning_rate": 9.692199897206044e-08, "loss": 1.3194, "step": 2092 }, { "epoch": 9.79754242246928, "grad_norm": 0.4508454203605652, "learning_rate": 9.18895041292278e-08, "loss": 1.3307, "step": 2093 }, { "epoch": 9.802223522527793, "grad_norm": 0.4486829340457916, "learning_rate": 8.699104685779835e-08, "loss": 1.3247, "step": 2094 }, { "epoch": 9.806904622586307, "grad_norm": 0.44039294123649597, "learning_rate": 8.222664031350413e-08, "loss": 1.3265, "step": 2095 }, { "epoch": 9.811585722644821, "grad_norm": 0.44150322675704956, "learning_rate": 7.759629729204854e-08, "loss": 1.3311, "step": 2096 }, { "epoch": 9.816266822703335, "grad_norm": 0.4456532895565033, "learning_rate": 7.310003022908407e-08, "loss": 1.336, "step": 2097 }, { "epoch": 9.82094792276185, "grad_norm": 0.4442593455314636, "learning_rate": 6.873785120017906e-08, "loss": 1.3007, "step": 2098 }, { "epoch": 9.825629022820364, "grad_norm": 0.4418592154979706, "learning_rate": 6.450977192077879e-08, "loss": 1.3156, "step": 2099 }, { "epoch": 9.830310122878876, "grad_norm": 0.4495104253292084, "learning_rate": 6.041580374618328e-08, "loss": 1.3298, "step": 2100 }, { "epoch": 9.83499122293739, "grad_norm": 0.44459906220436096, "learning_rate": 5.6455957671508466e-08, "loss": 1.3133, "step": 2101 }, { "epoch": 9.839672322995904, "grad_norm": 0.4395291805267334, "learning_rate": 5.263024433166397e-08, "loss": 1.3444, "step": 2102 }, { "epoch": 9.844353423054418, "grad_norm": 0.44066277146339417, "learning_rate": 4.89386740013198e-08, "loss": 1.3244, "step": 2103 }, { "epoch": 9.849034523112932, "grad_norm": 0.43951621651649475, "learning_rate": 4.5381256594878574e-08, "loss": 1.3599, "step": 2104 }, { "epoch": 9.853715623171446, "grad_norm": 0.44329434633255005, "learning_rate": 4.195800166644781e-08, "loss": 1.3467, "step": 2105 }, { "epoch": 9.858396723229959, "grad_norm": 0.44371113181114197, "learning_rate": 3.866891840982323e-08, "loss": 1.3412, "step": 2106 }, { "epoch": 9.863077823288473, "grad_norm": 0.43377214670181274, "learning_rate": 3.551401565844992e-08, "loss": 1.314, "step": 2107 }, { "epoch": 9.867758923346987, "grad_norm": 0.4374937415122986, "learning_rate": 3.249330188541122e-08, "loss": 1.3343, "step": 2108 }, { "epoch": 9.8724400234055, "grad_norm": 0.44057637453079224, "learning_rate": 2.960678520340099e-08, "loss": 1.3129, "step": 2109 }, { "epoch": 9.877121123464015, "grad_norm": 0.4521021246910095, "learning_rate": 2.685447336469582e-08, "loss": 1.3492, "step": 2110 }, { "epoch": 9.881802223522527, "grad_norm": 0.44130635261535645, "learning_rate": 2.423637376114396e-08, "loss": 1.3382, "step": 2111 }, { "epoch": 9.886483323581041, "grad_norm": 0.4505654573440552, "learning_rate": 2.1752493424148645e-08, "loss": 1.3508, "step": 2112 }, { "epoch": 9.891164423639555, "grad_norm": 0.452155202627182, "learning_rate": 1.94028390246348e-08, "loss": 1.3373, "step": 2113 }, { "epoch": 9.89584552369807, "grad_norm": 0.4516679048538208, "learning_rate": 1.718741687303238e-08, "loss": 1.3376, "step": 2114 }, { "epoch": 9.900526623756583, "grad_norm": 0.43587708473205566, "learning_rate": 1.5106232919276375e-08, "loss": 1.3504, "step": 2115 }, { "epoch": 9.905207723815096, "grad_norm": 0.456166535615921, "learning_rate": 1.31592927527735e-08, "loss": 1.3378, "step": 2116 }, { "epoch": 9.90988882387361, "grad_norm": 0.45621541142463684, "learning_rate": 1.1346601602396644e-08, "loss": 1.3327, "step": 2117 }, { "epoch": 9.914569923932124, "grad_norm": 0.4448513090610504, "learning_rate": 9.668164336473773e-09, "loss": 1.3376, "step": 2118 }, { "epoch": 9.919251023990638, "grad_norm": 0.4518662691116333, "learning_rate": 8.123985462749062e-09, "loss": 1.34, "step": 2119 }, { "epoch": 9.923932124049152, "grad_norm": 0.44186970591545105, "learning_rate": 6.714069128416212e-09, "loss": 1.3283, "step": 2120 }, { "epoch": 9.928613224107666, "grad_norm": 0.43974027037620544, "learning_rate": 5.438419120062932e-09, "loss": 1.3303, "step": 2121 }, { "epoch": 9.933294324166178, "grad_norm": 0.4513680636882782, "learning_rate": 4.297038863687597e-09, "loss": 1.3344, "step": 2122 }, { "epoch": 9.937975424224692, "grad_norm": 0.4399128556251526, "learning_rate": 3.2899314246825906e-09, "loss": 1.3169, "step": 2123 }, { "epoch": 9.942656524283207, "grad_norm": 0.4567125737667084, "learning_rate": 2.417099507817655e-09, "loss": 1.3343, "step": 2124 }, { "epoch": 9.94733762434172, "grad_norm": 0.4285227954387665, "learning_rate": 1.6785454572509908e-09, "loss": 1.3151, "step": 2125 }, { "epoch": 9.952018724400235, "grad_norm": 0.4506865441799164, "learning_rate": 1.0742712565070533e-09, "loss": 1.31, "step": 2126 }, { "epoch": 9.956699824458747, "grad_norm": 0.44049522280693054, "learning_rate": 6.04278528482105e-10, "loss": 1.3176, "step": 2127 }, { "epoch": 9.961380924517261, "grad_norm": 0.44814878702163696, "learning_rate": 2.68568535416458e-10, "loss": 1.3382, "step": 2128 }, { "epoch": 9.966062024575775, "grad_norm": 0.4420654773712158, "learning_rate": 6.714217893333264e-11, "loss": 1.326, "step": 2129 }, { "epoch": 9.97074312463429, "grad_norm": 0.4389389753341675, "learning_rate": 0.0, "loss": 1.3419, "step": 2130 }, { "epoch": 9.97074312463429, "step": 2130, "total_flos": 8.067940197159444e+17, "train_loss": 1.4460852847972385, "train_runtime": 7405.4046, "train_samples_per_second": 415.158, "train_steps_per_second": 0.288 } ], "logging_steps": 1.0, "max_steps": 2130, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 128, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.067940197159444e+17, "train_batch_size": 180, "trial_name": null, "trial_params": null }