diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,6143 +1,199 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.458762886597938, - "eval_steps": 97, - "global_step": 865, + "epoch": 0.03235198964736331, + "eval_steps": 386, + "global_step": 25, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.005154639175257732, - "grad_norm": 14.970044136047363, + "epoch": 0.0012940795858945326, + "grad_norm": 1.7405146360397339, "learning_rate": 2.0000000000000003e-06, - "loss": 9.1577, + "loss": 2.4269, "step": 1 }, { - "epoch": 0.005154639175257732, - "eval_loss": 9.00115966796875, - "eval_runtime": 22.9959, - "eval_samples_per_second": 7.132, - "eval_steps_per_second": 1.783, + "epoch": 0.0012940795858945326, + "eval_loss": 2.247628688812256, + "eval_runtime": 189.8853, + "eval_samples_per_second": 3.428, + "eval_steps_per_second": 0.858, "step": 1 }, { - "epoch": 0.010309278350515464, - "grad_norm": 15.33629035949707, + "epoch": 0.002588159171789065, + "grad_norm": 1.6643542051315308, "learning_rate": 4.000000000000001e-06, - "loss": 9.2777, + "loss": 2.2583, "step": 2 }, { - "epoch": 0.015463917525773196, - "grad_norm": 16.92699432373047, + "epoch": 0.0038822387576835974, + "grad_norm": 1.8690767288208008, "learning_rate": 6e-06, - "loss": 8.744, + "loss": 2.2696, "step": 3 }, { - "epoch": 0.020618556701030927, - "grad_norm": 19.3622989654541, + "epoch": 0.00517631834357813, + "grad_norm": 1.828118085861206, "learning_rate": 8.000000000000001e-06, - "loss": 8.545, + "loss": 2.3646, "step": 4 }, { - "epoch": 0.02577319587628866, - "grad_norm": 20.165861129760742, + "epoch": 0.006470397929472662, + "grad_norm": 1.9319926500320435, "learning_rate": 1e-05, - "loss": 9.7134, + "loss": 2.4196, "step": 5 }, { - "epoch": 0.030927835051546393, - "grad_norm": 18.959148406982422, + "epoch": 0.007764477515367195, + "grad_norm": 1.7723782062530518, "learning_rate": 1.2e-05, - "loss": 9.7526, + "loss": 2.4177, "step": 6 }, { - "epoch": 0.03608247422680412, - "grad_norm": 16.62849235534668, + "epoch": 0.009058557101261728, + "grad_norm": 1.9500815868377686, "learning_rate": 1.4000000000000001e-05, - "loss": 8.9248, + "loss": 2.3497, "step": 7 }, { - "epoch": 0.041237113402061855, - "grad_norm": 19.083742141723633, + "epoch": 0.01035263668715626, + "grad_norm": 2.3909075260162354, "learning_rate": 1.6000000000000003e-05, - "loss": 9.0543, + "loss": 2.405, "step": 8 }, { - "epoch": 0.04639175257731959, - "grad_norm": 16.922298431396484, + "epoch": 0.011646716273050793, + "grad_norm": 2.0620856285095215, "learning_rate": 1.8e-05, - "loss": 8.7968, + "loss": 2.4098, "step": 9 }, { - "epoch": 0.05154639175257732, - "grad_norm": 15.013986587524414, + "epoch": 0.012940795858945324, + "grad_norm": 1.8054910898208618, "learning_rate": 2e-05, - "loss": 8.8618, + "loss": 2.1233, "step": 10 }, { - "epoch": 0.05670103092783505, - "grad_norm": 16.929433822631836, + "epoch": 0.014234875444839857, + "grad_norm": 2.190964937210083, "learning_rate": 2.2000000000000003e-05, - "loss": 8.3198, + "loss": 2.3985, "step": 11 }, { - "epoch": 0.061855670103092786, - "grad_norm": 14.889703750610352, + "epoch": 0.01552895503073439, + "grad_norm": 1.9412921667099, "learning_rate": 2.4e-05, - "loss": 8.1047, + "loss": 2.462, "step": 12 }, { - "epoch": 0.06701030927835051, - "grad_norm": 15.865534782409668, + "epoch": 0.016823034616628922, + "grad_norm": 1.9161555767059326, "learning_rate": 2.6000000000000002e-05, - "loss": 7.6863, + "loss": 2.2118, "step": 13 }, { - "epoch": 0.07216494845360824, - "grad_norm": 15.19003963470459, + "epoch": 0.018117114202523456, + "grad_norm": 1.7161599397659302, "learning_rate": 2.8000000000000003e-05, - "loss": 7.779, + "loss": 2.2175, "step": 14 }, { - "epoch": 0.07731958762886598, - "grad_norm": 17.154193878173828, + "epoch": 0.019411193788417987, + "grad_norm": 2.173877000808716, "learning_rate": 3e-05, - "loss": 7.7273, + "loss": 2.2521, "step": 15 }, { - "epoch": 0.08247422680412371, - "grad_norm": 14.37552547454834, + "epoch": 0.02070527337431252, + "grad_norm": 2.0000555515289307, "learning_rate": 3.2000000000000005e-05, - "loss": 7.1202, + "loss": 2.1615, "step": 16 }, { - "epoch": 0.08762886597938144, - "grad_norm": 13.732095718383789, + "epoch": 0.021999352960207053, + "grad_norm": 1.5915080308914185, "learning_rate": 3.4000000000000007e-05, - "loss": 6.9298, + "loss": 1.9522, "step": 17 }, { - "epoch": 0.09278350515463918, - "grad_norm": 13.2391357421875, + "epoch": 0.023293432546101587, + "grad_norm": 1.6972448825836182, "learning_rate": 3.6e-05, - "loss": 6.308, + "loss": 1.7224, "step": 18 }, { - "epoch": 0.0979381443298969, - "grad_norm": 11.839639663696289, + "epoch": 0.024587512131996118, + "grad_norm": 1.7509772777557373, "learning_rate": 3.8e-05, - "loss": 6.1335, + "loss": 2.0414, "step": 19 }, { - "epoch": 0.10309278350515463, - "grad_norm": 11.819024085998535, + "epoch": 0.02588159171789065, + "grad_norm": 1.697340488433838, "learning_rate": 4e-05, - "loss": 5.7396, + "loss": 2.0427, "step": 20 }, { - "epoch": 0.10824742268041238, - "grad_norm": 11.764089584350586, + "epoch": 0.027175671303785183, + "grad_norm": 1.8733758926391602, "learning_rate": 4.2e-05, - "loss": 5.1633, + "loss": 1.6772, "step": 21 }, { - "epoch": 0.1134020618556701, - "grad_norm": 12.526201248168945, + "epoch": 0.028469750889679714, + "grad_norm": 1.6085255146026611, "learning_rate": 4.4000000000000006e-05, - "loss": 4.7586, + "loss": 1.6527, "step": 22 }, { - "epoch": 0.11855670103092783, - "grad_norm": 10.944100379943848, + "epoch": 0.029763830475574248, + "grad_norm": 1.5792337656021118, "learning_rate": 4.600000000000001e-05, - "loss": 4.47, + "loss": 1.6567, "step": 23 }, { - "epoch": 0.12371134020618557, - "grad_norm": 14.008569717407227, + "epoch": 0.03105791006146878, + "grad_norm": 1.4392567873001099, "learning_rate": 4.8e-05, - "loss": 4.2032, + "loss": 1.508, "step": 24 }, { - "epoch": 0.12886597938144329, - "grad_norm": 14.705824851989746, + "epoch": 0.03235198964736331, + "grad_norm": 1.5222433805465698, "learning_rate": 5e-05, - "loss": 3.2385, + "loss": 1.4606, "step": 25 - }, - { - "epoch": 0.13402061855670103, - "grad_norm": 16.897640228271484, - "learning_rate": 5.2000000000000004e-05, - "loss": 3.0181, - "step": 26 - }, - { - "epoch": 0.13917525773195877, - "grad_norm": 15.038681983947754, - "learning_rate": 5.4000000000000005e-05, - "loss": 2.171, - "step": 27 - }, - { - "epoch": 0.14432989690721648, - "grad_norm": 15.12243938446045, - "learning_rate": 5.6000000000000006e-05, - "loss": 2.1848, - "step": 28 - }, - { - "epoch": 0.14948453608247422, - "grad_norm": 12.384464263916016, - "learning_rate": 5.8e-05, - "loss": 1.6113, - "step": 29 - }, - { - "epoch": 0.15463917525773196, - "grad_norm": 10.626302719116211, - "learning_rate": 6e-05, - "loss": 0.8058, - "step": 30 - }, - { - "epoch": 0.15979381443298968, - "grad_norm": 10.231700897216797, - "learning_rate": 6.2e-05, - "loss": 1.1714, - "step": 31 - }, - { - "epoch": 0.16494845360824742, - "grad_norm": 11.243683815002441, - "learning_rate": 6.400000000000001e-05, - "loss": 0.8837, - "step": 32 - }, - { - "epoch": 0.17010309278350516, - "grad_norm": 13.52505874633789, - "learning_rate": 6.6e-05, - "loss": 0.9935, - "step": 33 - }, - { - "epoch": 0.17525773195876287, - "grad_norm": 11.277730941772461, - "learning_rate": 6.800000000000001e-05, - "loss": 0.7997, - "step": 34 - }, - { - "epoch": 0.18041237113402062, - "grad_norm": 8.148871421813965, - "learning_rate": 7e-05, - "loss": 0.5211, - "step": 35 - }, - { - "epoch": 0.18556701030927836, - "grad_norm": 7.315363883972168, - "learning_rate": 7.2e-05, - "loss": 0.5289, - "step": 36 - }, - { - "epoch": 0.19072164948453607, - "grad_norm": 11.134007453918457, - "learning_rate": 7.4e-05, - "loss": 0.4685, - "step": 37 - }, - { - "epoch": 0.1958762886597938, - "grad_norm": 12.332499504089355, - "learning_rate": 7.6e-05, - "loss": 0.6791, - "step": 38 - }, - { - "epoch": 0.20103092783505155, - "grad_norm": 9.159335136413574, - "learning_rate": 7.800000000000001e-05, - "loss": 0.6563, - "step": 39 - }, - { - "epoch": 0.20618556701030927, - "grad_norm": 5.620604038238525, - "learning_rate": 8e-05, - "loss": 0.3402, - "step": 40 - }, - { - "epoch": 0.211340206185567, - "grad_norm": 10.507390975952148, - "learning_rate": 8.2e-05, - "loss": 0.6456, - "step": 41 - }, - { - "epoch": 0.21649484536082475, - "grad_norm": 6.238654613494873, - "learning_rate": 8.4e-05, - "loss": 0.3945, - "step": 42 - }, - { - "epoch": 0.22164948453608246, - "grad_norm": 5.950511455535889, - "learning_rate": 8.6e-05, - "loss": 0.5033, - "step": 43 - }, - { - "epoch": 0.2268041237113402, - "grad_norm": 3.9956507682800293, - "learning_rate": 8.800000000000001e-05, - "loss": 0.4067, - "step": 44 - }, - { - "epoch": 0.23195876288659795, - "grad_norm": 6.797879695892334, - "learning_rate": 9e-05, - "loss": 0.4202, - "step": 45 - }, - { - "epoch": 0.23711340206185566, - "grad_norm": 5.048930644989014, - "learning_rate": 9.200000000000001e-05, - "loss": 0.3595, - "step": 46 - }, - { - "epoch": 0.2422680412371134, - "grad_norm": 4.503322124481201, - "learning_rate": 9.4e-05, - "loss": 0.3251, - "step": 47 - }, - { - "epoch": 0.24742268041237114, - "grad_norm": 6.15740966796875, - "learning_rate": 9.6e-05, - "loss": 0.5591, - "step": 48 - }, - { - "epoch": 0.25257731958762886, - "grad_norm": 4.286350727081299, - "learning_rate": 9.8e-05, - "loss": 0.3105, - "step": 49 - }, - { - "epoch": 0.25773195876288657, - "grad_norm": 5.595468044281006, - "learning_rate": 0.0001, - "loss": 0.2741, - "step": 50 - }, - { - "epoch": 0.26288659793814434, - "grad_norm": 3.9799349308013916, - "learning_rate": 9.999970848314005e-05, - "loss": 0.194, - "step": 51 - }, - { - "epoch": 0.26804123711340205, - "grad_norm": 5.833397388458252, - "learning_rate": 9.999883393595947e-05, - "loss": 0.316, - "step": 52 - }, - { - "epoch": 0.27319587628865977, - "grad_norm": 6.637378215789795, - "learning_rate": 9.999737636865609e-05, - "loss": 0.3404, - "step": 53 - }, - { - "epoch": 0.27835051546391754, - "grad_norm": 8.41789722442627, - "learning_rate": 9.99953357982261e-05, - "loss": 0.5317, - "step": 54 - }, - { - "epoch": 0.28350515463917525, - "grad_norm": 6.7356367111206055, - "learning_rate": 9.999271224846396e-05, - "loss": 0.3503, - "step": 55 - }, - { - "epoch": 0.28865979381443296, - "grad_norm": 3.6454246044158936, - "learning_rate": 9.998950574996199e-05, - "loss": 0.13, - "step": 56 - }, - { - "epoch": 0.29381443298969073, - "grad_norm": 4.760809898376465, - "learning_rate": 9.998571634011015e-05, - "loss": 0.3055, - "step": 57 - }, - { - "epoch": 0.29896907216494845, - "grad_norm": 4.675559043884277, - "learning_rate": 9.998134406309554e-05, - "loss": 0.2142, - "step": 58 - }, - { - "epoch": 0.30412371134020616, - "grad_norm": 5.181208610534668, - "learning_rate": 9.99763889699018e-05, - "loss": 0.332, - "step": 59 - }, - { - "epoch": 0.30927835051546393, - "grad_norm": 6.778941631317139, - "learning_rate": 9.99708511183087e-05, - "loss": 0.2931, - "step": 60 - }, - { - "epoch": 0.31443298969072164, - "grad_norm": 3.8104896545410156, - "learning_rate": 9.996473057289132e-05, - "loss": 0.1599, - "step": 61 - }, - { - "epoch": 0.31958762886597936, - "grad_norm": 4.8184123039245605, - "learning_rate": 9.995802740501933e-05, - "loss": 0.3225, - "step": 62 - }, - { - "epoch": 0.3247422680412371, - "grad_norm": 3.0463945865631104, - "learning_rate": 9.99507416928562e-05, - "loss": 0.2123, - "step": 63 - }, - { - "epoch": 0.32989690721649484, - "grad_norm": 3.5274605751037598, - "learning_rate": 9.994287352135825e-05, - "loss": 0.2277, - "step": 64 - }, - { - "epoch": 0.33505154639175255, - "grad_norm": 3.9188365936279297, - "learning_rate": 9.993442298227365e-05, - "loss": 0.2356, - "step": 65 - }, - { - "epoch": 0.3402061855670103, - "grad_norm": 3.182507038116455, - "learning_rate": 9.99253901741414e-05, - "loss": 0.1928, - "step": 66 - }, - { - "epoch": 0.34536082474226804, - "grad_norm": 5.660282135009766, - "learning_rate": 9.991577520229014e-05, - "loss": 0.2753, - "step": 67 - }, - { - "epoch": 0.35051546391752575, - "grad_norm": 4.6711201667785645, - "learning_rate": 9.99055781788369e-05, - "loss": 0.1546, - "step": 68 - }, - { - "epoch": 0.3556701030927835, - "grad_norm": 6.8249287605285645, - "learning_rate": 9.989479922268588e-05, - "loss": 0.3186, - "step": 69 - }, - { - "epoch": 0.36082474226804123, - "grad_norm": 4.09270715713501, - "learning_rate": 9.988343845952697e-05, - "loss": 0.289, - "step": 70 - }, - { - "epoch": 0.36597938144329895, - "grad_norm": 3.7769954204559326, - "learning_rate": 9.98714960218343e-05, - "loss": 0.2404, - "step": 71 - }, - { - "epoch": 0.3711340206185567, - "grad_norm": 5.468443870544434, - "learning_rate": 9.985897204886481e-05, - "loss": 0.3089, - "step": 72 - }, - { - "epoch": 0.37628865979381443, - "grad_norm": 3.5569119453430176, - "learning_rate": 9.98458666866564e-05, - "loss": 0.2668, - "step": 73 - }, - { - "epoch": 0.38144329896907214, - "grad_norm": 3.872802495956421, - "learning_rate": 9.983218008802648e-05, - "loss": 0.2248, - "step": 74 - }, - { - "epoch": 0.3865979381443299, - "grad_norm": 1.6926909685134888, - "learning_rate": 9.981791241257e-05, - "loss": 0.0595, - "step": 75 - }, - { - "epoch": 0.3917525773195876, - "grad_norm": 2.683844804763794, - "learning_rate": 9.98030638266577e-05, - "loss": 0.101, - "step": 76 - }, - { - "epoch": 0.39690721649484534, - "grad_norm": 2.6626157760620117, - "learning_rate": 9.978763450343407e-05, - "loss": 0.2987, - "step": 77 - }, - { - "epoch": 0.4020618556701031, - "grad_norm": 3.233513116836548, - "learning_rate": 9.977162462281544e-05, - "loss": 0.1434, - "step": 78 - }, - { - "epoch": 0.4072164948453608, - "grad_norm": 3.793926954269409, - "learning_rate": 9.975503437148783e-05, - "loss": 0.1916, - "step": 79 - }, - { - "epoch": 0.41237113402061853, - "grad_norm": 5.386580467224121, - "learning_rate": 9.973786394290474e-05, - "loss": 0.5362, - "step": 80 - }, - { - "epoch": 0.4175257731958763, - "grad_norm": 2.760202407836914, - "learning_rate": 9.972011353728496e-05, - "loss": 0.1676, - "step": 81 - }, - { - "epoch": 0.422680412371134, - "grad_norm": 2.2399468421936035, - "learning_rate": 9.970178336161018e-05, - "loss": 0.1063, - "step": 82 - }, - { - "epoch": 0.42783505154639173, - "grad_norm": 4.6277360916137695, - "learning_rate": 9.968287362962264e-05, - "loss": 0.3259, - "step": 83 - }, - { - "epoch": 0.4329896907216495, - "grad_norm": 2.2781248092651367, - "learning_rate": 9.96633845618225e-05, - "loss": 0.0777, - "step": 84 - }, - { - "epoch": 0.4381443298969072, - "grad_norm": 4.080474376678467, - "learning_rate": 9.96433163854655e-05, - "loss": 0.1406, - "step": 85 - }, - { - "epoch": 0.44329896907216493, - "grad_norm": 2.5774247646331787, - "learning_rate": 9.962266933456008e-05, - "loss": 0.2172, - "step": 86 - }, - { - "epoch": 0.4484536082474227, - "grad_norm": 2.8727529048919678, - "learning_rate": 9.96014436498648e-05, - "loss": 0.1189, - "step": 87 - }, - { - "epoch": 0.4536082474226804, - "grad_norm": 4.630618572235107, - "learning_rate": 9.957963957888542e-05, - "loss": 0.3777, - "step": 88 - }, - { - "epoch": 0.4587628865979381, - "grad_norm": 3.9078369140625, - "learning_rate": 9.955725737587214e-05, - "loss": 0.393, - "step": 89 - }, - { - "epoch": 0.4639175257731959, - "grad_norm": 4.288754940032959, - "learning_rate": 9.953429730181653e-05, - "loss": 0.3116, - "step": 90 - }, - { - "epoch": 0.4690721649484536, - "grad_norm": 4.296566486358643, - "learning_rate": 9.951075962444856e-05, - "loss": 0.2739, - "step": 91 - }, - { - "epoch": 0.4742268041237113, - "grad_norm": 3.088003635406494, - "learning_rate": 9.94866446182334e-05, - "loss": 0.2998, - "step": 92 - }, - { - "epoch": 0.4793814432989691, - "grad_norm": 2.5414445400238037, - "learning_rate": 9.94619525643683e-05, - "loss": 0.1035, - "step": 93 - }, - { - "epoch": 0.4845360824742268, - "grad_norm": 3.514005184173584, - "learning_rate": 9.943668375077925e-05, - "loss": 0.4353, - "step": 94 - }, - { - "epoch": 0.4896907216494845, - "grad_norm": 3.296506404876709, - "learning_rate": 9.941083847211765e-05, - "loss": 0.1903, - "step": 95 - }, - { - "epoch": 0.4948453608247423, - "grad_norm": 3.438098669052124, - "learning_rate": 9.938441702975689e-05, - "loss": 0.286, - "step": 96 - }, - { - "epoch": 0.5, - "grad_norm": 2.7748489379882812, - "learning_rate": 9.93574197317888e-05, - "loss": 0.1028, - "step": 97 - }, - { - "epoch": 0.5, - "eval_loss": 0.14588667452335358, - "eval_runtime": 23.0705, - "eval_samples_per_second": 7.109, - "eval_steps_per_second": 1.777, - "step": 97 - }, - { - "epoch": 0.5051546391752577, - "grad_norm": 2.874598264694214, - "learning_rate": 9.93298468930201e-05, - "loss": 0.212, - "step": 98 - }, - { - "epoch": 0.5103092783505154, - "grad_norm": 6.394598007202148, - "learning_rate": 9.930169883496867e-05, - "loss": 0.3593, - "step": 99 - }, - { - "epoch": 0.5154639175257731, - "grad_norm": 3.2982473373413086, - "learning_rate": 9.927297588585984e-05, - "loss": 0.2588, - "step": 100 - }, - { - "epoch": 0.520618556701031, - "grad_norm": 1.4498558044433594, - "learning_rate": 9.924367838062259e-05, - "loss": 0.0973, - "step": 101 - }, - { - "epoch": 0.5257731958762887, - "grad_norm": 4.289880275726318, - "learning_rate": 9.921380666088558e-05, - "loss": 0.3103, - "step": 102 - }, - { - "epoch": 0.5309278350515464, - "grad_norm": 2.9333765506744385, - "learning_rate": 9.91833610749732e-05, - "loss": 0.2417, - "step": 103 - }, - { - "epoch": 0.5360824742268041, - "grad_norm": 2.2084240913391113, - "learning_rate": 9.915234197790152e-05, - "loss": 0.0934, - "step": 104 - }, - { - "epoch": 0.5412371134020618, - "grad_norm": 4.149613857269287, - "learning_rate": 9.912074973137412e-05, - "loss": 0.4745, - "step": 105 - }, - { - "epoch": 0.5463917525773195, - "grad_norm": 1.4988398551940918, - "learning_rate": 9.908858470377793e-05, - "loss": 0.0984, - "step": 106 - }, - { - "epoch": 0.5515463917525774, - "grad_norm": 2.332045316696167, - "learning_rate": 9.905584727017884e-05, - "loss": 0.1449, - "step": 107 - }, - { - "epoch": 0.5567010309278351, - "grad_norm": 2.528547525405884, - "learning_rate": 9.90225378123174e-05, - "loss": 0.196, - "step": 108 - }, - { - "epoch": 0.5618556701030928, - "grad_norm": 1.529741883277893, - "learning_rate": 9.898865671860438e-05, - "loss": 0.1164, - "step": 109 - }, - { - "epoch": 0.5670103092783505, - "grad_norm": 2.2940495014190674, - "learning_rate": 9.895420438411616e-05, - "loss": 0.0998, - "step": 110 - }, - { - "epoch": 0.5721649484536082, - "grad_norm": 4.618880271911621, - "learning_rate": 9.891918121059019e-05, - "loss": 0.2512, - "step": 111 - }, - { - "epoch": 0.5773195876288659, - "grad_norm": 2.642099142074585, - "learning_rate": 9.888358760642029e-05, - "loss": 0.1523, - "step": 112 - }, - { - "epoch": 0.5824742268041238, - "grad_norm": 1.374038577079773, - "learning_rate": 9.884742398665191e-05, - "loss": 0.0801, - "step": 113 - }, - { - "epoch": 0.5876288659793815, - "grad_norm": 4.316773891448975, - "learning_rate": 9.881069077297723e-05, - "loss": 0.3471, - "step": 114 - }, - { - "epoch": 0.5927835051546392, - "grad_norm": 2.5112979412078857, - "learning_rate": 9.877338839373032e-05, - "loss": 0.121, - "step": 115 - }, - { - "epoch": 0.5979381443298969, - "grad_norm": 2.17130184173584, - "learning_rate": 9.873551728388203e-05, - "loss": 0.1461, - "step": 116 - }, - { - "epoch": 0.6030927835051546, - "grad_norm": 3.4084630012512207, - "learning_rate": 9.869707788503508e-05, - "loss": 0.1443, - "step": 117 - }, - { - "epoch": 0.6082474226804123, - "grad_norm": 1.7009655237197876, - "learning_rate": 9.865807064541877e-05, - "loss": 0.0999, - "step": 118 - }, - { - "epoch": 0.6134020618556701, - "grad_norm": 2.5199472904205322, - "learning_rate": 9.861849601988383e-05, - "loss": 0.1068, - "step": 119 - }, - { - "epoch": 0.6185567010309279, - "grad_norm": 2.2809436321258545, - "learning_rate": 9.857835446989707e-05, - "loss": 0.1214, - "step": 120 - }, - { - "epoch": 0.6237113402061856, - "grad_norm": 0.899928092956543, - "learning_rate": 9.853764646353605e-05, - "loss": 0.0325, - "step": 121 - }, - { - "epoch": 0.6288659793814433, - "grad_norm": 2.0530076026916504, - "learning_rate": 9.849637247548356e-05, - "loss": 0.1237, - "step": 122 - }, - { - "epoch": 0.634020618556701, - "grad_norm": 4.350875377655029, - "learning_rate": 9.845453298702216e-05, - "loss": 0.3754, - "step": 123 - }, - { - "epoch": 0.6391752577319587, - "grad_norm": 5.752893447875977, - "learning_rate": 9.841212848602846e-05, - "loss": 0.3497, - "step": 124 - }, - { - "epoch": 0.6443298969072165, - "grad_norm": 4.320384979248047, - "learning_rate": 9.836915946696759e-05, - "loss": 0.2156, - "step": 125 - }, - { - "epoch": 0.6494845360824743, - "grad_norm": 3.15738582611084, - "learning_rate": 9.832562643088724e-05, - "loss": 0.2713, - "step": 126 - }, - { - "epoch": 0.654639175257732, - "grad_norm": 1.3500375747680664, - "learning_rate": 9.828152988541201e-05, - "loss": 0.0746, - "step": 127 - }, - { - "epoch": 0.6597938144329897, - "grad_norm": 2.566340923309326, - "learning_rate": 9.823687034473735e-05, - "loss": 0.0502, - "step": 128 - }, - { - "epoch": 0.6649484536082474, - "grad_norm": 2.275930404663086, - "learning_rate": 9.81916483296236e-05, - "loss": 0.0944, - "step": 129 - }, - { - "epoch": 0.6701030927835051, - "grad_norm": 3.953136444091797, - "learning_rate": 9.814586436738998e-05, - "loss": 0.3553, - "step": 130 - }, - { - "epoch": 0.6752577319587629, - "grad_norm": 6.270533084869385, - "learning_rate": 9.809951899190835e-05, - "loss": 0.3139, - "step": 131 - }, - { - "epoch": 0.6804123711340206, - "grad_norm": 3.558518409729004, - "learning_rate": 9.805261274359705e-05, - "loss": 0.1273, - "step": 132 - }, - { - "epoch": 0.6855670103092784, - "grad_norm": 2.322179079055786, - "learning_rate": 9.800514616941457e-05, - "loss": 0.0881, - "step": 133 - }, - { - "epoch": 0.6907216494845361, - "grad_norm": 2.145798683166504, - "learning_rate": 9.795711982285316e-05, - "loss": 0.099, - "step": 134 - }, - { - "epoch": 0.6958762886597938, - "grad_norm": 2.0605945587158203, - "learning_rate": 9.790853426393245e-05, - "loss": 0.1114, - "step": 135 - }, - { - "epoch": 0.7010309278350515, - "grad_norm": 2.2030012607574463, - "learning_rate": 9.785939005919278e-05, - "loss": 0.0786, - "step": 136 - }, - { - "epoch": 0.7061855670103093, - "grad_norm": 2.1371002197265625, - "learning_rate": 9.780968778168874e-05, - "loss": 0.0472, - "step": 137 - }, - { - "epoch": 0.711340206185567, - "grad_norm": 2.5156478881835938, - "learning_rate": 9.77594280109824e-05, - "loss": 0.2225, - "step": 138 - }, - { - "epoch": 0.7164948453608248, - "grad_norm": 3.4769577980041504, - "learning_rate": 9.77086113331366e-05, - "loss": 0.1936, - "step": 139 - }, - { - "epoch": 0.7216494845360825, - "grad_norm": 2.1080641746520996, - "learning_rate": 9.765723834070804e-05, - "loss": 0.0703, - "step": 140 - }, - { - "epoch": 0.7268041237113402, - "grad_norm": 2.564608573913574, - "learning_rate": 9.760530963274048e-05, - "loss": 0.1402, - "step": 141 - }, - { - "epoch": 0.7319587628865979, - "grad_norm": 3.0748331546783447, - "learning_rate": 9.755282581475769e-05, - "loss": 0.1316, - "step": 142 - }, - { - "epoch": 0.7371134020618557, - "grad_norm": 2.478696346282959, - "learning_rate": 9.749978749875635e-05, - "loss": 0.1484, - "step": 143 - }, - { - "epoch": 0.7422680412371134, - "grad_norm": 2.250427722930908, - "learning_rate": 9.744619530319899e-05, - "loss": 0.1175, - "step": 144 - }, - { - "epoch": 0.7474226804123711, - "grad_norm": 4.462061405181885, - "learning_rate": 9.739204985300679e-05, - "loss": 0.1377, - "step": 145 - }, - { - "epoch": 0.7525773195876289, - "grad_norm": 3.592965841293335, - "learning_rate": 9.733735177955219e-05, - "loss": 0.158, - "step": 146 - }, - { - "epoch": 0.7577319587628866, - "grad_norm": 3.2920875549316406, - "learning_rate": 9.728210172065162e-05, - "loss": 0.1549, - "step": 147 - }, - { - "epoch": 0.7628865979381443, - "grad_norm": 3.6123318672180176, - "learning_rate": 9.722630032055803e-05, - "loss": 0.2841, - "step": 148 - }, - { - "epoch": 0.7680412371134021, - "grad_norm": 1.8174854516983032, - "learning_rate": 9.716994822995338e-05, - "loss": 0.0614, - "step": 149 - }, - { - "epoch": 0.7731958762886598, - "grad_norm": 3.1688687801361084, - "learning_rate": 9.711304610594104e-05, - "loss": 0.1094, - "step": 150 - }, - { - "epoch": 0.7783505154639175, - "grad_norm": 3.0467987060546875, - "learning_rate": 9.705559461203815e-05, - "loss": 0.1497, - "step": 151 - }, - { - "epoch": 0.7835051546391752, - "grad_norm": 3.366333484649658, - "learning_rate": 9.699759441816787e-05, - "loss": 0.2093, - "step": 152 - }, - { - "epoch": 0.788659793814433, - "grad_norm": 1.9911260604858398, - "learning_rate": 9.69390462006516e-05, - "loss": 0.0747, - "step": 153 - }, - { - "epoch": 0.7938144329896907, - "grad_norm": 2.2760016918182373, - "learning_rate": 9.687995064220102e-05, - "loss": 0.1229, - "step": 154 - }, - { - "epoch": 0.7989690721649485, - "grad_norm": 2.0865273475646973, - "learning_rate": 9.682030843191022e-05, - "loss": 0.106, - "step": 155 - }, - { - "epoch": 0.8041237113402062, - "grad_norm": 2.1471030712127686, - "learning_rate": 9.676012026524755e-05, - "loss": 0.1264, - "step": 156 - }, - { - "epoch": 0.8092783505154639, - "grad_norm": 1.8155637979507446, - "learning_rate": 9.669938684404766e-05, - "loss": 0.096, - "step": 157 - }, - { - "epoch": 0.8144329896907216, - "grad_norm": 1.922451138496399, - "learning_rate": 9.663810887650318e-05, - "loss": 0.1148, - "step": 158 - }, - { - "epoch": 0.8195876288659794, - "grad_norm": 2.0736405849456787, - "learning_rate": 9.657628707715655e-05, - "loss": 0.0584, - "step": 159 - }, - { - "epoch": 0.8247422680412371, - "grad_norm": 2.7208549976348877, - "learning_rate": 9.651392216689165e-05, - "loss": 0.2158, - "step": 160 - }, - { - "epoch": 0.8298969072164949, - "grad_norm": 1.785387396812439, - "learning_rate": 9.645101487292539e-05, - "loss": 0.0931, - "step": 161 - }, - { - "epoch": 0.8350515463917526, - "grad_norm": 3.9423131942749023, - "learning_rate": 9.638756592879922e-05, - "loss": 0.2373, - "step": 162 - }, - { - "epoch": 0.8402061855670103, - "grad_norm": 2.107962131500244, - "learning_rate": 9.632357607437065e-05, - "loss": 0.0856, - "step": 163 - }, - { - "epoch": 0.845360824742268, - "grad_norm": 3.6012613773345947, - "learning_rate": 9.625904605580452e-05, - "loss": 0.2367, - "step": 164 - }, - { - "epoch": 0.8505154639175257, - "grad_norm": 2.260842800140381, - "learning_rate": 9.619397662556435e-05, - "loss": 0.1051, - "step": 165 - }, - { - "epoch": 0.8556701030927835, - "grad_norm": 2.0315983295440674, - "learning_rate": 9.612836854240358e-05, - "loss": 0.0865, - "step": 166 - }, - { - "epoch": 0.8608247422680413, - "grad_norm": 1.805436372756958, - "learning_rate": 9.606222257135675e-05, - "loss": 0.0548, - "step": 167 - }, - { - "epoch": 0.865979381443299, - "grad_norm": 3.063310146331787, - "learning_rate": 9.599553948373045e-05, - "loss": 0.1935, - "step": 168 - }, - { - "epoch": 0.8711340206185567, - "grad_norm": 3.7688794136047363, - "learning_rate": 9.592832005709448e-05, - "loss": 0.3533, - "step": 169 - }, - { - "epoch": 0.8762886597938144, - "grad_norm": 3.773287534713745, - "learning_rate": 9.586056507527266e-05, - "loss": 0.1988, - "step": 170 - }, - { - "epoch": 0.8814432989690721, - "grad_norm": 2.5585780143737793, - "learning_rate": 9.579227532833377e-05, - "loss": 0.0867, - "step": 171 - }, - { - "epoch": 0.8865979381443299, - "grad_norm": 3.5680785179138184, - "learning_rate": 9.572345161258235e-05, - "loss": 0.2216, - "step": 172 - }, - { - "epoch": 0.8917525773195877, - "grad_norm": 2.894637107849121, - "learning_rate": 9.565409473054932e-05, - "loss": 0.2954, - "step": 173 - }, - { - "epoch": 0.8969072164948454, - "grad_norm": 2.3045082092285156, - "learning_rate": 9.558420549098268e-05, - "loss": 0.1069, - "step": 174 - }, - { - "epoch": 0.9020618556701031, - "grad_norm": 1.5784564018249512, - "learning_rate": 9.551378470883812e-05, - "loss": 0.0812, - "step": 175 - }, - { - "epoch": 0.9072164948453608, - "grad_norm": 0.7501576542854309, - "learning_rate": 9.544283320526943e-05, - "loss": 0.0496, - "step": 176 - }, - { - "epoch": 0.9123711340206185, - "grad_norm": 2.6611945629119873, - "learning_rate": 9.537135180761903e-05, - "loss": 0.1711, - "step": 177 - }, - { - "epoch": 0.9175257731958762, - "grad_norm": 2.171032190322876, - "learning_rate": 9.52993413494082e-05, - "loss": 0.1465, - "step": 178 - }, - { - "epoch": 0.9226804123711341, - "grad_norm": 1.442818522453308, - "learning_rate": 9.522680267032742e-05, - "loss": 0.0545, - "step": 179 - }, - { - "epoch": 0.9278350515463918, - "grad_norm": 1.5512781143188477, - "learning_rate": 9.515373661622664e-05, - "loss": 0.0637, - "step": 180 - }, - { - "epoch": 0.9329896907216495, - "grad_norm": 2.2158899307250977, - "learning_rate": 9.508014403910533e-05, - "loss": 0.142, - "step": 181 - }, - { - "epoch": 0.9381443298969072, - "grad_norm": 0.619306743144989, - "learning_rate": 9.500602579710256e-05, - "loss": 0.0265, - "step": 182 - }, - { - "epoch": 0.9432989690721649, - "grad_norm": 0.3452989459037781, - "learning_rate": 9.4931382754487e-05, - "loss": 0.0122, - "step": 183 - }, - { - "epoch": 0.9484536082474226, - "grad_norm": 2.468613386154175, - "learning_rate": 9.485621578164689e-05, - "loss": 0.1154, - "step": 184 - }, - { - "epoch": 0.9536082474226805, - "grad_norm": 1.288886547088623, - "learning_rate": 9.478052575507982e-05, - "loss": 0.0564, - "step": 185 - }, - { - "epoch": 0.9587628865979382, - "grad_norm": 1.4700998067855835, - "learning_rate": 9.470431355738257e-05, - "loss": 0.0598, - "step": 186 - }, - { - "epoch": 0.9639175257731959, - "grad_norm": 1.8623106479644775, - "learning_rate": 9.46275800772407e-05, - "loss": 0.0958, - "step": 187 - }, - { - "epoch": 0.9690721649484536, - "grad_norm": 3.3809943199157715, - "learning_rate": 9.45503262094184e-05, - "loss": 0.1816, - "step": 188 - }, - { - "epoch": 0.9742268041237113, - "grad_norm": 1.7910330295562744, - "learning_rate": 9.447255285474783e-05, - "loss": 0.0803, - "step": 189 - }, - { - "epoch": 0.979381443298969, - "grad_norm": 0.992155134677887, - "learning_rate": 9.439426092011875e-05, - "loss": 0.0589, - "step": 190 - }, - { - "epoch": 0.9845360824742269, - "grad_norm": 2.7251176834106445, - "learning_rate": 9.431545131846797e-05, - "loss": 0.1786, - "step": 191 - }, - { - "epoch": 0.9896907216494846, - "grad_norm": 1.8005549907684326, - "learning_rate": 9.423612496876855e-05, - "loss": 0.1158, - "step": 192 - }, - { - "epoch": 0.9948453608247423, - "grad_norm": 3.290626287460327, - "learning_rate": 9.415628279601923e-05, - "loss": 0.2714, - "step": 193 - }, - { - "epoch": 1.0, - "grad_norm": 1.748504400253296, - "learning_rate": 9.407592573123358e-05, - "loss": 0.1168, - "step": 194 - }, - { - "epoch": 1.0, - "eval_loss": 0.08979101479053497, - "eval_runtime": 23.0231, - "eval_samples_per_second": 7.123, - "eval_steps_per_second": 1.781, - "step": 194 - }, - { - "epoch": 1.0051546391752577, - "grad_norm": 2.3739848136901855, - "learning_rate": 9.39950547114292e-05, - "loss": 0.1064, - "step": 195 - }, - { - "epoch": 1.0103092783505154, - "grad_norm": 1.4507896900177002, - "learning_rate": 9.39136706796167e-05, - "loss": 0.0388, - "step": 196 - }, - { - "epoch": 1.0154639175257731, - "grad_norm": 0.6086592674255371, - "learning_rate": 9.383177458478878e-05, - "loss": 0.0182, - "step": 197 - }, - { - "epoch": 1.0206185567010309, - "grad_norm": 1.4689748287200928, - "learning_rate": 9.374936738190914e-05, - "loss": 0.0843, - "step": 198 - }, - { - "epoch": 1.0257731958762886, - "grad_norm": 4.061261177062988, - "learning_rate": 9.366645003190132e-05, - "loss": 0.219, - "step": 199 - }, - { - "epoch": 1.0309278350515463, - "grad_norm": 2.736973285675049, - "learning_rate": 9.358302350163757e-05, - "loss": 0.2438, - "step": 200 - }, - { - "epoch": 1.0360824742268042, - "grad_norm": 3.9617350101470947, - "learning_rate": 9.349908876392748e-05, - "loss": 0.2772, - "step": 201 - }, - { - "epoch": 1.041237113402062, - "grad_norm": 2.085519790649414, - "learning_rate": 9.341464679750669e-05, - "loss": 0.1464, - "step": 202 - }, - { - "epoch": 1.0463917525773196, - "grad_norm": 4.2934794425964355, - "learning_rate": 9.33296985870255e-05, - "loss": 0.1695, - "step": 203 - }, - { - "epoch": 1.0515463917525774, - "grad_norm": 1.9166585206985474, - "learning_rate": 9.32442451230373e-05, - "loss": 0.0841, - "step": 204 - }, - { - "epoch": 1.056701030927835, - "grad_norm": 3.341161012649536, - "learning_rate": 9.315828740198714e-05, - "loss": 0.3112, - "step": 205 - }, - { - "epoch": 1.0618556701030928, - "grad_norm": 0.9464735388755798, - "learning_rate": 9.30718264262e-05, - "loss": 0.0329, - "step": 206 - }, - { - "epoch": 1.0670103092783505, - "grad_norm": 1.8928874731063843, - "learning_rate": 9.298486320386919e-05, - "loss": 0.1272, - "step": 207 - }, - { - "epoch": 1.0721649484536082, - "grad_norm": 1.175891399383545, - "learning_rate": 9.289739874904449e-05, - "loss": 0.023, - "step": 208 - }, - { - "epoch": 1.077319587628866, - "grad_norm": 1.9952442646026611, - "learning_rate": 9.280943408162046e-05, - "loss": 0.1426, - "step": 209 - }, - { - "epoch": 1.0824742268041236, - "grad_norm": 0.945824384689331, - "learning_rate": 9.272097022732443e-05, - "loss": 0.0435, - "step": 210 - }, - { - "epoch": 1.0876288659793814, - "grad_norm": 3.451200485229492, - "learning_rate": 9.263200821770461e-05, - "loss": 0.2378, - "step": 211 - }, - { - "epoch": 1.0927835051546393, - "grad_norm": 1.6046141386032104, - "learning_rate": 9.254254909011804e-05, - "loss": 0.0563, - "step": 212 - }, - { - "epoch": 1.097938144329897, - "grad_norm": 3.0407392978668213, - "learning_rate": 9.245259388771845e-05, - "loss": 0.2115, - "step": 213 - }, - { - "epoch": 1.1030927835051547, - "grad_norm": 2.295111894607544, - "learning_rate": 9.236214365944418e-05, - "loss": 0.1761, - "step": 214 - }, - { - "epoch": 1.1082474226804124, - "grad_norm": 3.0166642665863037, - "learning_rate": 9.22711994600059e-05, - "loss": 0.1294, - "step": 215 - }, - { - "epoch": 1.1134020618556701, - "grad_norm": 0.8205422759056091, - "learning_rate": 9.217976234987428e-05, - "loss": 0.0279, - "step": 216 - }, - { - "epoch": 1.1185567010309279, - "grad_norm": 1.37704336643219, - "learning_rate": 9.208783339526773e-05, - "loss": 0.0329, - "step": 217 - }, - { - "epoch": 1.1237113402061856, - "grad_norm": 1.7456235885620117, - "learning_rate": 9.199541366813982e-05, - "loss": 0.0357, - "step": 218 - }, - { - "epoch": 1.1288659793814433, - "grad_norm": 0.9686729311943054, - "learning_rate": 9.190250424616693e-05, - "loss": 0.026, - "step": 219 - }, - { - "epoch": 1.134020618556701, - "grad_norm": 1.0743412971496582, - "learning_rate": 9.180910621273555e-05, - "loss": 0.0618, - "step": 220 - }, - { - "epoch": 1.1391752577319587, - "grad_norm": 1.515604853630066, - "learning_rate": 9.171522065692975e-05, - "loss": 0.0923, - "step": 221 - }, - { - "epoch": 1.1443298969072164, - "grad_norm": 1.8872064352035522, - "learning_rate": 9.162084867351842e-05, - "loss": 0.0943, - "step": 222 - }, - { - "epoch": 1.1494845360824741, - "grad_norm": 0.26518696546554565, - "learning_rate": 9.152599136294253e-05, - "loss": 0.0062, - "step": 223 - }, - { - "epoch": 1.1546391752577319, - "grad_norm": 2.0155885219573975, - "learning_rate": 9.14306498313023e-05, - "loss": 0.1104, - "step": 224 - }, - { - "epoch": 1.1597938144329896, - "grad_norm": 1.6406800746917725, - "learning_rate": 9.133482519034428e-05, - "loss": 0.0518, - "step": 225 - }, - { - "epoch": 1.1649484536082475, - "grad_norm": 2.0402371883392334, - "learning_rate": 9.123851855744843e-05, - "loss": 0.1368, - "step": 226 - }, - { - "epoch": 1.1701030927835052, - "grad_norm": 1.0710994005203247, - "learning_rate": 9.114173105561501e-05, - "loss": 0.0246, - "step": 227 - }, - { - "epoch": 1.175257731958763, - "grad_norm": 0.7198122143745422, - "learning_rate": 9.104446381345159e-05, - "loss": 0.0255, - "step": 228 - }, - { - "epoch": 1.1804123711340206, - "grad_norm": 1.8444178104400635, - "learning_rate": 9.094671796515978e-05, - "loss": 0.0691, - "step": 229 - }, - { - "epoch": 1.1855670103092784, - "grad_norm": 0.8436192274093628, - "learning_rate": 9.08484946505221e-05, - "loss": 0.0135, - "step": 230 - }, - { - "epoch": 1.190721649484536, - "grad_norm": 2.076083183288574, - "learning_rate": 9.074979501488867e-05, - "loss": 0.0766, - "step": 231 - }, - { - "epoch": 1.1958762886597938, - "grad_norm": 0.5280607342720032, - "learning_rate": 9.065062020916377e-05, - "loss": 0.0135, - "step": 232 - }, - { - "epoch": 1.2010309278350515, - "grad_norm": 5.431139945983887, - "learning_rate": 9.055097138979252e-05, - "loss": 0.3477, - "step": 233 - }, - { - "epoch": 1.2061855670103092, - "grad_norm": 2.9912867546081543, - "learning_rate": 9.045084971874738e-05, - "loss": 0.1651, - "step": 234 - }, - { - "epoch": 1.211340206185567, - "grad_norm": 3.116377830505371, - "learning_rate": 9.035025636351452e-05, - "loss": 0.0958, - "step": 235 - }, - { - "epoch": 1.2164948453608249, - "grad_norm": 2.6776368618011475, - "learning_rate": 9.024919249708035e-05, - "loss": 0.0679, - "step": 236 - }, - { - "epoch": 1.2216494845360826, - "grad_norm": 1.5956416130065918, - "learning_rate": 9.014765929791768e-05, - "loss": 0.0292, - "step": 237 - }, - { - "epoch": 1.2268041237113403, - "grad_norm": 2.6305689811706543, - "learning_rate": 9.004565794997209e-05, - "loss": 0.0979, - "step": 238 - }, - { - "epoch": 1.231958762886598, - "grad_norm": 0.8152199387550354, - "learning_rate": 8.994318964264809e-05, - "loss": 0.017, - "step": 239 - }, - { - "epoch": 1.2371134020618557, - "grad_norm": 2.0557334423065186, - "learning_rate": 8.984025557079523e-05, - "loss": 0.0471, - "step": 240 - }, - { - "epoch": 1.2422680412371134, - "grad_norm": 1.7707017660140991, - "learning_rate": 8.973685693469423e-05, - "loss": 0.0545, - "step": 241 - }, - { - "epoch": 1.2474226804123711, - "grad_norm": 1.4157006740570068, - "learning_rate": 8.963299494004291e-05, - "loss": 0.0454, - "step": 242 - }, - { - "epoch": 1.2525773195876289, - "grad_norm": 2.159468650817871, - "learning_rate": 8.952867079794218e-05, - "loss": 0.0411, - "step": 243 - }, - { - "epoch": 1.2577319587628866, - "grad_norm": 3.038072109222412, - "learning_rate": 8.942388572488187e-05, - "loss": 0.1618, - "step": 244 - }, - { - "epoch": 1.2628865979381443, - "grad_norm": 2.6952202320098877, - "learning_rate": 8.931864094272663e-05, - "loss": 0.1251, - "step": 245 - }, - { - "epoch": 1.268041237113402, - "grad_norm": 1.2491196393966675, - "learning_rate": 8.921293767870157e-05, - "loss": 0.0339, - "step": 246 - }, - { - "epoch": 1.2731958762886597, - "grad_norm": 1.2598530054092407, - "learning_rate": 8.910677716537806e-05, - "loss": 0.0333, - "step": 247 - }, - { - "epoch": 1.2783505154639174, - "grad_norm": 2.025956869125366, - "learning_rate": 8.900016064065923e-05, - "loss": 0.0672, - "step": 248 - }, - { - "epoch": 1.2835051546391751, - "grad_norm": 2.79536771774292, - "learning_rate": 8.889308934776572e-05, - "loss": 0.1278, - "step": 249 - }, - { - "epoch": 1.2886597938144329, - "grad_norm": 1.9153764247894287, - "learning_rate": 8.8785564535221e-05, - "loss": 0.0495, - "step": 250 - }, - { - "epoch": 1.2938144329896908, - "grad_norm": 2.235368013381958, - "learning_rate": 8.867758745683687e-05, - "loss": 0.0573, - "step": 251 - }, - { - "epoch": 1.2989690721649485, - "grad_norm": 0.9632558226585388, - "learning_rate": 8.85691593716989e-05, - "loss": 0.03, - "step": 252 - }, - { - "epoch": 1.3041237113402062, - "grad_norm": 0.7992522716522217, - "learning_rate": 8.84602815441517e-05, - "loss": 0.0194, - "step": 253 - }, - { - "epoch": 1.309278350515464, - "grad_norm": 1.7946758270263672, - "learning_rate": 8.835095524378414e-05, - "loss": 0.0344, - "step": 254 - }, - { - "epoch": 1.3144329896907216, - "grad_norm": 1.6255040168762207, - "learning_rate": 8.824118174541464e-05, - "loss": 0.0815, - "step": 255 - }, - { - "epoch": 1.3195876288659794, - "grad_norm": 3.2107818126678467, - "learning_rate": 8.81309623290762e-05, - "loss": 0.1047, - "step": 256 - }, - { - "epoch": 1.324742268041237, - "grad_norm": 3.3725228309631348, - "learning_rate": 8.802029828000156e-05, - "loss": 0.132, - "step": 257 - }, - { - "epoch": 1.3298969072164948, - "grad_norm": 3.7058792114257812, - "learning_rate": 8.790919088860814e-05, - "loss": 0.127, - "step": 258 - }, - { - "epoch": 1.3350515463917525, - "grad_norm": 2.2644128799438477, - "learning_rate": 8.779764145048308e-05, - "loss": 0.0506, - "step": 259 - }, - { - "epoch": 1.3402061855670104, - "grad_norm": 1.035138726234436, - "learning_rate": 8.768565126636806e-05, - "loss": 0.0165, - "step": 260 - }, - { - "epoch": 1.3453608247422681, - "grad_norm": 2.1964375972747803, - "learning_rate": 8.757322164214413e-05, - "loss": 0.0643, - "step": 261 - }, - { - "epoch": 1.3505154639175259, - "grad_norm": 1.7320009469985962, - "learning_rate": 8.746035388881655e-05, - "loss": 0.0418, - "step": 262 - }, - { - "epoch": 1.3556701030927836, - "grad_norm": 0.9327526092529297, - "learning_rate": 8.734704932249944e-05, - "loss": 0.0145, - "step": 263 - }, - { - "epoch": 1.3608247422680413, - "grad_norm": 3.7060790061950684, - "learning_rate": 8.723330926440045e-05, - "loss": 0.2553, - "step": 264 - }, - { - "epoch": 1.365979381443299, - "grad_norm": 2.5351924896240234, - "learning_rate": 8.711913504080534e-05, - "loss": 0.0692, - "step": 265 - }, - { - "epoch": 1.3711340206185567, - "grad_norm": 0.6212343573570251, - "learning_rate": 8.70045279830626e-05, - "loss": 0.0092, - "step": 266 - }, - { - "epoch": 1.3762886597938144, - "grad_norm": 5.307930946350098, - "learning_rate": 8.688948942756778e-05, - "loss": 0.2139, - "step": 267 - }, - { - "epoch": 1.3814432989690721, - "grad_norm": 2.6784942150115967, - "learning_rate": 8.677402071574805e-05, - "loss": 0.1175, - "step": 268 - }, - { - "epoch": 1.3865979381443299, - "grad_norm": 3.658653736114502, - "learning_rate": 8.665812319404643e-05, - "loss": 0.1758, - "step": 269 - }, - { - "epoch": 1.3917525773195876, - "grad_norm": 0.7189630270004272, - "learning_rate": 8.654179821390621e-05, - "loss": 0.0173, - "step": 270 - }, - { - "epoch": 1.3969072164948453, - "grad_norm": 1.0501234531402588, - "learning_rate": 8.642504713175508e-05, - "loss": 0.0255, - "step": 271 - }, - { - "epoch": 1.402061855670103, - "grad_norm": 1.4844290018081665, - "learning_rate": 8.630787130898943e-05, - "loss": 0.0522, - "step": 272 - }, - { - "epoch": 1.4072164948453607, - "grad_norm": 2.0968682765960693, - "learning_rate": 8.619027211195836e-05, - "loss": 0.044, - "step": 273 - }, - { - "epoch": 1.4123711340206184, - "grad_norm": 3.405850887298584, - "learning_rate": 8.607225091194779e-05, - "loss": 0.2777, - "step": 274 - }, - { - "epoch": 1.4175257731958764, - "grad_norm": 0.47108590602874756, - "learning_rate": 8.595380908516454e-05, - "loss": 0.0079, - "step": 275 - }, - { - "epoch": 1.422680412371134, - "grad_norm": 5.354682445526123, - "learning_rate": 8.583494801272018e-05, - "loss": 0.2747, - "step": 276 - }, - { - "epoch": 1.4278350515463918, - "grad_norm": 2.489128589630127, - "learning_rate": 8.571566908061497e-05, - "loss": 0.0863, - "step": 277 - }, - { - "epoch": 1.4329896907216495, - "grad_norm": 3.1641483306884766, - "learning_rate": 8.559597367972168e-05, - "loss": 0.1414, - "step": 278 - }, - { - "epoch": 1.4381443298969072, - "grad_norm": 2.6844987869262695, - "learning_rate": 8.547586320576945e-05, - "loss": 0.1925, - "step": 279 - }, - { - "epoch": 1.443298969072165, - "grad_norm": 0.3662746250629425, - "learning_rate": 8.535533905932738e-05, - "loss": 0.0079, - "step": 280 - }, - { - "epoch": 1.4484536082474226, - "grad_norm": 2.521512269973755, - "learning_rate": 8.52344026457883e-05, - "loss": 0.2552, - "step": 281 - }, - { - "epoch": 1.4536082474226804, - "grad_norm": 1.6905772686004639, - "learning_rate": 8.511305537535237e-05, - "loss": 0.0426, - "step": 282 - }, - { - "epoch": 1.458762886597938, - "grad_norm": 2.184577226638794, - "learning_rate": 8.499129866301057e-05, - "loss": 0.0719, - "step": 283 - }, - { - "epoch": 1.463917525773196, - "grad_norm": 2.447618007659912, - "learning_rate": 8.48691339285283e-05, - "loss": 0.1018, - "step": 284 - }, - { - "epoch": 1.4690721649484537, - "grad_norm": 3.0394506454467773, - "learning_rate": 8.474656259642873e-05, - "loss": 0.1984, - "step": 285 - }, - { - "epoch": 1.4742268041237114, - "grad_norm": 0.8660880923271179, - "learning_rate": 8.46235860959763e-05, - "loss": 0.0198, - "step": 286 - }, - { - "epoch": 1.4793814432989691, - "grad_norm": 2.118943929672241, - "learning_rate": 8.450020586115987e-05, - "loss": 0.1439, - "step": 287 - }, - { - "epoch": 1.4845360824742269, - "grad_norm": 1.2160487174987793, - "learning_rate": 8.437642333067625e-05, - "loss": 0.0511, - "step": 288 - }, - { - "epoch": 1.4896907216494846, - "grad_norm": 2.628675699234009, - "learning_rate": 8.42522399479132e-05, - "loss": 0.1341, - "step": 289 - }, - { - "epoch": 1.4948453608247423, - "grad_norm": 1.4727402925491333, - "learning_rate": 8.412765716093272e-05, - "loss": 0.0472, - "step": 290 - }, - { - "epoch": 1.5, - "grad_norm": 1.7666587829589844, - "learning_rate": 8.40026764224541e-05, - "loss": 0.0689, - "step": 291 - }, - { - "epoch": 1.5, - "eval_loss": 0.10398600995540619, - "eval_runtime": 22.9785, - "eval_samples_per_second": 7.137, - "eval_steps_per_second": 1.784, - "step": 291 - }, - { - "epoch": 1.5051546391752577, - "grad_norm": 2.2388651371002197, - "learning_rate": 8.387729918983706e-05, - "loss": 0.0711, - "step": 292 - }, - { - "epoch": 1.5103092783505154, - "grad_norm": 1.5305780172348022, - "learning_rate": 8.375152692506468e-05, - "loss": 0.0564, - "step": 293 - }, - { - "epoch": 1.5154639175257731, - "grad_norm": 3.6230311393737793, - "learning_rate": 8.362536109472636e-05, - "loss": 0.313, - "step": 294 - }, - { - "epoch": 1.5206185567010309, - "grad_norm": 1.3446002006530762, - "learning_rate": 8.349880317000082e-05, - "loss": 0.0531, - "step": 295 - }, - { - "epoch": 1.5257731958762886, - "grad_norm": 1.283772349357605, - "learning_rate": 8.337185462663878e-05, - "loss": 0.086, - "step": 296 - }, - { - "epoch": 1.5309278350515463, - "grad_norm": 1.9061647653579712, - "learning_rate": 8.32445169449459e-05, - "loss": 0.0897, - "step": 297 - }, - { - "epoch": 1.536082474226804, - "grad_norm": 0.9783570766448975, - "learning_rate": 8.311679160976539e-05, - "loss": 0.0401, - "step": 298 - }, - { - "epoch": 1.5412371134020617, - "grad_norm": 1.3108831644058228, - "learning_rate": 8.29886801104608e-05, - "loss": 0.0583, - "step": 299 - }, - { - "epoch": 1.5463917525773194, - "grad_norm": 2.761857032775879, - "learning_rate": 8.286018394089863e-05, - "loss": 0.1497, - "step": 300 - }, - { - "epoch": 1.5515463917525774, - "grad_norm": 1.9834277629852295, - "learning_rate": 8.273130459943086e-05, - "loss": 0.0695, - "step": 301 - }, - { - "epoch": 1.556701030927835, - "grad_norm": 1.5544965267181396, - "learning_rate": 8.260204358887754e-05, - "loss": 0.1192, - "step": 302 - }, - { - "epoch": 1.5618556701030928, - "grad_norm": 0.6277759671211243, - "learning_rate": 8.247240241650918e-05, - "loss": 0.0158, - "step": 303 - }, - { - "epoch": 1.5670103092783505, - "grad_norm": 0.8964148163795471, - "learning_rate": 8.234238259402935e-05, - "loss": 0.0364, - "step": 304 - }, - { - "epoch": 1.5721649484536082, - "grad_norm": 1.8108261823654175, - "learning_rate": 8.221198563755682e-05, - "loss": 0.1198, - "step": 305 - }, - { - "epoch": 1.577319587628866, - "grad_norm": 1.8986718654632568, - "learning_rate": 8.208121306760805e-05, - "loss": 0.0912, - "step": 306 - }, - { - "epoch": 1.5824742268041239, - "grad_norm": 0.25971317291259766, - "learning_rate": 8.195006640907942e-05, - "loss": 0.0047, - "step": 307 - }, - { - "epoch": 1.5876288659793816, - "grad_norm": 0.2667933702468872, - "learning_rate": 8.181854719122939e-05, - "loss": 0.0069, - "step": 308 - }, - { - "epoch": 1.5927835051546393, - "grad_norm": 1.9358141422271729, - "learning_rate": 8.168665694766073e-05, - "loss": 0.0873, - "step": 309 - }, - { - "epoch": 1.597938144329897, - "grad_norm": 1.8080211877822876, - "learning_rate": 8.155439721630264e-05, - "loss": 0.0706, - "step": 310 - }, - { - "epoch": 1.6030927835051547, - "grad_norm": 0.8074958324432373, - "learning_rate": 8.142176953939279e-05, - "loss": 0.0401, - "step": 311 - }, - { - "epoch": 1.6082474226804124, - "grad_norm": 0.34044063091278076, - "learning_rate": 8.128877546345933e-05, - "loss": 0.0088, - "step": 312 - }, - { - "epoch": 1.6134020618556701, - "grad_norm": 2.4520204067230225, - "learning_rate": 8.115541653930286e-05, - "loss": 0.0972, - "step": 313 - }, - { - "epoch": 1.6185567010309279, - "grad_norm": 2.7366955280303955, - "learning_rate": 8.102169432197842e-05, - "loss": 0.1658, - "step": 314 - }, - { - "epoch": 1.6237113402061856, - "grad_norm": 1.3256347179412842, - "learning_rate": 8.088761037077718e-05, - "loss": 0.0465, - "step": 315 - }, - { - "epoch": 1.6288659793814433, - "grad_norm": 0.23313826322555542, - "learning_rate": 8.075316624920848e-05, - "loss": 0.0077, - "step": 316 - }, - { - "epoch": 1.634020618556701, - "grad_norm": 1.9687016010284424, - "learning_rate": 8.061836352498145e-05, - "loss": 0.0768, - "step": 317 - }, - { - "epoch": 1.6391752577319587, - "grad_norm": 1.2009949684143066, - "learning_rate": 8.048320376998673e-05, - "loss": 0.128, - "step": 318 - }, - { - "epoch": 1.6443298969072164, - "grad_norm": 1.9169154167175293, - "learning_rate": 8.034768856027826e-05, - "loss": 0.0903, - "step": 319 - }, - { - "epoch": 1.6494845360824741, - "grad_norm": 2.0573318004608154, - "learning_rate": 8.021181947605473e-05, - "loss": 0.1958, - "step": 320 - }, - { - "epoch": 1.6546391752577319, - "grad_norm": 1.6862727403640747, - "learning_rate": 8.007559810164133e-05, - "loss": 0.1499, - "step": 321 - }, - { - "epoch": 1.6597938144329896, - "grad_norm": 2.078341007232666, - "learning_rate": 7.993902602547113e-05, - "loss": 0.0639, - "step": 322 - }, - { - "epoch": 1.6649484536082473, - "grad_norm": 1.3753212690353394, - "learning_rate": 7.980210484006666e-05, - "loss": 0.0471, - "step": 323 - }, - { - "epoch": 1.670103092783505, - "grad_norm": 2.7627837657928467, - "learning_rate": 7.966483614202128e-05, - "loss": 0.1081, - "step": 324 - }, - { - "epoch": 1.675257731958763, - "grad_norm": 2.5081069469451904, - "learning_rate": 7.952722153198054e-05, - "loss": 0.1212, - "step": 325 - }, - { - "epoch": 1.6804123711340206, - "grad_norm": 2.0369913578033447, - "learning_rate": 7.938926261462366e-05, - "loss": 0.0373, - "step": 326 - }, - { - "epoch": 1.6855670103092784, - "grad_norm": 2.289896249771118, - "learning_rate": 7.925096099864464e-05, - "loss": 0.0828, - "step": 327 - }, - { - "epoch": 1.690721649484536, - "grad_norm": 2.909376621246338, - "learning_rate": 7.911231829673356e-05, - "loss": 0.1396, - "step": 328 - }, - { - "epoch": 1.6958762886597938, - "grad_norm": 2.371479034423828, - "learning_rate": 7.897333612555785e-05, - "loss": 0.0491, - "step": 329 - }, - { - "epoch": 1.7010309278350515, - "grad_norm": 1.4082870483398438, - "learning_rate": 7.883401610574336e-05, - "loss": 0.0393, - "step": 330 - }, - { - "epoch": 1.7061855670103094, - "grad_norm": 3.49005126953125, - "learning_rate": 7.869435986185547e-05, - "loss": 0.1891, - "step": 331 - }, - { - "epoch": 1.7113402061855671, - "grad_norm": 2.8195505142211914, - "learning_rate": 7.855436902238017e-05, - "loss": 0.1249, - "step": 332 - }, - { - "epoch": 1.7164948453608249, - "grad_norm": 1.555109977722168, - "learning_rate": 7.841404521970505e-05, - "loss": 0.1454, - "step": 333 - }, - { - "epoch": 1.7216494845360826, - "grad_norm": 2.4946799278259277, - "learning_rate": 7.82733900901003e-05, - "loss": 0.0818, - "step": 334 - }, - { - "epoch": 1.7268041237113403, - "grad_norm": 0.8252114653587341, - "learning_rate": 7.813240527369959e-05, - "loss": 0.0296, - "step": 335 - }, - { - "epoch": 1.731958762886598, - "grad_norm": 1.7780154943466187, - "learning_rate": 7.799109241448091e-05, - "loss": 0.0818, - "step": 336 - }, - { - "epoch": 1.7371134020618557, - "grad_norm": 0.4910796284675598, - "learning_rate": 7.784945316024756e-05, - "loss": 0.0242, - "step": 337 - }, - { - "epoch": 1.7422680412371134, - "grad_norm": 2.092083215713501, - "learning_rate": 7.770748916260875e-05, - "loss": 0.1391, - "step": 338 - }, - { - "epoch": 1.7474226804123711, - "grad_norm": 1.151160478591919, - "learning_rate": 7.756520207696041e-05, - "loss": 0.0362, - "step": 339 - }, - { - "epoch": 1.7525773195876289, - "grad_norm": 1.400128960609436, - "learning_rate": 7.742259356246593e-05, - "loss": 0.0566, - "step": 340 - }, - { - "epoch": 1.7577319587628866, - "grad_norm": 1.9376780986785889, - "learning_rate": 7.727966528203678e-05, - "loss": 0.1098, - "step": 341 - }, - { - "epoch": 1.7628865979381443, - "grad_norm": 0.3112329840660095, - "learning_rate": 7.71364189023131e-05, - "loss": 0.011, - "step": 342 - }, - { - "epoch": 1.768041237113402, - "grad_norm": 1.3194454908370972, - "learning_rate": 7.699285609364424e-05, - "loss": 0.061, - "step": 343 - }, - { - "epoch": 1.7731958762886597, - "grad_norm": 1.4510351419448853, - "learning_rate": 7.68489785300694e-05, - "loss": 0.0577, - "step": 344 - }, - { - "epoch": 1.7783505154639174, - "grad_norm": 0.23605522513389587, - "learning_rate": 7.670478788929802e-05, - "loss": 0.0058, - "step": 345 - }, - { - "epoch": 1.7835051546391751, - "grad_norm": 2.1307742595672607, - "learning_rate": 7.656028585269018e-05, - "loss": 0.1553, - "step": 346 - }, - { - "epoch": 1.7886597938144329, - "grad_norm": 1.0913602113723755, - "learning_rate": 7.641547410523709e-05, - "loss": 0.0582, - "step": 347 - }, - { - "epoch": 1.7938144329896906, - "grad_norm": 0.9072116017341614, - "learning_rate": 7.627035433554138e-05, - "loss": 0.0298, - "step": 348 - }, - { - "epoch": 1.7989690721649485, - "grad_norm": 0.515763521194458, - "learning_rate": 7.612492823579745e-05, - "loss": 0.01, - "step": 349 - }, - { - "epoch": 1.8041237113402062, - "grad_norm": 0.8141824007034302, - "learning_rate": 7.597919750177168e-05, - "loss": 0.0182, - "step": 350 - }, - { - "epoch": 1.809278350515464, - "grad_norm": 2.7248380184173584, - "learning_rate": 7.583316383278273e-05, - "loss": 0.1133, - "step": 351 - }, - { - "epoch": 1.8144329896907216, - "grad_norm": 2.5838863849639893, - "learning_rate": 7.568682893168164e-05, - "loss": 0.0996, - "step": 352 - }, - { - "epoch": 1.8195876288659794, - "grad_norm": 1.152638554573059, - "learning_rate": 7.554019450483208e-05, - "loss": 0.015, - "step": 353 - }, - { - "epoch": 1.824742268041237, - "grad_norm": 1.3408974409103394, - "learning_rate": 7.539326226209031e-05, - "loss": 0.0599, - "step": 354 - }, - { - "epoch": 1.829896907216495, - "grad_norm": 2.786916494369507, - "learning_rate": 7.524603391678541e-05, - "loss": 0.1024, - "step": 355 - }, - { - "epoch": 1.8350515463917527, - "grad_norm": 0.8519769906997681, - "learning_rate": 7.509851118569915e-05, - "loss": 0.0269, - "step": 356 - }, - { - "epoch": 1.8402061855670104, - "grad_norm": 0.3977208435535431, - "learning_rate": 7.495069578904608e-05, - "loss": 0.009, - "step": 357 - }, - { - "epoch": 1.8453608247422681, - "grad_norm": 0.6682721376419067, - "learning_rate": 7.48025894504534e-05, - "loss": 0.014, - "step": 358 - }, - { - "epoch": 1.8505154639175259, - "grad_norm": 1.844580888748169, - "learning_rate": 7.465419389694092e-05, - "loss": 0.0935, - "step": 359 - }, - { - "epoch": 1.8556701030927836, - "grad_norm": 2.259485960006714, - "learning_rate": 7.450551085890087e-05, - "loss": 0.0708, - "step": 360 - }, - { - "epoch": 1.8608247422680413, - "grad_norm": 0.8421556949615479, - "learning_rate": 7.435654207007773e-05, - "loss": 0.0279, - "step": 361 - }, - { - "epoch": 1.865979381443299, - "grad_norm": 0.994603157043457, - "learning_rate": 7.420728926754803e-05, - "loss": 0.0273, - "step": 362 - }, - { - "epoch": 1.8711340206185567, - "grad_norm": 2.5991320610046387, - "learning_rate": 7.405775419170014e-05, - "loss": 0.1439, - "step": 363 - }, - { - "epoch": 1.8762886597938144, - "grad_norm": 3.4226911067962646, - "learning_rate": 7.390793858621386e-05, - "loss": 0.0539, - "step": 364 - }, - { - "epoch": 1.8814432989690721, - "grad_norm": 1.0767531394958496, - "learning_rate": 7.375784419804019e-05, - "loss": 0.0284, - "step": 365 - }, - { - "epoch": 1.8865979381443299, - "grad_norm": 0.8647091388702393, - "learning_rate": 7.360747277738094e-05, - "loss": 0.0178, - "step": 366 - }, - { - "epoch": 1.8917525773195876, - "grad_norm": 0.6758021116256714, - "learning_rate": 7.345682607766826e-05, - "loss": 0.0092, - "step": 367 - }, - { - "epoch": 1.8969072164948453, - "grad_norm": 0.6764877438545227, - "learning_rate": 7.330590585554428e-05, - "loss": 0.0114, - "step": 368 - }, - { - "epoch": 1.902061855670103, - "grad_norm": 1.526297926902771, - "learning_rate": 7.315471387084056e-05, - "loss": 0.0368, - "step": 369 - }, - { - "epoch": 1.9072164948453607, - "grad_norm": 2.972353219985962, - "learning_rate": 7.300325188655761e-05, - "loss": 0.18, - "step": 370 - }, - { - "epoch": 1.9123711340206184, - "grad_norm": 0.385691374540329, - "learning_rate": 7.285152166884432e-05, - "loss": 0.0104, - "step": 371 - }, - { - "epoch": 1.9175257731958761, - "grad_norm": 4.219260215759277, - "learning_rate": 7.269952498697734e-05, - "loss": 0.1956, - "step": 372 - }, - { - "epoch": 1.922680412371134, - "grad_norm": 0.18770945072174072, - "learning_rate": 7.25472636133405e-05, - "loss": 0.0041, - "step": 373 - }, - { - "epoch": 1.9278350515463918, - "grad_norm": 1.2908581495285034, - "learning_rate": 7.23947393234041e-05, - "loss": 0.0237, - "step": 374 - }, - { - "epoch": 1.9329896907216495, - "grad_norm": 2.868788242340088, - "learning_rate": 7.224195389570422e-05, - "loss": 0.1838, - "step": 375 - }, - { - "epoch": 1.9381443298969072, - "grad_norm": 2.256253719329834, - "learning_rate": 7.208890911182197e-05, - "loss": 0.0469, - "step": 376 - }, - { - "epoch": 1.943298969072165, - "grad_norm": 1.5368409156799316, - "learning_rate": 7.193560675636277e-05, - "loss": 0.0516, - "step": 377 - }, - { - "epoch": 1.9484536082474226, - "grad_norm": 0.9154139757156372, - "learning_rate": 7.178204861693545e-05, - "loss": 0.0178, - "step": 378 - }, - { - "epoch": 1.9536082474226806, - "grad_norm": 0.3536202311515808, - "learning_rate": 7.162823648413151e-05, - "loss": 0.0065, - "step": 379 - }, - { - "epoch": 1.9587628865979383, - "grad_norm": 3.183803081512451, - "learning_rate": 7.14741721515041e-05, - "loss": 0.1542, - "step": 380 - }, - { - "epoch": 1.963917525773196, - "grad_norm": 2.011648416519165, - "learning_rate": 7.131985741554728e-05, - "loss": 0.0602, - "step": 381 - }, - { - "epoch": 1.9690721649484537, - "grad_norm": 2.658155679702759, - "learning_rate": 7.116529407567489e-05, - "loss": 0.1196, - "step": 382 - }, - { - "epoch": 1.9742268041237114, - "grad_norm": 1.5329421758651733, - "learning_rate": 7.101048393419977e-05, - "loss": 0.0444, - "step": 383 - }, - { - "epoch": 1.9793814432989691, - "grad_norm": 0.7631077766418457, - "learning_rate": 7.085542879631253e-05, - "loss": 0.0241, - "step": 384 - }, - { - "epoch": 1.9845360824742269, - "grad_norm": 2.961472749710083, - "learning_rate": 7.070013047006068e-05, - "loss": 0.1172, - "step": 385 - }, - { - "epoch": 1.9896907216494846, - "grad_norm": 1.429437279701233, - "learning_rate": 7.054459076632743e-05, - "loss": 0.0483, - "step": 386 - }, - { - "epoch": 1.9948453608247423, - "grad_norm": 2.7817625999450684, - "learning_rate": 7.038881149881058e-05, - "loss": 0.1143, - "step": 387 - }, - { - "epoch": 2.0, - "grad_norm": 0.6764756441116333, - "learning_rate": 7.02327944840015e-05, - "loss": 0.0185, - "step": 388 - }, - { - "epoch": 2.0, - "eval_loss": 0.07160064578056335, - "eval_runtime": 23.0429, - "eval_samples_per_second": 7.117, - "eval_steps_per_second": 1.779, - "step": 388 - }, - { - "epoch": 2.0051546391752577, - "grad_norm": 0.07525697350502014, - "learning_rate": 7.007654154116377e-05, - "loss": 0.0027, - "step": 389 - }, - { - "epoch": 2.0103092783505154, - "grad_norm": 1.171702265739441, - "learning_rate": 6.992005449231208e-05, - "loss": 0.0673, - "step": 390 - }, - { - "epoch": 2.015463917525773, - "grad_norm": 0.2757341265678406, - "learning_rate": 6.976333516219096e-05, - "loss": 0.008, - "step": 391 - }, - { - "epoch": 2.020618556701031, - "grad_norm": 0.18395589292049408, - "learning_rate": 6.960638537825352e-05, - "loss": 0.0037, - "step": 392 - }, - { - "epoch": 2.0257731958762886, - "grad_norm": 0.2340579330921173, - "learning_rate": 6.944920697064004e-05, - "loss": 0.0073, - "step": 393 - }, - { - "epoch": 2.0309278350515463, - "grad_norm": 0.7616062164306641, - "learning_rate": 6.929180177215678e-05, - "loss": 0.0352, - "step": 394 - }, - { - "epoch": 2.036082474226804, - "grad_norm": 0.4245890974998474, - "learning_rate": 6.91341716182545e-05, - "loss": 0.0126, - "step": 395 - }, - { - "epoch": 2.0412371134020617, - "grad_norm": 2.350367546081543, - "learning_rate": 6.897631834700709e-05, - "loss": 0.0362, - "step": 396 - }, - { - "epoch": 2.0463917525773194, - "grad_norm": 1.0493096113204956, - "learning_rate": 6.881824379909017e-05, - "loss": 0.0292, - "step": 397 - }, - { - "epoch": 2.051546391752577, - "grad_norm": 1.125183343887329, - "learning_rate": 6.865994981775957e-05, - "loss": 0.0508, - "step": 398 - }, - { - "epoch": 2.056701030927835, - "grad_norm": 0.7406489253044128, - "learning_rate": 6.850143824882986e-05, - "loss": 0.0189, - "step": 399 - }, - { - "epoch": 2.0618556701030926, - "grad_norm": 1.1769431829452515, - "learning_rate": 6.834271094065283e-05, - "loss": 0.0347, - "step": 400 - }, - { - "epoch": 2.0670103092783507, - "grad_norm": 0.06829158961772919, - "learning_rate": 6.818376974409593e-05, - "loss": 0.0024, - "step": 401 - }, - { - "epoch": 2.0721649484536084, - "grad_norm": 0.5625203847885132, - "learning_rate": 6.802461651252073e-05, - "loss": 0.0107, - "step": 402 - }, - { - "epoch": 2.077319587628866, - "grad_norm": 0.4858710765838623, - "learning_rate": 6.786525310176123e-05, - "loss": 0.0231, - "step": 403 - }, - { - "epoch": 2.082474226804124, - "grad_norm": 0.5900213718414307, - "learning_rate": 6.770568137010226e-05, - "loss": 0.017, - "step": 404 - }, - { - "epoch": 2.0876288659793816, - "grad_norm": 2.3651366233825684, - "learning_rate": 6.754590317825785e-05, - "loss": 0.0628, - "step": 405 - }, - { - "epoch": 2.0927835051546393, - "grad_norm": 1.993862271308899, - "learning_rate": 6.738592038934946e-05, - "loss": 0.0581, - "step": 406 - }, - { - "epoch": 2.097938144329897, - "grad_norm": 1.991409182548523, - "learning_rate": 6.722573486888427e-05, - "loss": 0.055, - "step": 407 - }, - { - "epoch": 2.1030927835051547, - "grad_norm": 0.10092249512672424, - "learning_rate": 6.706534848473352e-05, - "loss": 0.0021, - "step": 408 - }, - { - "epoch": 2.1082474226804124, - "grad_norm": 0.5169625282287598, - "learning_rate": 6.69047631071106e-05, - "loss": 0.0116, - "step": 409 - }, - { - "epoch": 2.11340206185567, - "grad_norm": 0.5807163119316101, - "learning_rate": 6.674398060854931e-05, - "loss": 0.0111, - "step": 410 - }, - { - "epoch": 2.118556701030928, - "grad_norm": 2.279827833175659, - "learning_rate": 6.658300286388203e-05, - "loss": 0.059, - "step": 411 - }, - { - "epoch": 2.1237113402061856, - "grad_norm": 1.3535722494125366, - "learning_rate": 6.642183175021779e-05, - "loss": 0.0326, - "step": 412 - }, - { - "epoch": 2.1288659793814433, - "grad_norm": 0.06376868486404419, - "learning_rate": 6.62604691469205e-05, - "loss": 0.0021, - "step": 413 - }, - { - "epoch": 2.134020618556701, - "grad_norm": 0.6728596091270447, - "learning_rate": 6.609891693558692e-05, - "loss": 0.0139, - "step": 414 - }, - { - "epoch": 2.1391752577319587, - "grad_norm": 0.8516107797622681, - "learning_rate": 6.59371770000248e-05, - "loss": 0.0214, - "step": 415 - }, - { - "epoch": 2.1443298969072164, - "grad_norm": 1.3128242492675781, - "learning_rate": 6.577525122623084e-05, - "loss": 0.029, - "step": 416 - }, - { - "epoch": 2.149484536082474, - "grad_norm": 0.3968404233455658, - "learning_rate": 6.561314150236882e-05, - "loss": 0.0063, - "step": 417 - }, - { - "epoch": 2.154639175257732, - "grad_norm": 2.2075860500335693, - "learning_rate": 6.545084971874738e-05, - "loss": 0.04, - "step": 418 - }, - { - "epoch": 2.1597938144329896, - "grad_norm": 2.252422332763672, - "learning_rate": 6.528837776779819e-05, - "loss": 0.0291, - "step": 419 - }, - { - "epoch": 2.1649484536082473, - "grad_norm": 0.9669781923294067, - "learning_rate": 6.51257275440538e-05, - "loss": 0.0189, - "step": 420 - }, - { - "epoch": 2.170103092783505, - "grad_norm": 0.15717247128486633, - "learning_rate": 6.496290094412546e-05, - "loss": 0.0035, - "step": 421 - }, - { - "epoch": 2.1752577319587627, - "grad_norm": 1.4802827835083008, - "learning_rate": 6.479989986668118e-05, - "loss": 0.0245, - "step": 422 - }, - { - "epoch": 2.1804123711340204, - "grad_norm": 1.0676665306091309, - "learning_rate": 6.463672621242342e-05, - "loss": 0.0145, - "step": 423 - }, - { - "epoch": 2.1855670103092786, - "grad_norm": 3.3371965885162354, - "learning_rate": 6.447338188406704e-05, - "loss": 0.0616, - "step": 424 - }, - { - "epoch": 2.1907216494845363, - "grad_norm": 0.09870132803916931, - "learning_rate": 6.430986878631707e-05, - "loss": 0.0018, - "step": 425 - }, - { - "epoch": 2.195876288659794, - "grad_norm": 2.374746799468994, - "learning_rate": 6.41461888258465e-05, - "loss": 0.0527, - "step": 426 - }, - { - "epoch": 2.2010309278350517, - "grad_norm": 0.7650613188743591, - "learning_rate": 6.398234391127406e-05, - "loss": 0.0067, - "step": 427 - }, - { - "epoch": 2.2061855670103094, - "grad_norm": 0.8953922390937805, - "learning_rate": 6.381833595314195e-05, - "loss": 0.0279, - "step": 428 - }, - { - "epoch": 2.211340206185567, - "grad_norm": 0.2915783226490021, - "learning_rate": 6.365416686389358e-05, - "loss": 0.0035, - "step": 429 - }, - { - "epoch": 2.216494845360825, - "grad_norm": 2.34714412689209, - "learning_rate": 6.348983855785121e-05, - "loss": 0.1444, - "step": 430 - }, - { - "epoch": 2.2216494845360826, - "grad_norm": 0.20125645399093628, - "learning_rate": 6.332535295119377e-05, - "loss": 0.0024, - "step": 431 - }, - { - "epoch": 2.2268041237113403, - "grad_norm": 0.358877032995224, - "learning_rate": 6.31607119619343e-05, - "loss": 0.007, - "step": 432 - }, - { - "epoch": 2.231958762886598, - "grad_norm": 0.021968163549900055, - "learning_rate": 6.299591750989779e-05, - "loss": 0.0006, - "step": 433 - }, - { - "epoch": 2.2371134020618557, - "grad_norm": 0.09418534487485886, - "learning_rate": 6.283097151669869e-05, - "loss": 0.0018, - "step": 434 - }, - { - "epoch": 2.2422680412371134, - "grad_norm": 0.0898284986615181, - "learning_rate": 6.266587590571852e-05, - "loss": 0.0013, - "step": 435 - }, - { - "epoch": 2.247422680412371, - "grad_norm": 0.12911048531532288, - "learning_rate": 6.250063260208346e-05, - "loss": 0.0034, - "step": 436 - }, - { - "epoch": 2.252577319587629, - "grad_norm": 2.0727717876434326, - "learning_rate": 6.233524353264187e-05, - "loss": 0.0277, - "step": 437 - }, - { - "epoch": 2.2577319587628866, - "grad_norm": 0.8507752418518066, - "learning_rate": 6.216971062594179e-05, - "loss": 0.0093, - "step": 438 - }, - { - "epoch": 2.2628865979381443, - "grad_norm": 0.17573043704032898, - "learning_rate": 6.200403581220861e-05, - "loss": 0.0023, - "step": 439 - }, - { - "epoch": 2.268041237113402, - "grad_norm": 1.0571852922439575, - "learning_rate": 6.183822102332234e-05, - "loss": 0.018, - "step": 440 - }, - { - "epoch": 2.2731958762886597, - "grad_norm": 0.14709679782390594, - "learning_rate": 6.167226819279528e-05, - "loss": 0.0022, - "step": 441 - }, - { - "epoch": 2.2783505154639174, - "grad_norm": 0.7632313370704651, - "learning_rate": 6.150617925574933e-05, - "loss": 0.0123, - "step": 442 - }, - { - "epoch": 2.283505154639175, - "grad_norm": 0.09419295191764832, - "learning_rate": 6.13399561488935e-05, - "loss": 0.0016, - "step": 443 - }, - { - "epoch": 2.288659793814433, - "grad_norm": 0.7709088325500488, - "learning_rate": 6.117360081050136e-05, - "loss": 0.0133, - "step": 444 - }, - { - "epoch": 2.2938144329896906, - "grad_norm": 0.6842679977416992, - "learning_rate": 6.1007115180388285e-05, - "loss": 0.0116, - "step": 445 - }, - { - "epoch": 2.2989690721649483, - "grad_norm": 0.0223514623939991, - "learning_rate": 6.0840501199889046e-05, - "loss": 0.0005, - "step": 446 - }, - { - "epoch": 2.304123711340206, - "grad_norm": 1.7335599660873413, - "learning_rate": 6.067376081183499e-05, - "loss": 0.0198, - "step": 447 - }, - { - "epoch": 2.3092783505154637, - "grad_norm": 0.3301581144332886, - "learning_rate": 6.050689596053151e-05, - "loss": 0.0036, - "step": 448 - }, - { - "epoch": 2.3144329896907214, - "grad_norm": 4.583668231964111, - "learning_rate": 6.0339908591735296e-05, - "loss": 0.186, - "step": 449 - }, - { - "epoch": 2.319587628865979, - "grad_norm": 2.8353846073150635, - "learning_rate": 6.01728006526317e-05, - "loss": 0.0844, - "step": 450 - }, - { - "epoch": 2.3247422680412373, - "grad_norm": 1.3765830993652344, - "learning_rate": 6.0005574091811964e-05, - "loss": 0.027, - "step": 451 - }, - { - "epoch": 2.329896907216495, - "grad_norm": 3.2432947158813477, - "learning_rate": 5.9838230859250586e-05, - "loss": 0.2141, - "step": 452 - }, - { - "epoch": 2.3350515463917527, - "grad_norm": 0.536233127117157, - "learning_rate": 5.967077290628249e-05, - "loss": 0.007, - "step": 453 - }, - { - "epoch": 2.3402061855670104, - "grad_norm": 1.7645577192306519, - "learning_rate": 5.950320218558037e-05, - "loss": 0.1288, - "step": 454 - }, - { - "epoch": 2.345360824742268, - "grad_norm": 1.1916286945343018, - "learning_rate": 5.9335520651131814e-05, - "loss": 0.0184, - "step": 455 - }, - { - "epoch": 2.350515463917526, - "grad_norm": 2.7826900482177734, - "learning_rate": 5.9167730258216627e-05, - "loss": 0.1094, - "step": 456 - }, - { - "epoch": 2.3556701030927836, - "grad_norm": 0.09810366481542587, - "learning_rate": 5.899983296338392e-05, - "loss": 0.0017, - "step": 457 - }, - { - "epoch": 2.3608247422680413, - "grad_norm": 2.2794928550720215, - "learning_rate": 5.8831830724429384e-05, - "loss": 0.0475, - "step": 458 - }, - { - "epoch": 2.365979381443299, - "grad_norm": 0.5080482363700867, - "learning_rate": 5.866372550037242e-05, - "loss": 0.0049, - "step": 459 - }, - { - "epoch": 2.3711340206185567, - "grad_norm": 0.8963924050331116, - "learning_rate": 5.849551925143334e-05, - "loss": 0.0171, - "step": 460 - }, - { - "epoch": 2.3762886597938144, - "grad_norm": 1.555064082145691, - "learning_rate": 5.8327213939010414e-05, - "loss": 0.0299, - "step": 461 - }, - { - "epoch": 2.381443298969072, - "grad_norm": 2.659799337387085, - "learning_rate": 5.815881152565712e-05, - "loss": 0.0445, - "step": 462 - }, - { - "epoch": 2.38659793814433, - "grad_norm": 0.690310001373291, - "learning_rate": 5.799031397505913e-05, - "loss": 0.0074, - "step": 463 - }, - { - "epoch": 2.3917525773195876, - "grad_norm": 0.796109139919281, - "learning_rate": 5.782172325201155e-05, - "loss": 0.0095, - "step": 464 - }, - { - "epoch": 2.3969072164948453, - "grad_norm": 0.16811159253120422, - "learning_rate": 5.7653041322395895e-05, - "loss": 0.0038, - "step": 465 - }, - { - "epoch": 2.402061855670103, - "grad_norm": 1.949715495109558, - "learning_rate": 5.748427015315722e-05, - "loss": 0.0131, - "step": 466 - }, - { - "epoch": 2.4072164948453607, - "grad_norm": 1.220193862915039, - "learning_rate": 5.7315411712281186e-05, - "loss": 0.0177, - "step": 467 - }, - { - "epoch": 2.4123711340206184, - "grad_norm": 2.5171456336975098, - "learning_rate": 5.714646796877108e-05, - "loss": 0.0518, - "step": 468 - }, - { - "epoch": 2.417525773195876, - "grad_norm": 0.662219226360321, - "learning_rate": 5.697744089262491e-05, - "loss": 0.011, - "step": 469 - }, - { - "epoch": 2.422680412371134, - "grad_norm": 0.6332148313522339, - "learning_rate": 5.680833245481234e-05, - "loss": 0.0096, - "step": 470 - }, - { - "epoch": 2.4278350515463916, - "grad_norm": 2.559650182723999, - "learning_rate": 5.6639144627251816e-05, - "loss": 0.0927, - "step": 471 - }, - { - "epoch": 2.4329896907216497, - "grad_norm": 4.00107479095459, - "learning_rate": 5.646987938278753e-05, - "loss": 0.1422, - "step": 472 - }, - { - "epoch": 2.4381443298969074, - "grad_norm": 2.220475673675537, - "learning_rate": 5.630053869516635e-05, - "loss": 0.0452, - "step": 473 - }, - { - "epoch": 2.443298969072165, - "grad_norm": 0.17029526829719543, - "learning_rate": 5.6131124539014926e-05, - "loss": 0.0029, - "step": 474 - }, - { - "epoch": 2.448453608247423, - "grad_norm": 1.107358455657959, - "learning_rate": 5.596163888981656e-05, - "loss": 0.0108, - "step": 475 - }, - { - "epoch": 2.4536082474226806, - "grad_norm": 0.07260440289974213, - "learning_rate": 5.5792083723888225e-05, - "loss": 0.0019, - "step": 476 - }, - { - "epoch": 2.4587628865979383, - "grad_norm": 1.1532561779022217, - "learning_rate": 5.5622461018357486e-05, - "loss": 0.0205, - "step": 477 - }, - { - "epoch": 2.463917525773196, - "grad_norm": 1.5596643686294556, - "learning_rate": 5.5452772751139496e-05, - "loss": 0.0361, - "step": 478 - }, - { - "epoch": 2.4690721649484537, - "grad_norm": 0.033296748995780945, - "learning_rate": 5.5283020900913886e-05, - "loss": 0.0008, - "step": 479 - }, - { - "epoch": 2.4742268041237114, - "grad_norm": 1.283778190612793, - "learning_rate": 5.511320744710171e-05, - "loss": 0.02, - "step": 480 - }, - { - "epoch": 2.479381443298969, - "grad_norm": 1.2095892429351807, - "learning_rate": 5.494333436984238e-05, - "loss": 0.0115, - "step": 481 - }, - { - "epoch": 2.484536082474227, - "grad_norm": 2.116044282913208, - "learning_rate": 5.477340364997051e-05, - "loss": 0.0457, - "step": 482 - }, - { - "epoch": 2.4896907216494846, - "grad_norm": 1.7883367538452148, - "learning_rate": 5.460341726899291e-05, - "loss": 0.0319, - "step": 483 - }, - { - "epoch": 2.4948453608247423, - "grad_norm": 0.627947211265564, - "learning_rate": 5.4433377209065414e-05, - "loss": 0.0065, - "step": 484 - }, - { - "epoch": 2.5, - "grad_norm": 0.03853435814380646, - "learning_rate": 5.4263285452969806e-05, - "loss": 0.0007, - "step": 485 - }, - { - "epoch": 2.5, - "eval_loss": 0.09605833142995834, - "eval_runtime": 22.9335, - "eval_samples_per_second": 7.151, - "eval_steps_per_second": 1.788, - "step": 485 - }, - { - "epoch": 2.5051546391752577, - "grad_norm": 0.8205093145370483, - "learning_rate": 5.409314398409067e-05, - "loss": 0.0129, - "step": 486 - }, - { - "epoch": 2.5103092783505154, - "grad_norm": 0.027210909873247147, - "learning_rate": 5.392295478639225e-05, - "loss": 0.0005, - "step": 487 - }, - { - "epoch": 2.515463917525773, - "grad_norm": 0.6312366724014282, - "learning_rate": 5.3752719844395405e-05, - "loss": 0.0075, - "step": 488 - }, - { - "epoch": 2.520618556701031, - "grad_norm": 0.26939788460731506, - "learning_rate": 5.358244114315434e-05, - "loss": 0.0037, - "step": 489 - }, - { - "epoch": 2.5257731958762886, - "grad_norm": 0.18861384689807892, - "learning_rate": 5.341212066823355e-05, - "loss": 0.0028, - "step": 490 - }, - { - "epoch": 2.5309278350515463, - "grad_norm": 0.18018989264965057, - "learning_rate": 5.324176040568465e-05, - "loss": 0.0023, - "step": 491 - }, - { - "epoch": 2.536082474226804, - "grad_norm": 0.2840721011161804, - "learning_rate": 5.307136234202318e-05, - "loss": 0.0036, - "step": 492 - }, - { - "epoch": 2.5412371134020617, - "grad_norm": 0.11515682190656662, - "learning_rate": 5.290092846420548e-05, - "loss": 0.0021, - "step": 493 - }, - { - "epoch": 2.5463917525773194, - "grad_norm": 3.078644037246704, - "learning_rate": 5.27304607596055e-05, - "loss": 0.0914, - "step": 494 - }, - { - "epoch": 2.551546391752577, - "grad_norm": 2.2690036296844482, - "learning_rate": 5.255996121599167e-05, - "loss": 0.0396, - "step": 495 - }, - { - "epoch": 2.556701030927835, - "grad_norm": 2.1380650997161865, - "learning_rate": 5.2389431821503606e-05, - "loss": 0.0346, - "step": 496 - }, - { - "epoch": 2.5618556701030926, - "grad_norm": 0.0314573273062706, - "learning_rate": 5.221887456462907e-05, - "loss": 0.0005, - "step": 497 - }, - { - "epoch": 2.5670103092783503, - "grad_norm": 0.434132844209671, - "learning_rate": 5.2048291434180716e-05, - "loss": 0.0044, - "step": 498 - }, - { - "epoch": 2.572164948453608, - "grad_norm": 0.48740851879119873, - "learning_rate": 5.1877684419272875e-05, - "loss": 0.0067, - "step": 499 - }, - { - "epoch": 2.5773195876288657, - "grad_norm": 0.2123924344778061, - "learning_rate": 5.1707055509298396e-05, - "loss": 0.0027, - "step": 500 - }, - { - "epoch": 2.582474226804124, - "grad_norm": 2.360799789428711, - "learning_rate": 5.153640669390546e-05, - "loss": 0.0923, - "step": 501 - }, - { - "epoch": 2.5876288659793816, - "grad_norm": 2.2972326278686523, - "learning_rate": 5.1365739962974304e-05, - "loss": 0.0257, - "step": 502 - }, - { - "epoch": 2.5927835051546393, - "grad_norm": 0.734491229057312, - "learning_rate": 5.119505730659413e-05, - "loss": 0.0069, - "step": 503 - }, - { - "epoch": 2.597938144329897, - "grad_norm": 0.681890606880188, - "learning_rate": 5.102436071503982e-05, - "loss": 0.0054, - "step": 504 - }, - { - "epoch": 2.6030927835051547, - "grad_norm": 0.29352545738220215, - "learning_rate": 5.0853652178748746e-05, - "loss": 0.0037, - "step": 505 - }, - { - "epoch": 2.6082474226804124, - "grad_norm": 0.1541396975517273, - "learning_rate": 5.068293368829755e-05, - "loss": 0.0014, - "step": 506 - }, - { - "epoch": 2.61340206185567, - "grad_norm": 3.912865161895752, - "learning_rate": 5.0512207234379004e-05, - "loss": 0.06, - "step": 507 - }, - { - "epoch": 2.618556701030928, - "grad_norm": 2.996270179748535, - "learning_rate": 5.0341474807778663e-05, - "loss": 0.0982, - "step": 508 - }, - { - "epoch": 2.6237113402061856, - "grad_norm": 0.02659071795642376, - "learning_rate": 5.017073839935178e-05, - "loss": 0.0006, - "step": 509 - }, - { - "epoch": 2.6288659793814433, - "grad_norm": 0.17042477428913116, - "learning_rate": 5e-05, - "loss": 0.0018, - "step": 510 - }, - { - "epoch": 2.634020618556701, - "grad_norm": 0.02339012175798416, - "learning_rate": 4.982926160064823e-05, - "loss": 0.0004, - "step": 511 - }, - { - "epoch": 2.6391752577319587, - "grad_norm": 2.022866725921631, - "learning_rate": 4.965852519222134e-05, - "loss": 0.0534, - "step": 512 - }, - { - "epoch": 2.6443298969072164, - "grad_norm": 0.9663375020027161, - "learning_rate": 4.948779276562101e-05, - "loss": 0.0172, - "step": 513 - }, - { - "epoch": 2.649484536082474, - "grad_norm": 1.1391932964324951, - "learning_rate": 4.9317066311702456e-05, - "loss": 0.0105, - "step": 514 - }, - { - "epoch": 2.654639175257732, - "grad_norm": 0.11315444111824036, - "learning_rate": 4.9146347821251266e-05, - "loss": 0.0011, - "step": 515 - }, - { - "epoch": 2.6597938144329896, - "grad_norm": 2.212524652481079, - "learning_rate": 4.89756392849602e-05, - "loss": 0.0243, - "step": 516 - }, - { - "epoch": 2.6649484536082473, - "grad_norm": 5.0199294090271, - "learning_rate": 4.880494269340588e-05, - "loss": 0.1882, - "step": 517 - }, - { - "epoch": 2.670103092783505, - "grad_norm": 0.7238196134567261, - "learning_rate": 4.863426003702572e-05, - "loss": 0.0059, - "step": 518 - }, - { - "epoch": 2.675257731958763, - "grad_norm": 3.742496967315674, - "learning_rate": 4.8463593306094555e-05, - "loss": 0.019, - "step": 519 - }, - { - "epoch": 2.680412371134021, - "grad_norm": 0.03769111633300781, - "learning_rate": 4.829294449070161e-05, - "loss": 0.0008, - "step": 520 - }, - { - "epoch": 2.6855670103092786, - "grad_norm": 0.20755574107170105, - "learning_rate": 4.8122315580727136e-05, - "loss": 0.0023, - "step": 521 - }, - { - "epoch": 2.6907216494845363, - "grad_norm": 1.4307230710983276, - "learning_rate": 4.795170856581929e-05, - "loss": 0.0152, - "step": 522 - }, - { - "epoch": 2.695876288659794, - "grad_norm": 0.22543154656887054, - "learning_rate": 4.778112543537094e-05, - "loss": 0.0028, - "step": 523 - }, - { - "epoch": 2.7010309278350517, - "grad_norm": 0.31340834498405457, - "learning_rate": 4.7610568178496405e-05, - "loss": 0.0036, - "step": 524 - }, - { - "epoch": 2.7061855670103094, - "grad_norm": 2.5833778381347656, - "learning_rate": 4.744003878400835e-05, - "loss": 0.0361, - "step": 525 - }, - { - "epoch": 2.711340206185567, - "grad_norm": 3.0378775596618652, - "learning_rate": 4.726953924039451e-05, - "loss": 0.0751, - "step": 526 - }, - { - "epoch": 2.716494845360825, - "grad_norm": 0.24335841834545135, - "learning_rate": 4.709907153579454e-05, - "loss": 0.002, - "step": 527 - }, - { - "epoch": 2.7216494845360826, - "grad_norm": 0.7776563763618469, - "learning_rate": 4.692863765797683e-05, - "loss": 0.0082, - "step": 528 - }, - { - "epoch": 2.7268041237113403, - "grad_norm": 0.052026282995939255, - "learning_rate": 4.675823959431535e-05, - "loss": 0.0009, - "step": 529 - }, - { - "epoch": 2.731958762886598, - "grad_norm": 2.127627372741699, - "learning_rate": 4.658787933176646e-05, - "loss": 0.0297, - "step": 530 - }, - { - "epoch": 2.7371134020618557, - "grad_norm": 0.5500102043151855, - "learning_rate": 4.641755885684566e-05, - "loss": 0.0051, - "step": 531 - }, - { - "epoch": 2.7422680412371134, - "grad_norm": 0.05251020938158035, - "learning_rate": 4.624728015560461e-05, - "loss": 0.0013, - "step": 532 - }, - { - "epoch": 2.747422680412371, - "grad_norm": 0.04040883108973503, - "learning_rate": 4.607704521360776e-05, - "loss": 0.0008, - "step": 533 - }, - { - "epoch": 2.752577319587629, - "grad_norm": 0.2735699415206909, - "learning_rate": 4.590685601590936e-05, - "loss": 0.003, - "step": 534 - }, - { - "epoch": 2.7577319587628866, - "grad_norm": 0.2323101907968521, - "learning_rate": 4.57367145470302e-05, - "loss": 0.003, - "step": 535 - }, - { - "epoch": 2.7628865979381443, - "grad_norm": 0.025867881253361702, - "learning_rate": 4.5566622790934604e-05, - "loss": 0.0006, - "step": 536 - }, - { - "epoch": 2.768041237113402, - "grad_norm": 2.5169122219085693, - "learning_rate": 4.5396582731007095e-05, - "loss": 0.0334, - "step": 537 - }, - { - "epoch": 2.7731958762886597, - "grad_norm": 0.39224404096603394, - "learning_rate": 4.52265963500295e-05, - "loss": 0.0049, - "step": 538 - }, - { - "epoch": 2.7783505154639174, - "grad_norm": 0.15631736814975739, - "learning_rate": 4.505666563015763e-05, - "loss": 0.0025, - "step": 539 - }, - { - "epoch": 2.783505154639175, - "grad_norm": 1.025899052619934, - "learning_rate": 4.4886792552898286e-05, - "loss": 0.0211, - "step": 540 - }, - { - "epoch": 2.788659793814433, - "grad_norm": 0.10308719426393509, - "learning_rate": 4.471697909908613e-05, - "loss": 0.0013, - "step": 541 - }, - { - "epoch": 2.7938144329896906, - "grad_norm": 0.9129118919372559, - "learning_rate": 4.454722724886051e-05, - "loss": 0.0106, - "step": 542 - }, - { - "epoch": 2.7989690721649483, - "grad_norm": 0.37369224429130554, - "learning_rate": 4.437753898164254e-05, - "loss": 0.0043, - "step": 543 - }, - { - "epoch": 2.804123711340206, - "grad_norm": 0.12186601012945175, - "learning_rate": 4.420791627611179e-05, - "loss": 0.0017, - "step": 544 - }, - { - "epoch": 2.8092783505154637, - "grad_norm": 0.03575112298130989, - "learning_rate": 4.403836111018346e-05, - "loss": 0.0005, - "step": 545 - }, - { - "epoch": 2.8144329896907214, - "grad_norm": 0.0675966814160347, - "learning_rate": 4.3868875460985085e-05, - "loss": 0.0008, - "step": 546 - }, - { - "epoch": 2.819587628865979, - "grad_norm": 0.6237895488739014, - "learning_rate": 4.369946130483364e-05, - "loss": 0.0081, - "step": 547 - }, - { - "epoch": 2.824742268041237, - "grad_norm": 0.6258450746536255, - "learning_rate": 4.353012061721249e-05, - "loss": 0.009, - "step": 548 - }, - { - "epoch": 2.829896907216495, - "grad_norm": 1.2090260982513428, - "learning_rate": 4.336085537274818e-05, - "loss": 0.0127, - "step": 549 - }, - { - "epoch": 2.8350515463917527, - "grad_norm": 0.41710153222084045, - "learning_rate": 4.319166754518768e-05, - "loss": 0.0047, - "step": 550 - }, - { - "epoch": 2.8402061855670104, - "grad_norm": 0.38828399777412415, - "learning_rate": 4.3022559107375106e-05, - "loss": 0.0037, - "step": 551 - }, - { - "epoch": 2.845360824742268, - "grad_norm": 0.008606897667050362, - "learning_rate": 4.285353203122893e-05, - "loss": 0.0003, - "step": 552 - }, - { - "epoch": 2.850515463917526, - "grad_norm": 0.21119487285614014, - "learning_rate": 4.268458828771883e-05, - "loss": 0.002, - "step": 553 - }, - { - "epoch": 2.8556701030927836, - "grad_norm": 0.38432881236076355, - "learning_rate": 4.251572984684281e-05, - "loss": 0.0056, - "step": 554 - }, - { - "epoch": 2.8608247422680413, - "grad_norm": 1.544846773147583, - "learning_rate": 4.234695867760412e-05, - "loss": 0.0234, - "step": 555 - }, - { - "epoch": 2.865979381443299, - "grad_norm": 0.02039588801562786, - "learning_rate": 4.2178276747988446e-05, - "loss": 0.0005, - "step": 556 - }, - { - "epoch": 2.8711340206185567, - "grad_norm": 0.2800588309764862, - "learning_rate": 4.200968602494087e-05, - "loss": 0.0035, - "step": 557 - }, - { - "epoch": 2.8762886597938144, - "grad_norm": 2.2519681453704834, - "learning_rate": 4.18411884743429e-05, - "loss": 0.0577, - "step": 558 - }, - { - "epoch": 2.881443298969072, - "grad_norm": 0.7249019742012024, - "learning_rate": 4.16727860609896e-05, - "loss": 0.0064, - "step": 559 - }, - { - "epoch": 2.88659793814433, - "grad_norm": 0.8100419640541077, - "learning_rate": 4.150448074856667e-05, - "loss": 0.0066, - "step": 560 - }, - { - "epoch": 2.8917525773195876, - "grad_norm": 1.3413070440292358, - "learning_rate": 4.1336274499627596e-05, - "loss": 0.0219, - "step": 561 - }, - { - "epoch": 2.8969072164948453, - "grad_norm": 3.6059770584106445, - "learning_rate": 4.1168169275570635e-05, - "loss": 0.0575, - "step": 562 - }, - { - "epoch": 2.902061855670103, - "grad_norm": 4.4368181228637695, - "learning_rate": 4.1000167036616113e-05, - "loss": 0.1659, - "step": 563 - }, - { - "epoch": 2.9072164948453607, - "grad_norm": 0.06619425863027573, - "learning_rate": 4.083226974178339e-05, - "loss": 0.0012, - "step": 564 - }, - { - "epoch": 2.9123711340206184, - "grad_norm": 0.5384992957115173, - "learning_rate": 4.066447934886819e-05, - "loss": 0.007, - "step": 565 - }, - { - "epoch": 2.917525773195876, - "grad_norm": 5.088779926300049, - "learning_rate": 4.049679781441965e-05, - "loss": 0.1876, - "step": 566 - }, - { - "epoch": 2.9226804123711343, - "grad_norm": 2.96576189994812, - "learning_rate": 4.0329227093717515e-05, - "loss": 0.0468, - "step": 567 - }, - { - "epoch": 2.927835051546392, - "grad_norm": 0.0372987799346447, - "learning_rate": 4.016176914074944e-05, - "loss": 0.0007, - "step": 568 - }, - { - "epoch": 2.9329896907216497, - "grad_norm": 1.4958171844482422, - "learning_rate": 3.999442590818804e-05, - "loss": 0.037, - "step": 569 - }, - { - "epoch": 2.9381443298969074, - "grad_norm": 0.24370728433132172, - "learning_rate": 3.982719934736832e-05, - "loss": 0.0029, - "step": 570 - }, - { - "epoch": 2.943298969072165, - "grad_norm": 1.0667810440063477, - "learning_rate": 3.9660091408264716e-05, - "loss": 0.0227, - "step": 571 - }, - { - "epoch": 2.948453608247423, - "grad_norm": 0.06778667867183685, - "learning_rate": 3.949310403946849e-05, - "loss": 0.0012, - "step": 572 - }, - { - "epoch": 2.9536082474226806, - "grad_norm": 1.0466609001159668, - "learning_rate": 3.9326239188165025e-05, - "loss": 0.0082, - "step": 573 - }, - { - "epoch": 2.9587628865979383, - "grad_norm": 0.33205440640449524, - "learning_rate": 3.915949880011096e-05, - "loss": 0.0037, - "step": 574 - }, - { - "epoch": 2.963917525773196, - "grad_norm": 0.016779989004135132, - "learning_rate": 3.899288481961173e-05, - "loss": 0.0004, - "step": 575 - }, - { - "epoch": 2.9690721649484537, - "grad_norm": 0.9853585958480835, - "learning_rate": 3.8826399189498654e-05, - "loss": 0.0098, - "step": 576 - }, - { - "epoch": 2.9742268041237114, - "grad_norm": 0.5929998159408569, - "learning_rate": 3.86600438511065e-05, - "loss": 0.0043, - "step": 577 - }, - { - "epoch": 2.979381443298969, - "grad_norm": 3.938533306121826, - "learning_rate": 3.8493820744250685e-05, - "loss": 0.131, - "step": 578 - }, - { - "epoch": 2.984536082474227, - "grad_norm": 2.7528624534606934, - "learning_rate": 3.832773180720475e-05, - "loss": 0.0992, - "step": 579 - }, - { - "epoch": 2.9896907216494846, - "grad_norm": 1.0223017930984497, - "learning_rate": 3.8161778976677666e-05, - "loss": 0.0069, - "step": 580 - }, - { - "epoch": 2.9948453608247423, - "grad_norm": 0.7486586570739746, - "learning_rate": 3.79959641877914e-05, - "loss": 0.0071, - "step": 581 - }, - { - "epoch": 3.0, - "grad_norm": 0.2129761278629303, - "learning_rate": 3.783028937405821e-05, - "loss": 0.0026, - "step": 582 - }, - { - "epoch": 3.0, - "eval_loss": 0.1013411208987236, - "eval_runtime": 22.9543, - "eval_samples_per_second": 7.145, - "eval_steps_per_second": 1.786, - "step": 582 - }, - { - "epoch": 3.0051546391752577, - "grad_norm": 0.47546765208244324, - "learning_rate": 3.766475646735815e-05, - "loss": 0.0029, - "step": 583 - }, - { - "epoch": 3.0103092783505154, - "grad_norm": 0.024188758805394173, - "learning_rate": 3.7499367397916555e-05, - "loss": 0.0006, - "step": 584 - }, - { - "epoch": 3.015463917525773, - "grad_norm": 0.03356657549738884, - "learning_rate": 3.733412409428148e-05, - "loss": 0.0006, - "step": 585 - }, - { - "epoch": 3.020618556701031, - "grad_norm": 0.19245874881744385, - "learning_rate": 3.716902848330133e-05, - "loss": 0.0025, - "step": 586 - }, - { - "epoch": 3.0257731958762886, - "grad_norm": 0.992292046546936, - "learning_rate": 3.7004082490102226e-05, - "loss": 0.0109, - "step": 587 - }, - { - "epoch": 3.0309278350515463, - "grad_norm": 0.0995234027504921, - "learning_rate": 3.6839288038065734e-05, - "loss": 0.0012, - "step": 588 - }, - { - "epoch": 3.036082474226804, - "grad_norm": 0.07061074674129486, - "learning_rate": 3.667464704880625e-05, - "loss": 0.001, - "step": 589 - }, - { - "epoch": 3.0412371134020617, - "grad_norm": 0.23088815808296204, - "learning_rate": 3.651016144214878e-05, - "loss": 0.003, - "step": 590 - }, - { - "epoch": 3.0463917525773194, - "grad_norm": 0.18078461289405823, - "learning_rate": 3.634583313610644e-05, - "loss": 0.0015, - "step": 591 - }, - { - "epoch": 3.051546391752577, - "grad_norm": 0.22349312901496887, - "learning_rate": 3.618166404685805e-05, - "loss": 0.0019, - "step": 592 - }, - { - "epoch": 3.056701030927835, - "grad_norm": 0.5327549576759338, - "learning_rate": 3.601765608872595e-05, - "loss": 0.0048, - "step": 593 - }, - { - "epoch": 3.0618556701030926, - "grad_norm": 0.10766857862472534, - "learning_rate": 3.585381117415349e-05, - "loss": 0.0015, - "step": 594 - }, - { - "epoch": 3.0670103092783507, - "grad_norm": 0.08378089964389801, - "learning_rate": 3.5690131213682944e-05, - "loss": 0.0014, - "step": 595 - }, - { - "epoch": 3.0721649484536084, - "grad_norm": 0.12249798327684402, - "learning_rate": 3.5526618115932975e-05, - "loss": 0.0014, - "step": 596 - }, - { - "epoch": 3.077319587628866, - "grad_norm": 0.09342166781425476, - "learning_rate": 3.53632737875766e-05, - "loss": 0.001, - "step": 597 - }, - { - "epoch": 3.082474226804124, - "grad_norm": 0.030285261571407318, - "learning_rate": 3.5200100133318834e-05, - "loss": 0.0005, - "step": 598 - }, - { - "epoch": 3.0876288659793816, - "grad_norm": 0.6078401803970337, - "learning_rate": 3.5037099055874536e-05, - "loss": 0.0053, - "step": 599 - }, - { - "epoch": 3.0927835051546393, - "grad_norm": 0.031891413033008575, - "learning_rate": 3.487427245594622e-05, - "loss": 0.0006, - "step": 600 - }, - { - "epoch": 3.097938144329897, - "grad_norm": 0.22419054806232452, - "learning_rate": 3.47116222322018e-05, - "loss": 0.0026, - "step": 601 - }, - { - "epoch": 3.1030927835051547, - "grad_norm": 3.222254753112793, - "learning_rate": 3.4549150281252636e-05, - "loss": 0.0673, - "step": 602 - }, - { - "epoch": 3.1082474226804124, - "grad_norm": 0.07035762071609497, - "learning_rate": 3.4386858497631205e-05, - "loss": 0.0012, - "step": 603 - }, - { - "epoch": 3.11340206185567, - "grad_norm": 0.0403384231030941, - "learning_rate": 3.422474877376917e-05, - "loss": 0.0006, - "step": 604 - }, - { - "epoch": 3.118556701030928, - "grad_norm": 0.8612932562828064, - "learning_rate": 3.406282299997521e-05, - "loss": 0.0059, - "step": 605 - }, - { - "epoch": 3.1237113402061856, - "grad_norm": 0.04132150113582611, - "learning_rate": 3.3901083064413095e-05, - "loss": 0.0005, - "step": 606 - }, - { - "epoch": 3.1288659793814433, - "grad_norm": 0.06787485629320145, - "learning_rate": 3.3739530853079516e-05, - "loss": 0.0009, - "step": 607 - }, - { - "epoch": 3.134020618556701, - "grad_norm": 0.1399659663438797, - "learning_rate": 3.357816824978222e-05, - "loss": 0.0009, - "step": 608 - }, - { - "epoch": 3.1391752577319587, - "grad_norm": 0.4071262776851654, - "learning_rate": 3.341699713611799e-05, - "loss": 0.003, - "step": 609 - }, - { - "epoch": 3.1443298969072164, - "grad_norm": 0.0502166785299778, - "learning_rate": 3.325601939145069e-05, - "loss": 0.0005, - "step": 610 - }, - { - "epoch": 3.149484536082474, - "grad_norm": 0.044071536511182785, - "learning_rate": 3.309523689288941e-05, - "loss": 0.0008, - "step": 611 - }, - { - "epoch": 3.154639175257732, - "grad_norm": 0.00721757160499692, - "learning_rate": 3.293465151526649e-05, - "loss": 0.0002, - "step": 612 - }, - { - "epoch": 3.1597938144329896, - "grad_norm": 0.02341277524828911, - "learning_rate": 3.277426513111575e-05, - "loss": 0.0005, - "step": 613 - }, - { - "epoch": 3.1649484536082473, - "grad_norm": 0.06053034961223602, - "learning_rate": 3.261407961065056e-05, - "loss": 0.001, - "step": 614 - }, - { - "epoch": 3.170103092783505, - "grad_norm": 0.27043405175209045, - "learning_rate": 3.245409682174217e-05, - "loss": 0.0031, - "step": 615 - }, - { - "epoch": 3.1752577319587627, - "grad_norm": 0.03400631621479988, - "learning_rate": 3.229431862989775e-05, - "loss": 0.0006, - "step": 616 - }, - { - "epoch": 3.1804123711340204, - "grad_norm": 1.3103770017623901, - "learning_rate": 3.2134746898238774e-05, - "loss": 0.0111, - "step": 617 - }, - { - "epoch": 3.1855670103092786, - "grad_norm": 0.20999719202518463, - "learning_rate": 3.197538348747927e-05, - "loss": 0.003, - "step": 618 - }, - { - "epoch": 3.1907216494845363, - "grad_norm": 0.04834004119038582, - "learning_rate": 3.181623025590405e-05, - "loss": 0.0005, - "step": 619 - }, - { - "epoch": 3.195876288659794, - "grad_norm": 0.060436930507421494, - "learning_rate": 3.165728905934718e-05, - "loss": 0.0007, - "step": 620 - }, - { - "epoch": 3.2010309278350517, - "grad_norm": 0.12254571914672852, - "learning_rate": 3.149856175117014e-05, - "loss": 0.0014, - "step": 621 - }, - { - "epoch": 3.2061855670103094, - "grad_norm": 0.17747434973716736, - "learning_rate": 3.134005018224044e-05, - "loss": 0.0014, - "step": 622 - }, - { - "epoch": 3.211340206185567, - "grad_norm": 0.12086270749568939, - "learning_rate": 3.118175620090983e-05, - "loss": 0.0013, - "step": 623 - }, - { - "epoch": 3.216494845360825, - "grad_norm": 0.01552607398480177, - "learning_rate": 3.1023681652992926e-05, - "loss": 0.0003, - "step": 624 - }, - { - "epoch": 3.2216494845360826, - "grad_norm": 0.04116373881697655, - "learning_rate": 3.086582838174551e-05, - "loss": 0.0007, - "step": 625 - }, - { - "epoch": 3.2268041237113403, - "grad_norm": 0.018168117851018906, - "learning_rate": 3.070819822784323e-05, - "loss": 0.0003, - "step": 626 - }, - { - "epoch": 3.231958762886598, - "grad_norm": 0.07606551796197891, - "learning_rate": 3.055079302935997e-05, - "loss": 0.0009, - "step": 627 - }, - { - "epoch": 3.2371134020618557, - "grad_norm": 0.0784323439002037, - "learning_rate": 3.0393614621746498e-05, - "loss": 0.0007, - "step": 628 - }, - { - "epoch": 3.2422680412371134, - "grad_norm": 0.07449536770582199, - "learning_rate": 3.023666483780905e-05, - "loss": 0.0009, - "step": 629 - }, - { - "epoch": 3.247422680412371, - "grad_norm": 0.049209631979465485, - "learning_rate": 3.007994550768793e-05, - "loss": 0.0008, - "step": 630 - }, - { - "epoch": 3.252577319587629, - "grad_norm": 0.006166890263557434, - "learning_rate": 2.9923458458836258e-05, - "loss": 0.0002, - "step": 631 - }, - { - "epoch": 3.2577319587628866, - "grad_norm": 0.2379947453737259, - "learning_rate": 2.9767205515998518e-05, - "loss": 0.0016, - "step": 632 - }, - { - "epoch": 3.2628865979381443, - "grad_norm": 0.00971717108041048, - "learning_rate": 2.9611188501189435e-05, - "loss": 0.0002, - "step": 633 - }, - { - "epoch": 3.268041237113402, - "grad_norm": 0.003669175785034895, - "learning_rate": 2.9455409233672592e-05, - "loss": 0.0001, - "step": 634 - }, - { - "epoch": 3.2731958762886597, - "grad_norm": 0.006943593733012676, - "learning_rate": 2.929986952993933e-05, - "loss": 0.0002, - "step": 635 - }, - { - "epoch": 3.2783505154639174, - "grad_norm": 0.030062507838010788, - "learning_rate": 2.9144571203687476e-05, - "loss": 0.0005, - "step": 636 - }, - { - "epoch": 3.283505154639175, - "grad_norm": 0.009839212521910667, - "learning_rate": 2.8989516065800238e-05, - "loss": 0.0002, - "step": 637 - }, - { - "epoch": 3.288659793814433, - "grad_norm": 0.0119356419891119, - "learning_rate": 2.8834705924325118e-05, - "loss": 0.0002, - "step": 638 - }, - { - "epoch": 3.2938144329896906, - "grad_norm": 0.4045789837837219, - "learning_rate": 2.8680142584452742e-05, - "loss": 0.0032, - "step": 639 - }, - { - "epoch": 3.2989690721649483, - "grad_norm": 0.12698271870613098, - "learning_rate": 2.8525827848495913e-05, - "loss": 0.0016, - "step": 640 - }, - { - "epoch": 3.304123711340206, - "grad_norm": 0.45483699440956116, - "learning_rate": 2.83717635158685e-05, - "loss": 0.0057, - "step": 641 - }, - { - "epoch": 3.3092783505154637, - "grad_norm": 0.061637796461582184, - "learning_rate": 2.8217951383064544e-05, - "loss": 0.0012, - "step": 642 - }, - { - "epoch": 3.3144329896907214, - "grad_norm": 0.18873506784439087, - "learning_rate": 2.8064393243637222e-05, - "loss": 0.0018, - "step": 643 - }, - { - "epoch": 3.319587628865979, - "grad_norm": 0.013112235814332962, - "learning_rate": 2.791109088817803e-05, - "loss": 0.0003, - "step": 644 - }, - { - "epoch": 3.3247422680412373, - "grad_norm": 0.006930911913514137, - "learning_rate": 2.7758046104295797e-05, - "loss": 0.0002, - "step": 645 - }, - { - "epoch": 3.329896907216495, - "grad_norm": 0.017246872186660767, - "learning_rate": 2.760526067659591e-05, - "loss": 0.0003, - "step": 646 - }, - { - "epoch": 3.3350515463917527, - "grad_norm": 0.5544341206550598, - "learning_rate": 2.7452736386659516e-05, - "loss": 0.0042, - "step": 647 - }, - { - "epoch": 3.3402061855670104, - "grad_norm": 0.01653260365128517, - "learning_rate": 2.7300475013022663e-05, - "loss": 0.0003, - "step": 648 - }, - { - "epoch": 3.345360824742268, - "grad_norm": 0.011309400200843811, - "learning_rate": 2.7148478331155702e-05, - "loss": 0.0003, - "step": 649 - }, - { - "epoch": 3.350515463917526, - "grad_norm": 1.8607807159423828, - "learning_rate": 2.6996748113442394e-05, - "loss": 0.0126, - "step": 650 - }, - { - "epoch": 3.3556701030927836, - "grad_norm": 4.186022758483887, - "learning_rate": 2.6845286129159464e-05, - "loss": 0.0301, - "step": 651 - }, - { - "epoch": 3.3608247422680413, - "grad_norm": 1.2969677448272705, - "learning_rate": 2.669409414445574e-05, - "loss": 0.0136, - "step": 652 - }, - { - "epoch": 3.365979381443299, - "grad_norm": 0.030797425657510757, - "learning_rate": 2.6543173922331743e-05, - "loss": 0.0005, - "step": 653 - }, - { - "epoch": 3.3711340206185567, - "grad_norm": 0.16669800877571106, - "learning_rate": 2.639252722261908e-05, - "loss": 0.0015, - "step": 654 - }, - { - "epoch": 3.3762886597938144, - "grad_norm": 0.11004102230072021, - "learning_rate": 2.624215580195981e-05, - "loss": 0.001, - "step": 655 - }, - { - "epoch": 3.381443298969072, - "grad_norm": 1.4919167757034302, - "learning_rate": 2.6092061413786156e-05, - "loss": 0.0151, - "step": 656 - }, - { - "epoch": 3.38659793814433, - "grad_norm": 0.12221274524927139, - "learning_rate": 2.5942245808299886e-05, - "loss": 0.0011, - "step": 657 - }, - { - "epoch": 3.3917525773195876, - "grad_norm": 0.40452706813812256, - "learning_rate": 2.5792710732451997e-05, - "loss": 0.0028, - "step": 658 - }, - { - "epoch": 3.3969072164948453, - "grad_norm": 1.5829023122787476, - "learning_rate": 2.56434579299223e-05, - "loss": 0.0206, - "step": 659 - }, - { - "epoch": 3.402061855670103, - "grad_norm": 0.26589515805244446, - "learning_rate": 2.5494489141099153e-05, - "loss": 0.0018, - "step": 660 - }, - { - "epoch": 3.4072164948453607, - "grad_norm": 0.6049233078956604, - "learning_rate": 2.534580610305909e-05, - "loss": 0.0049, - "step": 661 - }, - { - "epoch": 3.4123711340206184, - "grad_norm": 0.012360837310552597, - "learning_rate": 2.5197410549546595e-05, - "loss": 0.0002, - "step": 662 - }, - { - "epoch": 3.417525773195876, - "grad_norm": 0.03558109700679779, - "learning_rate": 2.5049304210953933e-05, - "loss": 0.0004, - "step": 663 - }, - { - "epoch": 3.422680412371134, - "grad_norm": 0.03851904720067978, - "learning_rate": 2.4901488814300856e-05, - "loss": 0.0004, - "step": 664 - }, - { - "epoch": 3.4278350515463916, - "grad_norm": 0.41700494289398193, - "learning_rate": 2.4753966083214615e-05, - "loss": 0.0043, - "step": 665 - }, - { - "epoch": 3.4329896907216497, - "grad_norm": 0.0076299929060041904, - "learning_rate": 2.4606737737909697e-05, - "loss": 0.0002, - "step": 666 - }, - { - "epoch": 3.4381443298969074, - "grad_norm": 0.029439013451337814, - "learning_rate": 2.4459805495167942e-05, - "loss": 0.0005, - "step": 667 - }, - { - "epoch": 3.443298969072165, - "grad_norm": 0.20996803045272827, - "learning_rate": 2.4313171068318357e-05, - "loss": 0.0023, - "step": 668 - }, - { - "epoch": 3.448453608247423, - "grad_norm": 0.11522570997476578, - "learning_rate": 2.4166836167217283e-05, - "loss": 0.0013, - "step": 669 - }, - { - "epoch": 3.4536082474226806, - "grad_norm": 0.041188135743141174, - "learning_rate": 2.4020802498228335e-05, - "loss": 0.0004, - "step": 670 - }, - { - "epoch": 3.4587628865979383, - "grad_norm": 0.20725612342357635, - "learning_rate": 2.3875071764202563e-05, - "loss": 0.0017, - "step": 671 - }, - { - "epoch": 3.463917525773196, - "grad_norm": 0.8663135170936584, - "learning_rate": 2.3729645664458638e-05, - "loss": 0.0066, - "step": 672 - }, - { - "epoch": 3.4690721649484537, - "grad_norm": 0.36084750294685364, - "learning_rate": 2.3584525894762928e-05, - "loss": 0.002, - "step": 673 - }, - { - "epoch": 3.4742268041237114, - "grad_norm": 0.03471383452415466, - "learning_rate": 2.3439714147309845e-05, - "loss": 0.0005, - "step": 674 - }, - { - "epoch": 3.479381443298969, - "grad_norm": 0.008910657837986946, - "learning_rate": 2.329521211070199e-05, - "loss": 0.0002, - "step": 675 - }, - { - "epoch": 3.484536082474227, - "grad_norm": 0.06672267615795135, - "learning_rate": 2.3151021469930613e-05, - "loss": 0.0009, - "step": 676 - }, - { - "epoch": 3.4896907216494846, - "grad_norm": 0.05928796902298927, - "learning_rate": 2.3007143906355767e-05, - "loss": 0.0005, - "step": 677 - }, - { - "epoch": 3.4948453608247423, - "grad_norm": 0.22010110318660736, - "learning_rate": 2.2863581097686925e-05, - "loss": 0.0024, - "step": 678 - }, - { - "epoch": 3.5, - "grad_norm": 0.012560139410197735, - "learning_rate": 2.2720334717963222e-05, - "loss": 0.0002, - "step": 679 - }, - { - "epoch": 3.5, - "eval_loss": 0.10579764097929001, - "eval_runtime": 22.9584, - "eval_samples_per_second": 7.143, - "eval_steps_per_second": 1.786, - "step": 679 - }, - { - "epoch": 3.5051546391752577, - "grad_norm": 1.5844885110855103, - "learning_rate": 2.2577406437534054e-05, - "loss": 0.0165, - "step": 680 - }, - { - "epoch": 3.5103092783505154, - "grad_norm": 0.12943275272846222, - "learning_rate": 2.2434797923039598e-05, - "loss": 0.0012, - "step": 681 - }, - { - "epoch": 3.515463917525773, - "grad_norm": 0.41947558522224426, - "learning_rate": 2.2292510837391267e-05, - "loss": 0.0028, - "step": 682 - }, - { - "epoch": 3.520618556701031, - "grad_norm": 0.279430091381073, - "learning_rate": 2.2150546839752438e-05, - "loss": 0.0027, - "step": 683 - }, - { - "epoch": 3.5257731958762886, - "grad_norm": 0.07363631576299667, - "learning_rate": 2.2008907585519095e-05, - "loss": 0.0007, - "step": 684 - }, - { - "epoch": 3.5309278350515463, - "grad_norm": 0.28761395812034607, - "learning_rate": 2.186759472630045e-05, - "loss": 0.0024, - "step": 685 - }, - { - "epoch": 3.536082474226804, - "grad_norm": 0.013523326255381107, - "learning_rate": 2.172660990989971e-05, - "loss": 0.0003, - "step": 686 - }, - { - "epoch": 3.5412371134020617, - "grad_norm": 0.02186383679509163, - "learning_rate": 2.1585954780294947e-05, - "loss": 0.0004, - "step": 687 - }, - { - "epoch": 3.5463917525773194, - "grad_norm": 0.12467202544212341, - "learning_rate": 2.144563097761984e-05, - "loss": 0.0008, - "step": 688 - }, - { - "epoch": 3.551546391752577, - "grad_norm": 0.0455549992620945, - "learning_rate": 2.130564013814453e-05, - "loss": 0.0006, - "step": 689 - }, - { - "epoch": 3.556701030927835, - "grad_norm": 0.06408347189426422, - "learning_rate": 2.1165983894256647e-05, - "loss": 0.0009, - "step": 690 - }, - { - "epoch": 3.5618556701030926, - "grad_norm": 0.027774274349212646, - "learning_rate": 2.102666387444215e-05, - "loss": 0.0004, - "step": 691 - }, - { - "epoch": 3.5670103092783503, - "grad_norm": 1.0454899072647095, - "learning_rate": 2.0887681703266453e-05, - "loss": 0.007, - "step": 692 - }, - { - "epoch": 3.572164948453608, - "grad_norm": 0.023052293807268143, - "learning_rate": 2.0749039001355375e-05, - "loss": 0.0003, - "step": 693 - }, - { - "epoch": 3.5773195876288657, - "grad_norm": 0.05714479088783264, - "learning_rate": 2.061073738537635e-05, - "loss": 0.0007, - "step": 694 - }, - { - "epoch": 3.582474226804124, - "grad_norm": 0.008403950370848179, - "learning_rate": 2.0472778468019454e-05, - "loss": 0.0002, - "step": 695 - }, - { - "epoch": 3.5876288659793816, - "grad_norm": 0.46188488602638245, - "learning_rate": 2.0335163857978744e-05, - "loss": 0.0038, - "step": 696 - }, - { - "epoch": 3.5927835051546393, - "grad_norm": 0.008672907017171383, - "learning_rate": 2.019789515993336e-05, - "loss": 0.0002, - "step": 697 - }, - { - "epoch": 3.597938144329897, - "grad_norm": 0.6004514098167419, - "learning_rate": 2.0060973974528874e-05, - "loss": 0.0044, - "step": 698 - }, - { - "epoch": 3.6030927835051547, - "grad_norm": 0.049697019159793854, - "learning_rate": 1.992440189835869e-05, - "loss": 0.0007, - "step": 699 - }, - { - "epoch": 3.6082474226804124, - "grad_norm": 0.14393624663352966, - "learning_rate": 1.9788180523945277e-05, - "loss": 0.0016, - "step": 700 - }, - { - "epoch": 3.61340206185567, - "grad_norm": 0.005976496264338493, - "learning_rate": 1.9652311439721764e-05, - "loss": 0.0001, - "step": 701 - }, - { - "epoch": 3.618556701030928, - "grad_norm": 0.45061904191970825, - "learning_rate": 1.9516796230013272e-05, - "loss": 0.0033, - "step": 702 - }, - { - "epoch": 3.6237113402061856, - "grad_norm": 0.004622936248779297, - "learning_rate": 1.9381636475018577e-05, - "loss": 0.0001, - "step": 703 - }, - { - "epoch": 3.6288659793814433, - "grad_norm": 0.05081766098737717, - "learning_rate": 1.9246833750791526e-05, - "loss": 0.0004, - "step": 704 - }, - { - "epoch": 3.634020618556701, - "grad_norm": 0.09289523959159851, - "learning_rate": 1.9112389629222823e-05, - "loss": 0.0011, - "step": 705 - }, - { - "epoch": 3.6391752577319587, - "grad_norm": 0.030106987804174423, - "learning_rate": 1.8978305678021595e-05, - "loss": 0.0004, - "step": 706 - }, - { - "epoch": 3.6443298969072164, - "grad_norm": 0.2676849067211151, - "learning_rate": 1.884458346069713e-05, - "loss": 0.0024, - "step": 707 - }, - { - "epoch": 3.649484536082474, - "grad_norm": 0.011654643341898918, - "learning_rate": 1.8711224536540678e-05, - "loss": 0.0002, - "step": 708 - }, - { - "epoch": 3.654639175257732, - "grad_norm": 3.1462109088897705, - "learning_rate": 1.857823046060722e-05, - "loss": 0.0191, - "step": 709 - }, - { - "epoch": 3.6597938144329896, - "grad_norm": 0.6463388204574585, - "learning_rate": 1.8445602783697374e-05, - "loss": 0.0083, - "step": 710 - }, - { - "epoch": 3.6649484536082473, - "grad_norm": 0.005683832801878452, - "learning_rate": 1.831334305233928e-05, - "loss": 0.0001, - "step": 711 - }, - { - "epoch": 3.670103092783505, - "grad_norm": 0.14142107963562012, - "learning_rate": 1.8181452808770637e-05, - "loss": 0.0009, - "step": 712 - }, - { - "epoch": 3.675257731958763, - "grad_norm": 0.0627388134598732, - "learning_rate": 1.804993359092059e-05, - "loss": 0.0007, - "step": 713 - }, - { - "epoch": 3.680412371134021, - "grad_norm": 0.006492619402706623, - "learning_rate": 1.7918786932391944e-05, - "loss": 0.0001, - "step": 714 - }, - { - "epoch": 3.6855670103092786, - "grad_norm": 0.37058717012405396, - "learning_rate": 1.778801436244319e-05, - "loss": 0.0039, - "step": 715 - }, - { - "epoch": 3.6907216494845363, - "grad_norm": 0.02725241892039776, - "learning_rate": 1.765761740597065e-05, - "loss": 0.0004, - "step": 716 - }, - { - "epoch": 3.695876288659794, - "grad_norm": 0.5024199485778809, - "learning_rate": 1.7527597583490822e-05, - "loss": 0.0028, - "step": 717 - }, - { - "epoch": 3.7010309278350517, - "grad_norm": 0.28971970081329346, - "learning_rate": 1.739795641112248e-05, - "loss": 0.0029, - "step": 718 - }, - { - "epoch": 3.7061855670103094, - "grad_norm": 0.4896758198738098, - "learning_rate": 1.726869540056915e-05, - "loss": 0.0041, - "step": 719 - }, - { - "epoch": 3.711340206185567, - "grad_norm": 0.01545119471848011, - "learning_rate": 1.713981605910137e-05, - "loss": 0.0003, - "step": 720 - }, - { - "epoch": 3.716494845360825, - "grad_norm": 0.8276351094245911, - "learning_rate": 1.70113198895392e-05, - "loss": 0.0072, - "step": 721 - }, - { - "epoch": 3.7216494845360826, - "grad_norm": 0.008877018466591835, - "learning_rate": 1.6883208390234628e-05, - "loss": 0.0002, - "step": 722 - }, - { - "epoch": 3.7268041237113403, - "grad_norm": 0.011914421804249287, - "learning_rate": 1.6755483055054105e-05, - "loss": 0.0002, - "step": 723 - }, - { - "epoch": 3.731958762886598, - "grad_norm": 0.4500105082988739, - "learning_rate": 1.662814537336122e-05, - "loss": 0.0034, - "step": 724 - }, - { - "epoch": 3.7371134020618557, - "grad_norm": 0.022771602496504784, - "learning_rate": 1.650119682999918e-05, - "loss": 0.0003, - "step": 725 - }, - { - "epoch": 3.7422680412371134, - "grad_norm": 1.4144443273544312, - "learning_rate": 1.6374638905273643e-05, - "loss": 0.0103, - "step": 726 - }, - { - "epoch": 3.747422680412371, - "grad_norm": 0.09157131612300873, - "learning_rate": 1.624847307493534e-05, - "loss": 0.0013, - "step": 727 - }, - { - "epoch": 3.752577319587629, - "grad_norm": 0.09352636337280273, - "learning_rate": 1.6122700810162966e-05, - "loss": 0.0009, - "step": 728 - }, - { - "epoch": 3.7577319587628866, - "grad_norm": 0.02057919092476368, - "learning_rate": 1.5997323577545915e-05, - "loss": 0.0003, - "step": 729 - }, - { - "epoch": 3.7628865979381443, - "grad_norm": 0.053213931620121, - "learning_rate": 1.5872342839067306e-05, - "loss": 0.0004, - "step": 730 - }, - { - "epoch": 3.768041237113402, - "grad_norm": 0.21348951756954193, - "learning_rate": 1.5747760052086803e-05, - "loss": 0.0021, - "step": 731 - }, - { - "epoch": 3.7731958762886597, - "grad_norm": 0.03968895971775055, - "learning_rate": 1.5623576669323743e-05, - "loss": 0.0005, - "step": 732 - }, - { - "epoch": 3.7783505154639174, - "grad_norm": 0.00650789774954319, - "learning_rate": 1.5499794138840122e-05, - "loss": 0.0002, - "step": 733 - }, - { - "epoch": 3.783505154639175, - "grad_norm": 1.0022690296173096, - "learning_rate": 1.5376413904023722e-05, - "loss": 0.0081, - "step": 734 - }, - { - "epoch": 3.788659793814433, - "grad_norm": 1.8135733604431152, - "learning_rate": 1.525343740357128e-05, - "loss": 0.0138, - "step": 735 - }, - { - "epoch": 3.7938144329896906, - "grad_norm": 0.0444062165915966, - "learning_rate": 1.5130866071471717e-05, - "loss": 0.0007, - "step": 736 - }, - { - "epoch": 3.7989690721649483, - "grad_norm": 0.06379842758178711, - "learning_rate": 1.500870133698945e-05, - "loss": 0.0009, - "step": 737 - }, - { - "epoch": 3.804123711340206, - "grad_norm": 0.012018664740025997, - "learning_rate": 1.4886944624647647e-05, - "loss": 0.0002, - "step": 738 - }, - { - "epoch": 3.8092783505154637, - "grad_norm": 0.34600210189819336, - "learning_rate": 1.4765597354211713e-05, - "loss": 0.0027, - "step": 739 - }, - { - "epoch": 3.8144329896907214, - "grad_norm": 2.4469614028930664, - "learning_rate": 1.4644660940672627e-05, - "loss": 0.0204, - "step": 740 - }, - { - "epoch": 3.819587628865979, - "grad_norm": 0.017720194533467293, - "learning_rate": 1.4524136794230547e-05, - "loss": 0.0003, - "step": 741 - }, - { - "epoch": 3.824742268041237, - "grad_norm": 0.15242381393909454, - "learning_rate": 1.4404026320278318e-05, - "loss": 0.0011, - "step": 742 - }, - { - "epoch": 3.829896907216495, - "grad_norm": 0.06981607526540756, - "learning_rate": 1.4284330919385036e-05, - "loss": 0.0007, - "step": 743 - }, - { - "epoch": 3.8350515463917527, - "grad_norm": 0.08876276761293411, - "learning_rate": 1.4165051987279831e-05, - "loss": 0.0006, - "step": 744 - }, - { - "epoch": 3.8402061855670104, - "grad_norm": 0.17177937924861908, - "learning_rate": 1.404619091483546e-05, - "loss": 0.0013, - "step": 745 - }, - { - "epoch": 3.845360824742268, - "grad_norm": 0.033125944435596466, - "learning_rate": 1.3927749088052217e-05, - "loss": 0.0005, - "step": 746 - }, - { - "epoch": 3.850515463917526, - "grad_norm": 1.0394357442855835, - "learning_rate": 1.3809727888041668e-05, - "loss": 0.0086, - "step": 747 - }, - { - "epoch": 3.8556701030927836, - "grad_norm": 0.020910169929265976, - "learning_rate": 1.3692128691010592e-05, - "loss": 0.0003, - "step": 748 - }, - { - "epoch": 3.8608247422680413, - "grad_norm": 0.016304798424243927, - "learning_rate": 1.3574952868244922e-05, - "loss": 0.0003, - "step": 749 - }, - { - "epoch": 3.865979381443299, - "grad_norm": 1.6812098026275635, - "learning_rate": 1.3458201786093794e-05, - "loss": 0.0097, - "step": 750 - }, - { - "epoch": 3.8711340206185567, - "grad_norm": 0.09045127779245377, - "learning_rate": 1.334187680595358e-05, - "loss": 0.0009, - "step": 751 - }, - { - "epoch": 3.8762886597938144, - "grad_norm": 0.39485737681388855, - "learning_rate": 1.3225979284251954e-05, - "loss": 0.0032, - "step": 752 - }, - { - "epoch": 3.881443298969072, - "grad_norm": 1.2273677587509155, - "learning_rate": 1.3110510572432221e-05, - "loss": 0.0108, - "step": 753 - }, - { - "epoch": 3.88659793814433, - "grad_norm": 0.03808501362800598, - "learning_rate": 1.2995472016937404e-05, - "loss": 0.0003, - "step": 754 - }, - { - "epoch": 3.8917525773195876, - "grad_norm": 0.02039515972137451, - "learning_rate": 1.2880864959194665e-05, - "loss": 0.0003, - "step": 755 - }, - { - "epoch": 3.8969072164948453, - "grad_norm": 1.4002429246902466, - "learning_rate": 1.2766690735599568e-05, - "loss": 0.0142, - "step": 756 - }, - { - "epoch": 3.902061855670103, - "grad_norm": 0.06780578941106796, - "learning_rate": 1.2652950677500574e-05, - "loss": 0.0006, - "step": 757 - }, - { - "epoch": 3.9072164948453607, - "grad_norm": 0.00860658474266529, - "learning_rate": 1.253964611118345e-05, - "loss": 0.0002, - "step": 758 - }, - { - "epoch": 3.9123711340206184, - "grad_norm": 2.367659330368042, - "learning_rate": 1.2426778357855873e-05, - "loss": 0.024, - "step": 759 - }, - { - "epoch": 3.917525773195876, - "grad_norm": 0.3828847110271454, - "learning_rate": 1.2314348733631959e-05, - "loss": 0.0037, - "step": 760 - }, - { - "epoch": 3.9226804123711343, - "grad_norm": 0.0686371847987175, - "learning_rate": 1.2202358549516923e-05, - "loss": 0.0007, - "step": 761 - }, - { - "epoch": 3.927835051546392, - "grad_norm": 0.00528530590236187, - "learning_rate": 1.209080911139187e-05, - "loss": 0.0002, - "step": 762 - }, - { - "epoch": 3.9329896907216497, - "grad_norm": 0.10510650277137756, - "learning_rate": 1.1979701719998453e-05, - "loss": 0.0013, - "step": 763 - }, - { - "epoch": 3.9381443298969074, - "grad_norm": 0.019046945497393608, - "learning_rate": 1.1869037670923815e-05, - "loss": 0.0002, - "step": 764 - }, - { - "epoch": 3.943298969072165, - "grad_norm": 0.012646029703319073, - "learning_rate": 1.1758818254585369e-05, - "loss": 0.0002, - "step": 765 - }, - { - "epoch": 3.948453608247423, - "grad_norm": 2.9753146171569824, - "learning_rate": 1.164904475621587e-05, - "loss": 0.0265, - "step": 766 - }, - { - "epoch": 3.9536082474226806, - "grad_norm": 1.622678279876709, - "learning_rate": 1.1539718455848309e-05, - "loss": 0.0135, - "step": 767 - }, - { - "epoch": 3.9587628865979383, - "grad_norm": 0.0164541844278574, - "learning_rate": 1.1430840628301093e-05, - "loss": 0.0003, - "step": 768 - }, - { - "epoch": 3.963917525773196, - "grad_norm": 0.15376994013786316, - "learning_rate": 1.1322412543163135e-05, - "loss": 0.0013, - "step": 769 - }, - { - "epoch": 3.9690721649484537, - "grad_norm": 0.052942678332328796, - "learning_rate": 1.1214435464779006e-05, - "loss": 0.0005, - "step": 770 - }, - { - "epoch": 3.9742268041237114, - "grad_norm": 0.0726817324757576, - "learning_rate": 1.1106910652234276e-05, - "loss": 0.0007, - "step": 771 - }, - { - "epoch": 3.979381443298969, - "grad_norm": 0.014821125194430351, - "learning_rate": 1.099983935934077e-05, - "loss": 0.0002, - "step": 772 - }, - { - "epoch": 3.984536082474227, - "grad_norm": 0.014423374086618423, - "learning_rate": 1.089322283462197e-05, - "loss": 0.0003, - "step": 773 - }, - { - "epoch": 3.9896907216494846, - "grad_norm": 0.0970507562160492, - "learning_rate": 1.0787062321298442e-05, - "loss": 0.0009, - "step": 774 - }, - { - "epoch": 3.9948453608247423, - "grad_norm": 0.013409281149506569, - "learning_rate": 1.0681359057273388e-05, - "loss": 0.0003, - "step": 775 - }, - { - "epoch": 4.0, - "grad_norm": 0.8209367394447327, - "learning_rate": 1.0576114275118131e-05, - "loss": 0.0061, - "step": 776 - }, - { - "epoch": 4.0, - "eval_loss": 0.10921349376440048, - "eval_runtime": 22.9078, - "eval_samples_per_second": 7.159, - "eval_steps_per_second": 1.79, - "step": 776 - }, - { - "epoch": 4.005154639175258, - "grad_norm": 0.08000417053699493, - "learning_rate": 1.0471329202057823e-05, - "loss": 0.0008, - "step": 777 - }, - { - "epoch": 4.010309278350515, - "grad_norm": 0.05137176439166069, - "learning_rate": 1.0367005059957096e-05, - "loss": 0.0006, - "step": 778 - }, - { - "epoch": 4.015463917525773, - "grad_norm": 0.006002445705235004, - "learning_rate": 1.0263143065305769e-05, - "loss": 0.0001, - "step": 779 - }, - { - "epoch": 4.020618556701031, - "grad_norm": 0.00495640654116869, - "learning_rate": 1.0159744429204777e-05, - "loss": 0.0001, - "step": 780 - }, - { - "epoch": 4.025773195876289, - "grad_norm": 0.23104587197303772, - "learning_rate": 1.005681035735192e-05, - "loss": 0.002, - "step": 781 - }, - { - "epoch": 4.030927835051546, - "grad_norm": 0.018206927925348282, - "learning_rate": 9.954342050027921e-06, - "loss": 0.0002, - "step": 782 - }, - { - "epoch": 4.036082474226804, - "grad_norm": 0.03753966838121414, - "learning_rate": 9.852340702082318e-06, - "loss": 0.0004, - "step": 783 - }, - { - "epoch": 4.041237113402062, - "grad_norm": 0.03190065175294876, - "learning_rate": 9.750807502919652e-06, - "loss": 0.0004, - "step": 784 - }, - { - "epoch": 4.046391752577319, - "grad_norm": 0.029177363961935043, - "learning_rate": 9.64974363648548e-06, - "loss": 0.0003, - "step": 785 - }, - { - "epoch": 4.051546391752577, - "grad_norm": 0.013272424228489399, - "learning_rate": 9.549150281252633e-06, - "loss": 0.0002, - "step": 786 - }, - { - "epoch": 4.056701030927835, - "grad_norm": 0.11541124433279037, - "learning_rate": 9.449028610207494e-06, - "loss": 0.0013, - "step": 787 - }, - { - "epoch": 4.061855670103093, - "grad_norm": 0.07466880977153778, - "learning_rate": 9.349379790836243e-06, - "loss": 0.0006, - "step": 788 - }, - { - "epoch": 4.06701030927835, - "grad_norm": 0.01846369542181492, - "learning_rate": 9.25020498511135e-06, - "loss": 0.0003, - "step": 789 - }, - { - "epoch": 4.072164948453608, - "grad_norm": 0.053815506398677826, - "learning_rate": 9.151505349477902e-06, - "loss": 0.0006, - "step": 790 - }, - { - "epoch": 4.077319587628866, - "grad_norm": 0.011579618789255619, - "learning_rate": 9.053282034840238e-06, - "loss": 0.0002, - "step": 791 - }, - { - "epoch": 4.082474226804123, - "grad_norm": 0.007092161104083061, - "learning_rate": 8.955536186548425e-06, - "loss": 0.0001, - "step": 792 - }, - { - "epoch": 4.087628865979381, - "grad_norm": 0.047016046941280365, - "learning_rate": 8.858268944384995e-06, - "loss": 0.0006, - "step": 793 - }, - { - "epoch": 4.092783505154639, - "grad_norm": 0.08547954261302948, - "learning_rate": 8.761481442551573e-06, - "loss": 0.0008, - "step": 794 - }, - { - "epoch": 4.097938144329897, - "grad_norm": 0.008324719965457916, - "learning_rate": 8.665174809655708e-06, - "loss": 0.0002, - "step": 795 - }, - { - "epoch": 4.103092783505154, - "grad_norm": 0.1423901468515396, - "learning_rate": 8.569350168697704e-06, - "loss": 0.0012, - "step": 796 - }, - { - "epoch": 4.108247422680412, - "grad_norm": 0.13605594635009766, - "learning_rate": 8.474008637057478e-06, - "loss": 0.0008, - "step": 797 - }, - { - "epoch": 4.11340206185567, - "grad_norm": 0.01406699325889349, - "learning_rate": 8.379151326481587e-06, - "loss": 0.0002, - "step": 798 - }, - { - "epoch": 4.118556701030927, - "grad_norm": 0.023498743772506714, - "learning_rate": 8.284779343070265e-06, - "loss": 0.0003, - "step": 799 - }, - { - "epoch": 4.123711340206185, - "grad_norm": 0.3762100040912628, - "learning_rate": 8.19089378726447e-06, - "loss": 0.0023, - "step": 800 - }, - { - "epoch": 4.128865979381443, - "grad_norm": 0.007977807894349098, - "learning_rate": 8.097495753833078e-06, - "loss": 0.0002, - "step": 801 - }, - { - "epoch": 4.134020618556701, - "grad_norm": 0.02798437513411045, - "learning_rate": 8.004586331860175e-06, - "loss": 0.0003, - "step": 802 - }, - { - "epoch": 4.139175257731959, - "grad_norm": 0.013621841557323933, - "learning_rate": 7.91216660473228e-06, - "loss": 0.0003, - "step": 803 - }, - { - "epoch": 4.144329896907217, - "grad_norm": 0.010507001541554928, - "learning_rate": 7.820237650125712e-06, - "loss": 0.0002, - "step": 804 - }, - { - "epoch": 4.149484536082475, - "grad_norm": 0.06786844879388809, - "learning_rate": 7.728800539994113e-06, - "loss": 0.0006, - "step": 805 - }, - { - "epoch": 4.154639175257732, - "grad_norm": 0.02216934785246849, - "learning_rate": 7.637856340555822e-06, - "loss": 0.0003, - "step": 806 - }, - { - "epoch": 4.15979381443299, - "grad_norm": 0.021773718297481537, - "learning_rate": 7.547406112281557e-06, - "loss": 0.0003, - "step": 807 - }, - { - "epoch": 4.164948453608248, - "grad_norm": 0.005113635677844286, - "learning_rate": 7.457450909881969e-06, - "loss": 0.0001, - "step": 808 - }, - { - "epoch": 4.170103092783505, - "grad_norm": 0.027230041101574898, - "learning_rate": 7.367991782295391e-06, - "loss": 0.0002, - "step": 809 - }, - { - "epoch": 4.175257731958763, - "grad_norm": 0.12998707592487335, - "learning_rate": 7.2790297726755716e-06, - "loss": 0.0013, - "step": 810 - }, - { - "epoch": 4.180412371134021, - "grad_norm": 0.26570916175842285, - "learning_rate": 7.190565918379549e-06, - "loss": 0.0018, - "step": 811 - }, - { - "epoch": 4.185567010309279, - "grad_norm": 0.014305188320577145, - "learning_rate": 7.1026012509555265e-06, - "loss": 0.0002, - "step": 812 - }, - { - "epoch": 4.190721649484536, - "grad_norm": 0.014115352183580399, - "learning_rate": 7.015136796130828e-06, - "loss": 0.0003, - "step": 813 - }, - { - "epoch": 4.195876288659794, - "grad_norm": 0.06203961372375488, - "learning_rate": 6.928173573800006e-06, - "loss": 0.0007, - "step": 814 - }, - { - "epoch": 4.201030927835052, - "grad_norm": 0.594573438167572, - "learning_rate": 6.8417125980128675e-06, - "loss": 0.0048, - "step": 815 - }, - { - "epoch": 4.206185567010309, - "grad_norm": 0.1779005527496338, - "learning_rate": 6.755754876962711e-06, - "loss": 0.0013, - "step": 816 - }, - { - "epoch": 4.211340206185567, - "grad_norm": 0.005702666938304901, - "learning_rate": 6.670301412974511e-06, - "loss": 0.0002, - "step": 817 - }, - { - "epoch": 4.216494845360825, - "grad_norm": 0.00647127116099, - "learning_rate": 6.585353202493322e-06, - "loss": 0.0002, - "step": 818 - }, - { - "epoch": 4.221649484536083, - "grad_norm": 0.04004105553030968, - "learning_rate": 6.500911236072532e-06, - "loss": 0.0006, - "step": 819 - }, - { - "epoch": 4.22680412371134, - "grad_norm": 0.019848493859171867, - "learning_rate": 6.416976498362432e-06, - "loss": 0.0002, - "step": 820 - }, - { - "epoch": 4.231958762886598, - "grad_norm": 0.009120230562984943, - "learning_rate": 6.333549968098684e-06, - "loss": 0.0002, - "step": 821 - }, - { - "epoch": 4.237113402061856, - "grad_norm": 0.010857068933546543, - "learning_rate": 6.250632618090868e-06, - "loss": 0.0002, - "step": 822 - }, - { - "epoch": 4.242268041237113, - "grad_norm": 0.010758602060377598, - "learning_rate": 6.168225415211226e-06, - "loss": 0.0002, - "step": 823 - }, - { - "epoch": 4.247422680412371, - "grad_norm": 0.033467814326286316, - "learning_rate": 6.0863293203833105e-06, - "loss": 0.0004, - "step": 824 - }, - { - "epoch": 4.252577319587629, - "grad_norm": 0.008194089867174625, - "learning_rate": 6.004945288570813e-06, - "loss": 0.0002, - "step": 825 - }, - { - "epoch": 4.257731958762887, - "grad_norm": 0.04414292797446251, - "learning_rate": 5.924074268766422e-06, - "loss": 0.0004, - "step": 826 - }, - { - "epoch": 4.262886597938144, - "grad_norm": 0.041067712008953094, - "learning_rate": 5.843717203980792e-06, - "loss": 0.0004, - "step": 827 - }, - { - "epoch": 4.268041237113402, - "grad_norm": 0.006359627936035395, - "learning_rate": 5.763875031231464e-06, - "loss": 0.0001, - "step": 828 - }, - { - "epoch": 4.27319587628866, - "grad_norm": 0.016976401209831238, - "learning_rate": 5.684548681532032e-06, - "loss": 0.0003, - "step": 829 - }, - { - "epoch": 4.278350515463917, - "grad_norm": 0.00730324350297451, - "learning_rate": 5.605739079881239e-06, - "loss": 0.0002, - "step": 830 - }, - { - "epoch": 4.283505154639175, - "grad_norm": 0.013788518495857716, - "learning_rate": 5.527447145252174e-06, - "loss": 0.0003, - "step": 831 - }, - { - "epoch": 4.288659793814433, - "grad_norm": 0.1177946925163269, - "learning_rate": 5.449673790581611e-06, - "loss": 0.0014, - "step": 832 - }, - { - "epoch": 4.293814432989691, - "grad_norm": 0.011995314620435238, - "learning_rate": 5.372419922759292e-06, - "loss": 0.0002, - "step": 833 - }, - { - "epoch": 4.298969072164948, - "grad_norm": 0.07039642333984375, - "learning_rate": 5.295686442617443e-06, - "loss": 0.001, - "step": 834 - }, - { - "epoch": 4.304123711340206, - "grad_norm": 0.05830349028110504, - "learning_rate": 5.219474244920164e-06, - "loss": 0.0005, - "step": 835 - }, - { - "epoch": 4.309278350515464, - "grad_norm": 0.05796303227543831, - "learning_rate": 5.143784218353103e-06, - "loss": 0.0009, - "step": 836 - }, - { - "epoch": 4.314432989690721, - "grad_norm": 0.01927541196346283, - "learning_rate": 5.068617245513008e-06, - "loss": 0.0002, - "step": 837 - }, - { - "epoch": 4.319587628865979, - "grad_norm": 0.04202926531434059, - "learning_rate": 4.993974202897455e-06, - "loss": 0.0004, - "step": 838 - }, - { - "epoch": 4.324742268041237, - "grad_norm": 0.6382042169570923, - "learning_rate": 4.9198559608946815e-06, - "loss": 0.0048, - "step": 839 - }, - { - "epoch": 4.329896907216495, - "grad_norm": 0.008230258710682392, - "learning_rate": 4.846263383773364e-06, - "loss": 0.0002, - "step": 840 - }, - { - "epoch": 4.335051546391752, - "grad_norm": 0.0035637454129755497, - "learning_rate": 4.773197329672596e-06, - "loss": 0.0001, - "step": 841 - }, - { - "epoch": 4.34020618556701, - "grad_norm": 0.10410863161087036, - "learning_rate": 4.700658650591827e-06, - "loss": 0.001, - "step": 842 - }, - { - "epoch": 4.345360824742268, - "grad_norm": 0.038275450468063354, - "learning_rate": 4.628648192380986e-06, - "loss": 0.0005, - "step": 843 - }, - { - "epoch": 4.350515463917525, - "grad_norm": 0.006225410848855972, - "learning_rate": 4.557166794730572e-06, - "loss": 0.0002, - "step": 844 - }, - { - "epoch": 4.355670103092783, - "grad_norm": 0.02506340481340885, - "learning_rate": 4.4862152911618934e-06, - "loss": 0.0003, - "step": 845 - }, - { - "epoch": 4.360824742268041, - "grad_norm": 0.03422137349843979, - "learning_rate": 4.415794509017329e-06, - "loss": 0.0004, - "step": 846 - }, - { - "epoch": 4.365979381443299, - "grad_norm": 0.0062943557277321815, - "learning_rate": 4.34590526945069e-06, - "loss": 0.0001, - "step": 847 - }, - { - "epoch": 4.371134020618557, - "grad_norm": 0.006638492923229933, - "learning_rate": 4.276548387417656e-06, - "loss": 0.0002, - "step": 848 - }, - { - "epoch": 4.376288659793815, - "grad_norm": 0.17980645596981049, - "learning_rate": 4.20772467166623e-06, - "loss": 0.0017, - "step": 849 - }, - { - "epoch": 4.381443298969073, - "grad_norm": 0.04007335379719734, - "learning_rate": 4.139434924727359e-06, - "loss": 0.0006, - "step": 850 - }, - { - "epoch": 4.38659793814433, - "grad_norm": 0.06706022471189499, - "learning_rate": 4.071679942905532e-06, - "loss": 0.0007, - "step": 851 - }, - { - "epoch": 4.391752577319588, - "grad_norm": 0.2766573131084442, - "learning_rate": 4.004460516269554e-06, - "loss": 0.0013, - "step": 852 - }, - { - "epoch": 4.396907216494846, - "grad_norm": 0.029405513778328896, - "learning_rate": 3.937777428643253e-06, - "loss": 0.0004, - "step": 853 - }, - { - "epoch": 4.402061855670103, - "grad_norm": 0.031775474548339844, - "learning_rate": 3.87163145759642e-06, - "loss": 0.0005, - "step": 854 - }, - { - "epoch": 4.407216494845361, - "grad_norm": 0.038619689643383026, - "learning_rate": 3.8060233744356633e-06, - "loss": 0.0004, - "step": 855 - }, - { - "epoch": 4.412371134020619, - "grad_norm": 0.01843906380236149, - "learning_rate": 3.7409539441954965e-06, - "loss": 0.0003, - "step": 856 - }, - { - "epoch": 4.417525773195877, - "grad_norm": 0.005279079079627991, - "learning_rate": 3.6764239256293577e-06, - "loss": 0.0001, - "step": 857 - }, - { - "epoch": 4.422680412371134, - "grad_norm": 0.006745612248778343, - "learning_rate": 3.612434071200771e-06, - "loss": 0.0002, - "step": 858 - }, - { - "epoch": 4.427835051546392, - "grad_norm": 0.3258199691772461, - "learning_rate": 3.548985127074611e-06, - "loss": 0.0014, - "step": 859 - }, - { - "epoch": 4.43298969072165, - "grad_norm": 0.017199253663420677, - "learning_rate": 3.486077833108342e-06, - "loss": 0.0003, - "step": 860 - }, - { - "epoch": 4.438144329896907, - "grad_norm": 0.003292699111625552, - "learning_rate": 3.4237129228434415e-06, - "loss": 0.0001, - "step": 861 - }, - { - "epoch": 4.443298969072165, - "grad_norm": 0.13467860221862793, - "learning_rate": 3.3618911234968243e-06, - "loss": 0.0012, - "step": 862 - }, - { - "epoch": 4.448453608247423, - "grad_norm": 0.006807579658925533, - "learning_rate": 3.300613155952359e-06, - "loss": 0.0001, - "step": 863 - }, - { - "epoch": 4.453608247422681, - "grad_norm": 0.04459763318300247, - "learning_rate": 3.2398797347524656e-06, - "loss": 0.0004, - "step": 864 - }, - { - "epoch": 4.458762886597938, - "grad_norm": 0.005752299912273884, - "learning_rate": 3.1796915680897988e-06, - "loss": 0.0002, - "step": 865 } ], "logging_steps": 1, - "max_steps": 970, + "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5, @@ -6153,7 +209,7 @@ "attributes": {} } }, - "total_flos": 3.345416922983301e+17, + "total_flos": 2.79484292923392e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null