diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,5859 +1,265 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.252577319587629, + "epoch": 0.18041237113402062, "eval_steps": 97, - "global_step": 825, + "global_step": 35, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005154639175257732, - "grad_norm": 0.8410804867744446, + "grad_norm": 14.970044136047363, "learning_rate": 2.0000000000000003e-06, - "loss": 0.9672, + "loss": 9.1577, "step": 1 }, { "epoch": 0.005154639175257732, - "eval_loss": 0.986353874206543, - "eval_runtime": 22.7571, - "eval_samples_per_second": 7.207, - "eval_steps_per_second": 1.802, + "eval_loss": 9.00115966796875, + "eval_runtime": 22.9959, + "eval_samples_per_second": 7.132, + "eval_steps_per_second": 1.783, "step": 1 }, { "epoch": 0.010309278350515464, - "grad_norm": 0.7938956618309021, + "grad_norm": 15.33629035949707, "learning_rate": 4.000000000000001e-06, - "loss": 1.0214, + "loss": 9.2777, "step": 2 }, { "epoch": 0.015463917525773196, - "grad_norm": 0.705802321434021, + "grad_norm": 16.92699432373047, "learning_rate": 6e-06, - "loss": 0.8948, + "loss": 8.744, "step": 3 }, { "epoch": 0.020618556701030927, - "grad_norm": 0.8201313614845276, + "grad_norm": 19.3622989654541, "learning_rate": 8.000000000000001e-06, - "loss": 0.9046, + "loss": 8.545, "step": 4 }, { "epoch": 0.02577319587628866, - "grad_norm": 1.1557425260543823, + "grad_norm": 20.165861129760742, "learning_rate": 1e-05, - "loss": 1.0693, + "loss": 9.7134, "step": 5 }, { "epoch": 0.030927835051546393, - "grad_norm": 1.263604998588562, + "grad_norm": 18.959148406982422, "learning_rate": 1.2e-05, - "loss": 1.1302, + "loss": 9.7526, "step": 6 }, { "epoch": 0.03608247422680412, - "grad_norm": 0.7817522883415222, + "grad_norm": 16.62849235534668, "learning_rate": 1.4000000000000001e-05, - "loss": 0.9247, + "loss": 8.9248, "step": 7 }, { "epoch": 0.041237113402061855, - "grad_norm": 0.6655614376068115, + "grad_norm": 19.083742141723633, "learning_rate": 1.6000000000000003e-05, - "loss": 0.6942, + "loss": 9.0543, "step": 8 }, { "epoch": 0.04639175257731959, - "grad_norm": 0.9141671061515808, + "grad_norm": 16.922298431396484, "learning_rate": 1.8e-05, - "loss": 0.8889, + "loss": 8.7968, "step": 9 }, { "epoch": 0.05154639175257732, - "grad_norm": 0.8002082109451294, + "grad_norm": 15.013986587524414, "learning_rate": 2e-05, - "loss": 0.9334, + "loss": 8.8618, "step": 10 }, { "epoch": 0.05670103092783505, - "grad_norm": 1.0407661199569702, + "grad_norm": 16.929433822631836, "learning_rate": 2.2000000000000003e-05, - "loss": 0.9861, + "loss": 8.3198, "step": 11 }, { "epoch": 0.061855670103092786, - "grad_norm": 0.7072747945785522, + "grad_norm": 14.889703750610352, "learning_rate": 2.4e-05, - "loss": 1.0617, + "loss": 8.1047, "step": 12 }, { "epoch": 0.06701030927835051, - "grad_norm": 0.6469315886497498, + "grad_norm": 15.865534782409668, "learning_rate": 2.6000000000000002e-05, - "loss": 0.846, + "loss": 7.6863, "step": 13 }, { "epoch": 0.07216494845360824, - "grad_norm": 0.7403808236122131, + "grad_norm": 15.19003963470459, "learning_rate": 2.8000000000000003e-05, - "loss": 0.8555, + "loss": 7.779, "step": 14 }, { "epoch": 0.07731958762886598, - "grad_norm": 0.7198350429534912, + "grad_norm": 17.154193878173828, "learning_rate": 3e-05, - "loss": 0.9019, + "loss": 7.7273, "step": 15 }, { "epoch": 0.08247422680412371, - "grad_norm": 0.5983201265335083, + "grad_norm": 14.37552547454834, "learning_rate": 3.2000000000000005e-05, - "loss": 0.9096, + "loss": 7.1202, "step": 16 }, { "epoch": 0.08762886597938144, - "grad_norm": 0.6279081702232361, + "grad_norm": 13.732095718383789, "learning_rate": 3.4000000000000007e-05, - "loss": 0.945, + "loss": 6.9298, "step": 17 }, { "epoch": 0.09278350515463918, - "grad_norm": 0.4670332670211792, + "grad_norm": 13.2391357421875, "learning_rate": 3.6e-05, - "loss": 0.8456, + "loss": 6.308, "step": 18 }, { "epoch": 0.0979381443298969, - "grad_norm": 0.4307556748390198, + "grad_norm": 11.839639663696289, "learning_rate": 3.8e-05, - "loss": 0.8309, + "loss": 6.1335, "step": 19 }, { "epoch": 0.10309278350515463, - "grad_norm": 0.6864880919456482, + "grad_norm": 11.819024085998535, "learning_rate": 4e-05, - "loss": 1.1069, + "loss": 5.7396, "step": 20 }, { "epoch": 0.10824742268041238, - "grad_norm": 0.3861861526966095, + "grad_norm": 11.764089584350586, "learning_rate": 4.2e-05, - "loss": 0.864, + "loss": 5.1633, "step": 21 }, { "epoch": 0.1134020618556701, - "grad_norm": 0.5285366773605347, + "grad_norm": 12.526201248168945, "learning_rate": 4.4000000000000006e-05, - "loss": 0.8627, + "loss": 4.7586, "step": 22 }, { "epoch": 0.11855670103092783, - "grad_norm": 0.39096203446388245, + "grad_norm": 10.944100379943848, "learning_rate": 4.600000000000001e-05, - "loss": 0.8984, + "loss": 4.47, "step": 23 }, { "epoch": 0.12371134020618557, - "grad_norm": 0.39784133434295654, + "grad_norm": 14.008569717407227, "learning_rate": 4.8e-05, - "loss": 0.9631, + "loss": 4.2032, "step": 24 }, { "epoch": 0.12886597938144329, - "grad_norm": 0.37275853753089905, + "grad_norm": 14.705824851989746, "learning_rate": 5e-05, - "loss": 0.831, + "loss": 3.2385, "step": 25 }, { "epoch": 0.13402061855670103, - "grad_norm": 0.4517609179019928, + "grad_norm": 16.897640228271484, "learning_rate": 5.2000000000000004e-05, - "loss": 0.8904, + "loss": 3.0181, "step": 26 }, { "epoch": 0.13917525773195877, - "grad_norm": 0.4948658347129822, + "grad_norm": 15.038681983947754, "learning_rate": 5.4000000000000005e-05, - "loss": 0.8942, + "loss": 2.171, "step": 27 }, { "epoch": 0.14432989690721648, - "grad_norm": 0.43722641468048096, + "grad_norm": 15.12243938446045, "learning_rate": 5.6000000000000006e-05, - "loss": 0.8707, + "loss": 2.1848, "step": 28 }, { "epoch": 0.14948453608247422, - "grad_norm": 0.36861544847488403, + "grad_norm": 12.384464263916016, "learning_rate": 5.8e-05, - "loss": 0.7922, + "loss": 1.6113, "step": 29 }, { "epoch": 0.15463917525773196, - "grad_norm": 0.5646365880966187, + "grad_norm": 10.626302719116211, "learning_rate": 6e-05, - "loss": 0.9966, + "loss": 0.8058, "step": 30 }, { "epoch": 0.15979381443298968, - "grad_norm": 0.32113298773765564, + "grad_norm": 10.231700897216797, "learning_rate": 6.2e-05, - "loss": 0.6493, + "loss": 1.1714, "step": 31 }, { "epoch": 0.16494845360824742, - "grad_norm": 0.5166299939155579, + "grad_norm": 11.243683815002441, "learning_rate": 6.400000000000001e-05, - "loss": 0.9284, + "loss": 0.8837, "step": 32 }, { "epoch": 0.17010309278350516, - "grad_norm": 0.47344839572906494, + "grad_norm": 13.52505874633789, "learning_rate": 6.6e-05, - "loss": 0.718, + "loss": 0.9935, "step": 33 }, { "epoch": 0.17525773195876287, - "grad_norm": 0.4184200167655945, + "grad_norm": 11.277730941772461, "learning_rate": 6.800000000000001e-05, - "loss": 0.8632, + "loss": 0.7997, "step": 34 }, { "epoch": 0.18041237113402062, - "grad_norm": 0.5064208507537842, + "grad_norm": 8.148871421813965, "learning_rate": 7e-05, - "loss": 0.8827, + "loss": 0.5211, "step": 35 - }, - { - "epoch": 0.18556701030927836, - "grad_norm": 0.3908073902130127, - "learning_rate": 7.2e-05, - "loss": 0.7842, - "step": 36 - }, - { - "epoch": 0.19072164948453607, - "grad_norm": 0.4032430648803711, - "learning_rate": 7.4e-05, - "loss": 0.711, - "step": 37 - }, - { - "epoch": 0.1958762886597938, - "grad_norm": 0.44231921434402466, - "learning_rate": 7.6e-05, - "loss": 0.755, - "step": 38 - }, - { - "epoch": 0.20103092783505155, - "grad_norm": 0.4277024269104004, - "learning_rate": 7.800000000000001e-05, - "loss": 0.9275, - "step": 39 - }, - { - "epoch": 0.20618556701030927, - "grad_norm": 0.417645663022995, - "learning_rate": 8e-05, - "loss": 0.7142, - "step": 40 - }, - { - "epoch": 0.211340206185567, - "grad_norm": 0.5510072708129883, - "learning_rate": 8.2e-05, - "loss": 0.8196, - "step": 41 - }, - { - "epoch": 0.21649484536082475, - "grad_norm": 0.42206454277038574, - "learning_rate": 8.4e-05, - "loss": 0.8858, - "step": 42 - }, - { - "epoch": 0.22164948453608246, - "grad_norm": 0.4281873106956482, - "learning_rate": 8.6e-05, - "loss": 0.9484, - "step": 43 - }, - { - "epoch": 0.2268041237113402, - "grad_norm": 0.36399605870246887, - "learning_rate": 8.800000000000001e-05, - "loss": 0.662, - "step": 44 - }, - { - "epoch": 0.23195876288659795, - "grad_norm": 0.43412840366363525, - "learning_rate": 9e-05, - "loss": 0.7535, - "step": 45 - }, - { - "epoch": 0.23711340206185566, - "grad_norm": 0.5094694495201111, - "learning_rate": 9.200000000000001e-05, - "loss": 1.104, - "step": 46 - }, - { - "epoch": 0.2422680412371134, - "grad_norm": 0.5075117945671082, - "learning_rate": 9.4e-05, - "loss": 0.8194, - "step": 47 - }, - { - "epoch": 0.24742268041237114, - "grad_norm": 0.4164354205131531, - "learning_rate": 9.6e-05, - "loss": 0.8732, - "step": 48 - }, - { - "epoch": 0.25257731958762886, - "grad_norm": 0.4368288218975067, - "learning_rate": 9.8e-05, - "loss": 0.9469, - "step": 49 - }, - { - "epoch": 0.25773195876288657, - "grad_norm": 0.4410831928253174, - "learning_rate": 0.0001, - "loss": 0.9754, - "step": 50 - }, - { - "epoch": 0.26288659793814434, - "grad_norm": 0.3590613603591919, - "learning_rate": 9.999970848314005e-05, - "loss": 0.6932, - "step": 51 - }, - { - "epoch": 0.26804123711340205, - "grad_norm": 0.4591519832611084, - "learning_rate": 9.999883393595947e-05, - "loss": 0.8374, - "step": 52 - }, - { - "epoch": 0.27319587628865977, - "grad_norm": 0.3702682852745056, - "learning_rate": 9.999737636865609e-05, - "loss": 0.73, - "step": 53 - }, - { - "epoch": 0.27835051546391754, - "grad_norm": 0.42546412348747253, - "learning_rate": 9.99953357982261e-05, - "loss": 0.7748, - "step": 54 - }, - { - "epoch": 0.28350515463917525, - "grad_norm": 0.4516392946243286, - "learning_rate": 9.999271224846396e-05, - "loss": 0.8051, - "step": 55 - }, - { - "epoch": 0.28865979381443296, - "grad_norm": 0.34078386425971985, - "learning_rate": 9.998950574996199e-05, - "loss": 0.6558, - "step": 56 - }, - { - "epoch": 0.29381443298969073, - "grad_norm": 0.5125095844268799, - "learning_rate": 9.998571634011015e-05, - "loss": 0.9307, - "step": 57 - }, - { - "epoch": 0.29896907216494845, - "grad_norm": 0.3470001816749573, - "learning_rate": 9.998134406309554e-05, - "loss": 0.7473, - "step": 58 - }, - { - "epoch": 0.30412371134020616, - "grad_norm": 0.3486801087856293, - "learning_rate": 9.99763889699018e-05, - "loss": 0.6796, - "step": 59 - }, - { - "epoch": 0.30927835051546393, - "grad_norm": 0.4312107563018799, - "learning_rate": 9.99708511183087e-05, - "loss": 0.8765, - "step": 60 - }, - { - "epoch": 0.31443298969072164, - "grad_norm": 0.3910192847251892, - "learning_rate": 9.996473057289132e-05, - "loss": 0.849, - "step": 61 - }, - { - "epoch": 0.31958762886597936, - "grad_norm": 0.4039042890071869, - "learning_rate": 9.995802740501933e-05, - "loss": 0.8548, - "step": 62 - }, - { - "epoch": 0.3247422680412371, - "grad_norm": 0.4193078577518463, - "learning_rate": 9.99507416928562e-05, - "loss": 0.7689, - "step": 63 - }, - { - "epoch": 0.32989690721649484, - "grad_norm": 0.36549004912376404, - "learning_rate": 9.994287352135825e-05, - "loss": 0.8877, - "step": 64 - }, - { - "epoch": 0.33505154639175255, - "grad_norm": 0.3615829348564148, - "learning_rate": 9.993442298227365e-05, - "loss": 0.8102, - "step": 65 - }, - { - "epoch": 0.3402061855670103, - "grad_norm": 0.42232227325439453, - "learning_rate": 9.99253901741414e-05, - "loss": 0.7458, - "step": 66 - }, - { - "epoch": 0.34536082474226804, - "grad_norm": 0.38672903180122375, - "learning_rate": 9.991577520229014e-05, - "loss": 0.923, - "step": 67 - }, - { - "epoch": 0.35051546391752575, - "grad_norm": 0.3575025200843811, - "learning_rate": 9.99055781788369e-05, - "loss": 0.7854, - "step": 68 - }, - { - "epoch": 0.3556701030927835, - "grad_norm": 0.37855708599090576, - "learning_rate": 9.989479922268588e-05, - "loss": 0.6534, - "step": 69 - }, - { - "epoch": 0.36082474226804123, - "grad_norm": 0.3880573809146881, - "learning_rate": 9.988343845952697e-05, - "loss": 0.8377, - "step": 70 - }, - { - "epoch": 0.36597938144329895, - "grad_norm": 0.41330426931381226, - "learning_rate": 9.98714960218343e-05, - "loss": 0.8176, - "step": 71 - }, - { - "epoch": 0.3711340206185567, - "grad_norm": 0.3920232057571411, - "learning_rate": 9.985897204886481e-05, - "loss": 0.7983, - "step": 72 - }, - { - "epoch": 0.37628865979381443, - "grad_norm": 0.4510120451450348, - "learning_rate": 9.98458666866564e-05, - "loss": 0.8814, - "step": 73 - }, - { - "epoch": 0.38144329896907214, - "grad_norm": 0.4475913643836975, - "learning_rate": 9.983218008802648e-05, - "loss": 0.8086, - "step": 74 - }, - { - "epoch": 0.3865979381443299, - "grad_norm": 0.4115805923938751, - "learning_rate": 9.981791241257e-05, - "loss": 0.8963, - "step": 75 - }, - { - "epoch": 0.3917525773195876, - "grad_norm": 0.3336406648159027, - "learning_rate": 9.98030638266577e-05, - "loss": 0.6722, - "step": 76 - }, - { - "epoch": 0.39690721649484534, - "grad_norm": 0.4010220766067505, - "learning_rate": 9.978763450343407e-05, - "loss": 0.9867, - "step": 77 - }, - { - "epoch": 0.4020618556701031, - "grad_norm": 0.45894166827201843, - "learning_rate": 9.977162462281544e-05, - "loss": 0.8535, - "step": 78 - }, - { - "epoch": 0.4072164948453608, - "grad_norm": 0.36226922273635864, - "learning_rate": 9.975503437148783e-05, - "loss": 0.715, - "step": 79 - }, - { - "epoch": 0.41237113402061853, - "grad_norm": 0.4665627181529999, - "learning_rate": 9.973786394290474e-05, - "loss": 0.8986, - "step": 80 - }, - { - "epoch": 0.4175257731958763, - "grad_norm": 0.38297462463378906, - "learning_rate": 9.972011353728496e-05, - "loss": 0.6301, - "step": 81 - }, - { - "epoch": 0.422680412371134, - "grad_norm": 0.42398950457572937, - "learning_rate": 9.970178336161018e-05, - "loss": 0.8692, - "step": 82 - }, - { - "epoch": 0.42783505154639173, - "grad_norm": 0.43351009488105774, - "learning_rate": 9.968287362962264e-05, - "loss": 0.8294, - "step": 83 - }, - { - "epoch": 0.4329896907216495, - "grad_norm": 0.38560381531715393, - "learning_rate": 9.96633845618225e-05, - "loss": 0.6438, - "step": 84 - }, - { - "epoch": 0.4381443298969072, - "grad_norm": 0.38818269968032837, - "learning_rate": 9.96433163854655e-05, - "loss": 0.8707, - "step": 85 - }, - { - "epoch": 0.44329896907216493, - "grad_norm": 0.42916783690452576, - "learning_rate": 9.962266933456008e-05, - "loss": 0.8905, - "step": 86 - }, - { - "epoch": 0.4484536082474227, - "grad_norm": 0.4095560908317566, - "learning_rate": 9.96014436498648e-05, - "loss": 0.7936, - "step": 87 - }, - { - "epoch": 0.4536082474226804, - "grad_norm": 0.3557605445384979, - "learning_rate": 9.957963957888542e-05, - "loss": 0.7018, - "step": 88 - }, - { - "epoch": 0.4587628865979381, - "grad_norm": 0.3835409879684448, - "learning_rate": 9.955725737587214e-05, - "loss": 0.7601, - "step": 89 - }, - { - "epoch": 0.4639175257731959, - "grad_norm": 0.43281444907188416, - "learning_rate": 9.953429730181653e-05, - "loss": 0.8437, - "step": 90 - }, - { - "epoch": 0.4690721649484536, - "grad_norm": 0.5132317543029785, - "learning_rate": 9.951075962444856e-05, - "loss": 0.9834, - "step": 91 - }, - { - "epoch": 0.4742268041237113, - "grad_norm": 0.48250970244407654, - "learning_rate": 9.94866446182334e-05, - "loss": 0.9003, - "step": 92 - }, - { - "epoch": 0.4793814432989691, - "grad_norm": 0.4286979138851166, - "learning_rate": 9.94619525643683e-05, - "loss": 0.9507, - "step": 93 - }, - { - "epoch": 0.4845360824742268, - "grad_norm": 0.3736780285835266, - "learning_rate": 9.943668375077925e-05, - "loss": 0.8167, - "step": 94 - }, - { - "epoch": 0.4896907216494845, - "grad_norm": 0.39715173840522766, - "learning_rate": 9.941083847211765e-05, - "loss": 0.7804, - "step": 95 - }, - { - "epoch": 0.4948453608247423, - "grad_norm": 0.4031369686126709, - "learning_rate": 9.938441702975689e-05, - "loss": 0.8113, - "step": 96 - }, - { - "epoch": 0.5, - "grad_norm": 0.39510655403137207, - "learning_rate": 9.93574197317888e-05, - "loss": 0.7941, - "step": 97 - }, - { - "epoch": 0.5, - "eval_loss": 0.8026774525642395, - "eval_runtime": 23.1438, - "eval_samples_per_second": 7.086, - "eval_steps_per_second": 1.772, - "step": 97 - }, - { - "epoch": 0.5051546391752577, - "grad_norm": 0.5623717308044434, - "learning_rate": 9.93298468930201e-05, - "loss": 0.9629, - "step": 98 - }, - { - "epoch": 0.5103092783505154, - "grad_norm": 0.3663593530654907, - "learning_rate": 9.930169883496867e-05, - "loss": 0.7471, - "step": 99 - }, - { - "epoch": 0.5154639175257731, - "grad_norm": 0.41299688816070557, - "learning_rate": 9.927297588585984e-05, - "loss": 0.6697, - "step": 100 - }, - { - "epoch": 0.520618556701031, - "grad_norm": 0.41344979405403137, - "learning_rate": 9.924367838062259e-05, - "loss": 0.6884, - "step": 101 - }, - { - "epoch": 0.5257731958762887, - "grad_norm": 0.4803592562675476, - "learning_rate": 9.921380666088558e-05, - "loss": 0.807, - "step": 102 - }, - { - "epoch": 0.5309278350515464, - "grad_norm": 0.3681694567203522, - "learning_rate": 9.91833610749732e-05, - "loss": 0.8595, - "step": 103 - }, - { - "epoch": 0.5360824742268041, - "grad_norm": 0.3941933810710907, - "learning_rate": 9.915234197790152e-05, - "loss": 0.7553, - "step": 104 - }, - { - "epoch": 0.5412371134020618, - "grad_norm": 0.4864148199558258, - "learning_rate": 9.912074973137412e-05, - "loss": 0.9059, - "step": 105 - }, - { - "epoch": 0.5463917525773195, - "grad_norm": 0.4248103201389313, - "learning_rate": 9.908858470377793e-05, - "loss": 0.7479, - "step": 106 - }, - { - "epoch": 0.5515463917525774, - "grad_norm": 0.39183953404426575, - "learning_rate": 9.905584727017884e-05, - "loss": 0.8723, - "step": 107 - }, - { - "epoch": 0.5567010309278351, - "grad_norm": 0.35667285323143005, - "learning_rate": 9.90225378123174e-05, - "loss": 0.7337, - "step": 108 - }, - { - "epoch": 0.5618556701030928, - "grad_norm": 0.33024296164512634, - "learning_rate": 9.898865671860438e-05, - "loss": 0.6999, - "step": 109 - }, - { - "epoch": 0.5670103092783505, - "grad_norm": 0.5383896827697754, - "learning_rate": 9.895420438411616e-05, - "loss": 0.7962, - "step": 110 - }, - { - "epoch": 0.5721649484536082, - "grad_norm": 0.41683122515678406, - "learning_rate": 9.891918121059019e-05, - "loss": 0.8638, - "step": 111 - }, - { - "epoch": 0.5773195876288659, - "grad_norm": 0.37739497423171997, - "learning_rate": 9.888358760642029e-05, - "loss": 0.65, - "step": 112 - }, - { - "epoch": 0.5824742268041238, - "grad_norm": 0.5009469985961914, - "learning_rate": 9.884742398665191e-05, - "loss": 0.8071, - "step": 113 - }, - { - "epoch": 0.5876288659793815, - "grad_norm": 0.38047826290130615, - "learning_rate": 9.881069077297723e-05, - "loss": 0.702, - "step": 114 - }, - { - "epoch": 0.5927835051546392, - "grad_norm": 0.42374080419540405, - "learning_rate": 9.877338839373032e-05, - "loss": 0.761, - "step": 115 - }, - { - "epoch": 0.5979381443298969, - "grad_norm": 0.45142003893852234, - "learning_rate": 9.873551728388203e-05, - "loss": 0.7846, - "step": 116 - }, - { - "epoch": 0.6030927835051546, - "grad_norm": 0.4653826653957367, - "learning_rate": 9.869707788503508e-05, - "loss": 0.8521, - "step": 117 - }, - { - "epoch": 0.6082474226804123, - "grad_norm": 0.40280038118362427, - "learning_rate": 9.865807064541877e-05, - "loss": 0.7892, - "step": 118 - }, - { - "epoch": 0.6134020618556701, - "grad_norm": 0.38454410433769226, - "learning_rate": 9.861849601988383e-05, - "loss": 0.7835, - "step": 119 - }, - { - "epoch": 0.6185567010309279, - "grad_norm": 0.43196186423301697, - "learning_rate": 9.857835446989707e-05, - "loss": 0.7327, - "step": 120 - }, - { - "epoch": 0.6237113402061856, - "grad_norm": 0.4396250247955322, - "learning_rate": 9.853764646353605e-05, - "loss": 0.9224, - "step": 121 - }, - { - "epoch": 0.6288659793814433, - "grad_norm": 0.34583795070648193, - "learning_rate": 9.849637247548356e-05, - "loss": 0.6383, - "step": 122 - }, - { - "epoch": 0.634020618556701, - "grad_norm": 0.49787506461143494, - "learning_rate": 9.845453298702216e-05, - "loss": 0.8184, - "step": 123 - }, - { - "epoch": 0.6391752577319587, - "grad_norm": 0.37390121817588806, - "learning_rate": 9.841212848602846e-05, - "loss": 0.6893, - "step": 124 - }, - { - "epoch": 0.6443298969072165, - "grad_norm": 0.4788094460964203, - "learning_rate": 9.836915946696759e-05, - "loss": 0.7379, - "step": 125 - }, - { - "epoch": 0.6494845360824743, - "grad_norm": 0.5014426112174988, - "learning_rate": 9.832562643088724e-05, - "loss": 0.8205, - "step": 126 - }, - { - "epoch": 0.654639175257732, - "grad_norm": 0.3642871081829071, - "learning_rate": 9.828152988541201e-05, - "loss": 0.6653, - "step": 127 - }, - { - "epoch": 0.6597938144329897, - "grad_norm": 0.3835623264312744, - "learning_rate": 9.823687034473735e-05, - "loss": 0.7903, - "step": 128 - }, - { - "epoch": 0.6649484536082474, - "grad_norm": 0.36520278453826904, - "learning_rate": 9.81916483296236e-05, - "loss": 0.5541, - "step": 129 - }, - { - "epoch": 0.6701030927835051, - "grad_norm": 0.382394403219223, - "learning_rate": 9.814586436738998e-05, - "loss": 0.8159, - "step": 130 - }, - { - "epoch": 0.6752577319587629, - "grad_norm": 0.4563412070274353, - "learning_rate": 9.809951899190835e-05, - "loss": 0.9263, - "step": 131 - }, - { - "epoch": 0.6804123711340206, - "grad_norm": 0.43446069955825806, - "learning_rate": 9.805261274359705e-05, - "loss": 0.8323, - "step": 132 - }, - { - "epoch": 0.6855670103092784, - "grad_norm": 0.3788876235485077, - "learning_rate": 9.800514616941457e-05, - "loss": 0.8373, - "step": 133 - }, - { - "epoch": 0.6907216494845361, - "grad_norm": 0.32987335324287415, - "learning_rate": 9.795711982285316e-05, - "loss": 0.5943, - "step": 134 - }, - { - "epoch": 0.6958762886597938, - "grad_norm": 0.41327905654907227, - "learning_rate": 9.790853426393245e-05, - "loss": 0.7817, - "step": 135 - }, - { - "epoch": 0.7010309278350515, - "grad_norm": 0.40496692061424255, - "learning_rate": 9.785939005919278e-05, - "loss": 0.7946, - "step": 136 - }, - { - "epoch": 0.7061855670103093, - "grad_norm": 0.305164635181427, - "learning_rate": 9.780968778168874e-05, - "loss": 0.6239, - "step": 137 - }, - { - "epoch": 0.711340206185567, - "grad_norm": 0.5270521640777588, - "learning_rate": 9.77594280109824e-05, - "loss": 0.8353, - "step": 138 - }, - { - "epoch": 0.7164948453608248, - "grad_norm": 0.3462916314601898, - "learning_rate": 9.77086113331366e-05, - "loss": 0.7364, - "step": 139 - }, - { - "epoch": 0.7216494845360825, - "grad_norm": 0.4270078241825104, - "learning_rate": 9.765723834070804e-05, - "loss": 0.6089, - "step": 140 - }, - { - "epoch": 0.7268041237113402, - "grad_norm": 0.42253026366233826, - "learning_rate": 9.760530963274048e-05, - "loss": 0.6619, - "step": 141 - }, - { - "epoch": 0.7319587628865979, - "grad_norm": 0.4140520691871643, - "learning_rate": 9.755282581475769e-05, - "loss": 0.7179, - "step": 142 - }, - { - "epoch": 0.7371134020618557, - "grad_norm": 0.4326084852218628, - "learning_rate": 9.749978749875635e-05, - "loss": 0.7465, - "step": 143 - }, - { - "epoch": 0.7422680412371134, - "grad_norm": 0.44547709822654724, - "learning_rate": 9.744619530319899e-05, - "loss": 0.7606, - "step": 144 - }, - { - "epoch": 0.7474226804123711, - "grad_norm": 0.4291188418865204, - "learning_rate": 9.739204985300679e-05, - "loss": 0.8603, - "step": 145 - }, - { - "epoch": 0.7525773195876289, - "grad_norm": 0.38336077332496643, - "learning_rate": 9.733735177955219e-05, - "loss": 0.7124, - "step": 146 - }, - { - "epoch": 0.7577319587628866, - "grad_norm": 0.49138304591178894, - "learning_rate": 9.728210172065162e-05, - "loss": 0.9644, - "step": 147 - }, - { - "epoch": 0.7628865979381443, - "grad_norm": 0.4559629261493683, - "learning_rate": 9.722630032055803e-05, - "loss": 0.8036, - "step": 148 - }, - { - "epoch": 0.7680412371134021, - "grad_norm": 0.45178917050361633, - "learning_rate": 9.716994822995338e-05, - "loss": 0.9369, - "step": 149 - }, - { - "epoch": 0.7731958762886598, - "grad_norm": 0.3612707257270813, - "learning_rate": 9.711304610594104e-05, - "loss": 0.6617, - "step": 150 - }, - { - "epoch": 0.7783505154639175, - "grad_norm": 0.41716015338897705, - "learning_rate": 9.705559461203815e-05, - "loss": 0.8865, - "step": 151 - }, - { - "epoch": 0.7835051546391752, - "grad_norm": 0.4099236726760864, - "learning_rate": 9.699759441816787e-05, - "loss": 0.7566, - "step": 152 - }, - { - "epoch": 0.788659793814433, - "grad_norm": 0.42841625213623047, - "learning_rate": 9.69390462006516e-05, - "loss": 0.7696, - "step": 153 - }, - { - "epoch": 0.7938144329896907, - "grad_norm": 0.33566102385520935, - "learning_rate": 9.687995064220102e-05, - "loss": 0.7742, - "step": 154 - }, - { - "epoch": 0.7989690721649485, - "grad_norm": 0.4562363624572754, - "learning_rate": 9.682030843191022e-05, - "loss": 0.8731, - "step": 155 - }, - { - "epoch": 0.8041237113402062, - "grad_norm": 0.43077826499938965, - "learning_rate": 9.676012026524755e-05, - "loss": 0.7728, - "step": 156 - }, - { - "epoch": 0.8092783505154639, - "grad_norm": 0.4811527729034424, - "learning_rate": 9.669938684404766e-05, - "loss": 0.9621, - "step": 157 - }, - { - "epoch": 0.8144329896907216, - "grad_norm": 0.340797483921051, - "learning_rate": 9.663810887650318e-05, - "loss": 0.6092, - "step": 158 - }, - { - "epoch": 0.8195876288659794, - "grad_norm": 0.399932324886322, - "learning_rate": 9.657628707715655e-05, - "loss": 0.6474, - "step": 159 - }, - { - "epoch": 0.8247422680412371, - "grad_norm": 0.4639035761356354, - "learning_rate": 9.651392216689165e-05, - "loss": 0.9095, - "step": 160 - }, - { - "epoch": 0.8298969072164949, - "grad_norm": 0.39397144317626953, - "learning_rate": 9.645101487292539e-05, - "loss": 0.7839, - "step": 161 - }, - { - "epoch": 0.8350515463917526, - "grad_norm": 0.45253580808639526, - "learning_rate": 9.638756592879922e-05, - "loss": 0.7803, - "step": 162 - }, - { - "epoch": 0.8402061855670103, - "grad_norm": 0.39097675681114197, - "learning_rate": 9.632357607437065e-05, - "loss": 0.7402, - "step": 163 - }, - { - "epoch": 0.845360824742268, - "grad_norm": 0.3595625162124634, - "learning_rate": 9.625904605580452e-05, - "loss": 0.6507, - "step": 164 - }, - { - "epoch": 0.8505154639175257, - "grad_norm": 0.3774010241031647, - "learning_rate": 9.619397662556435e-05, - "loss": 0.7891, - "step": 165 - }, - { - "epoch": 0.8556701030927835, - "grad_norm": 0.43031737208366394, - "learning_rate": 9.612836854240358e-05, - "loss": 0.8141, - "step": 166 - }, - { - "epoch": 0.8608247422680413, - "grad_norm": 0.43795299530029297, - "learning_rate": 9.606222257135675e-05, - "loss": 0.7407, - "step": 167 - }, - { - "epoch": 0.865979381443299, - "grad_norm": 0.43038204312324524, - "learning_rate": 9.599553948373045e-05, - "loss": 0.5853, - "step": 168 - }, - { - "epoch": 0.8711340206185567, - "grad_norm": 0.4552323818206787, - "learning_rate": 9.592832005709448e-05, - "loss": 0.8141, - "step": 169 - }, - { - "epoch": 0.8762886597938144, - "grad_norm": 0.4247092604637146, - "learning_rate": 9.586056507527266e-05, - "loss": 0.8336, - "step": 170 - }, - { - "epoch": 0.8814432989690721, - "grad_norm": 0.4271308481693268, - "learning_rate": 9.579227532833377e-05, - "loss": 0.8686, - "step": 171 - }, - { - "epoch": 0.8865979381443299, - "grad_norm": 0.49759241938591003, - "learning_rate": 9.572345161258235e-05, - "loss": 0.8467, - "step": 172 - }, - { - "epoch": 0.8917525773195877, - "grad_norm": 0.38598212599754333, - "learning_rate": 9.565409473054932e-05, - "loss": 0.6457, - "step": 173 - }, - { - "epoch": 0.8969072164948454, - "grad_norm": 0.3519401550292969, - "learning_rate": 9.558420549098268e-05, - "loss": 0.8855, - "step": 174 - }, - { - "epoch": 0.9020618556701031, - "grad_norm": 0.4537753462791443, - "learning_rate": 9.551378470883812e-05, - "loss": 0.7518, - "step": 175 - }, - { - "epoch": 0.9072164948453608, - "grad_norm": 0.4422832429409027, - "learning_rate": 9.544283320526943e-05, - "loss": 0.8546, - "step": 176 - }, - { - "epoch": 0.9123711340206185, - "grad_norm": 0.43420276045799255, - "learning_rate": 9.537135180761903e-05, - "loss": 0.8066, - "step": 177 - }, - { - "epoch": 0.9175257731958762, - "grad_norm": 0.5223539471626282, - "learning_rate": 9.52993413494082e-05, - "loss": 0.922, - "step": 178 - }, - { - "epoch": 0.9226804123711341, - "grad_norm": 0.32053807377815247, - "learning_rate": 9.522680267032742e-05, - "loss": 0.6183, - "step": 179 - }, - { - "epoch": 0.9278350515463918, - "grad_norm": 0.4255862236022949, - "learning_rate": 9.515373661622664e-05, - "loss": 0.8405, - "step": 180 - }, - { - "epoch": 0.9329896907216495, - "grad_norm": 0.4862018823623657, - "learning_rate": 9.508014403910533e-05, - "loss": 0.8749, - "step": 181 - }, - { - "epoch": 0.9381443298969072, - "grad_norm": 0.38821473717689514, - "learning_rate": 9.500602579710256e-05, - "loss": 0.8354, - "step": 182 - }, - { - "epoch": 0.9432989690721649, - "grad_norm": 0.416502982378006, - "learning_rate": 9.4931382754487e-05, - "loss": 0.7721, - "step": 183 - }, - { - "epoch": 0.9484536082474226, - "grad_norm": 0.38249465823173523, - "learning_rate": 9.485621578164689e-05, - "loss": 0.6713, - "step": 184 - }, - { - "epoch": 0.9536082474226805, - "grad_norm": 0.4525269567966461, - "learning_rate": 9.478052575507982e-05, - "loss": 0.7183, - "step": 185 - }, - { - "epoch": 0.9587628865979382, - "grad_norm": 0.35123562812805176, - "learning_rate": 9.470431355738257e-05, - "loss": 0.7076, - "step": 186 - }, - { - "epoch": 0.9639175257731959, - "grad_norm": 0.3771204948425293, - "learning_rate": 9.46275800772407e-05, - "loss": 0.827, - "step": 187 - }, - { - "epoch": 0.9690721649484536, - "grad_norm": 0.39616143703460693, - "learning_rate": 9.45503262094184e-05, - "loss": 0.7204, - "step": 188 - }, - { - "epoch": 0.9742268041237113, - "grad_norm": 0.40495437383651733, - "learning_rate": 9.447255285474783e-05, - "loss": 0.739, - "step": 189 - }, - { - "epoch": 0.979381443298969, - "grad_norm": 0.44291606545448303, - "learning_rate": 9.439426092011875e-05, - "loss": 0.9957, - "step": 190 - }, - { - "epoch": 0.9845360824742269, - "grad_norm": 0.39156320691108704, - "learning_rate": 9.431545131846797e-05, - "loss": 0.8605, - "step": 191 - }, - { - "epoch": 0.9896907216494846, - "grad_norm": 0.5261005163192749, - "learning_rate": 9.423612496876855e-05, - "loss": 0.7107, - "step": 192 - }, - { - "epoch": 0.9948453608247423, - "grad_norm": 0.39191609621047974, - "learning_rate": 9.415628279601923e-05, - "loss": 0.7519, - "step": 193 - }, - { - "epoch": 1.0, - "grad_norm": 0.5396644473075867, - "learning_rate": 9.407592573123358e-05, - "loss": 0.7442, - "step": 194 - }, - { - "epoch": 1.0, - "eval_loss": 0.7816203236579895, - "eval_runtime": 23.1628, - "eval_samples_per_second": 7.08, - "eval_steps_per_second": 1.77, - "step": 194 - }, - { - "epoch": 1.0051546391752577, - "grad_norm": 0.37258586287498474, - "learning_rate": 9.39950547114292e-05, - "loss": 0.8192, - "step": 195 - }, - { - "epoch": 1.0103092783505154, - "grad_norm": 0.3827040493488312, - "learning_rate": 9.39136706796167e-05, - "loss": 0.6509, - "step": 196 - }, - { - "epoch": 1.0154639175257731, - "grad_norm": 0.3988623023033142, - "learning_rate": 9.383177458478878e-05, - "loss": 0.7298, - "step": 197 - }, - { - "epoch": 1.0206185567010309, - "grad_norm": 0.40072330832481384, - "learning_rate": 9.374936738190914e-05, - "loss": 0.7147, - "step": 198 - }, - { - "epoch": 1.0257731958762886, - "grad_norm": 0.45648422837257385, - "learning_rate": 9.366645003190132e-05, - "loss": 0.7607, - "step": 199 - }, - { - "epoch": 1.0309278350515463, - "grad_norm": 0.45081812143325806, - "learning_rate": 9.358302350163757e-05, - "loss": 0.9211, - "step": 200 - }, - { - "epoch": 1.0360824742268042, - "grad_norm": 0.4077489674091339, - "learning_rate": 9.349908876392748e-05, - "loss": 0.6459, - "step": 201 - }, - { - "epoch": 1.041237113402062, - "grad_norm": 0.47869938611984253, - "learning_rate": 9.341464679750669e-05, - "loss": 0.7938, - "step": 202 - }, - { - "epoch": 1.0463917525773196, - "grad_norm": 0.45740288496017456, - "learning_rate": 9.33296985870255e-05, - "loss": 0.7555, - "step": 203 - }, - { - "epoch": 1.0515463917525774, - "grad_norm": 0.3793458938598633, - "learning_rate": 9.32442451230373e-05, - "loss": 0.6282, - "step": 204 - }, - { - "epoch": 1.056701030927835, - "grad_norm": 0.4383585751056671, - "learning_rate": 9.315828740198714e-05, - "loss": 0.808, - "step": 205 - }, - { - "epoch": 1.0618556701030928, - "grad_norm": 0.36687490344047546, - "learning_rate": 9.30718264262e-05, - "loss": 0.8306, - "step": 206 - }, - { - "epoch": 1.0670103092783505, - "grad_norm": 0.3878268599510193, - "learning_rate": 9.298486320386919e-05, - "loss": 0.6411, - "step": 207 - }, - { - "epoch": 1.0721649484536082, - "grad_norm": 0.41478773951530457, - "learning_rate": 9.289739874904449e-05, - "loss": 0.6118, - "step": 208 - }, - { - "epoch": 1.077319587628866, - "grad_norm": 0.3920111358165741, - "learning_rate": 9.280943408162046e-05, - "loss": 0.6645, - "step": 209 - }, - { - "epoch": 1.0824742268041236, - "grad_norm": 0.44977059960365295, - "learning_rate": 9.272097022732443e-05, - "loss": 0.7551, - "step": 210 - }, - { - "epoch": 1.0876288659793814, - "grad_norm": 0.4331608712673187, - "learning_rate": 9.263200821770461e-05, - "loss": 0.7164, - "step": 211 - }, - { - "epoch": 1.0927835051546393, - "grad_norm": 0.47665220499038696, - "learning_rate": 9.254254909011804e-05, - "loss": 0.8104, - "step": 212 - }, - { - "epoch": 1.097938144329897, - "grad_norm": 0.42196914553642273, - "learning_rate": 9.245259388771845e-05, - "loss": 0.7194, - "step": 213 - }, - { - "epoch": 1.1030927835051547, - "grad_norm": 0.42440083622932434, - "learning_rate": 9.236214365944418e-05, - "loss": 0.7578, - "step": 214 - }, - { - "epoch": 1.1082474226804124, - "grad_norm": 0.38077446818351746, - "learning_rate": 9.22711994600059e-05, - "loss": 0.6378, - "step": 215 - }, - { - "epoch": 1.1134020618556701, - "grad_norm": 0.3987003564834595, - "learning_rate": 9.217976234987428e-05, - "loss": 0.7715, - "step": 216 - }, - { - "epoch": 1.1185567010309279, - "grad_norm": 0.3989262282848358, - "learning_rate": 9.208783339526773e-05, - "loss": 0.5646, - "step": 217 - }, - { - "epoch": 1.1237113402061856, - "grad_norm": 0.3729444742202759, - "learning_rate": 9.199541366813982e-05, - "loss": 0.7266, - "step": 218 - }, - { - "epoch": 1.1288659793814433, - "grad_norm": 0.487589031457901, - "learning_rate": 9.190250424616693e-05, - "loss": 0.7127, - "step": 219 - }, - { - "epoch": 1.134020618556701, - "grad_norm": 0.47167909145355225, - "learning_rate": 9.180910621273555e-05, - "loss": 0.681, - "step": 220 - }, - { - "epoch": 1.1391752577319587, - "grad_norm": 0.40272852778434753, - "learning_rate": 9.171522065692975e-05, - "loss": 0.7188, - "step": 221 - }, - { - "epoch": 1.1443298969072164, - "grad_norm": 0.48004817962646484, - "learning_rate": 9.162084867351842e-05, - "loss": 0.6727, - "step": 222 - }, - { - "epoch": 1.1494845360824741, - "grad_norm": 0.35861465334892273, - "learning_rate": 9.152599136294253e-05, - "loss": 0.5481, - "step": 223 - }, - { - "epoch": 1.1546391752577319, - "grad_norm": 0.4914831221103668, - "learning_rate": 9.14306498313023e-05, - "loss": 0.7459, - "step": 224 - }, - { - "epoch": 1.1597938144329896, - "grad_norm": 0.4673561751842499, - "learning_rate": 9.133482519034428e-05, - "loss": 0.7004, - "step": 225 - }, - { - "epoch": 1.1649484536082475, - "grad_norm": 0.4495156407356262, - "learning_rate": 9.123851855744843e-05, - "loss": 0.751, - "step": 226 - }, - { - "epoch": 1.1701030927835052, - "grad_norm": 0.4275054335594177, - "learning_rate": 9.114173105561501e-05, - "loss": 0.7959, - "step": 227 - }, - { - "epoch": 1.175257731958763, - "grad_norm": 0.3814803659915924, - "learning_rate": 9.104446381345159e-05, - "loss": 0.7069, - "step": 228 - }, - { - "epoch": 1.1804123711340206, - "grad_norm": 0.3911239206790924, - "learning_rate": 9.094671796515978e-05, - "loss": 0.627, - "step": 229 - }, - { - "epoch": 1.1855670103092784, - "grad_norm": 0.5213860273361206, - "learning_rate": 9.08484946505221e-05, - "loss": 0.7112, - "step": 230 - }, - { - "epoch": 1.190721649484536, - "grad_norm": 0.41048097610473633, - "learning_rate": 9.074979501488867e-05, - "loss": 0.588, - "step": 231 - }, - { - "epoch": 1.1958762886597938, - "grad_norm": 0.40846139192581177, - "learning_rate": 9.065062020916377e-05, - "loss": 0.6034, - "step": 232 - }, - { - "epoch": 1.2010309278350515, - "grad_norm": 0.4402838945388794, - "learning_rate": 9.055097138979252e-05, - "loss": 0.7185, - "step": 233 - }, - { - "epoch": 1.2061855670103092, - "grad_norm": 0.4315119683742523, - "learning_rate": 9.045084971874738e-05, - "loss": 0.6632, - "step": 234 - }, - { - "epoch": 1.211340206185567, - "grad_norm": 0.5140849947929382, - "learning_rate": 9.035025636351452e-05, - "loss": 0.6369, - "step": 235 - }, - { - "epoch": 1.2164948453608249, - "grad_norm": 0.45563650131225586, - "learning_rate": 9.024919249708035e-05, - "loss": 0.7947, - "step": 236 - }, - { - "epoch": 1.2216494845360826, - "grad_norm": 0.5599696636199951, - "learning_rate": 9.014765929791768e-05, - "loss": 0.7507, - "step": 237 - }, - { - "epoch": 1.2268041237113403, - "grad_norm": 0.4429771602153778, - "learning_rate": 9.004565794997209e-05, - "loss": 0.6726, - "step": 238 - }, - { - "epoch": 1.231958762886598, - "grad_norm": 0.3993242084980011, - "learning_rate": 8.994318964264809e-05, - "loss": 0.6158, - "step": 239 - }, - { - "epoch": 1.2371134020618557, - "grad_norm": 0.4936448931694031, - "learning_rate": 8.984025557079523e-05, - "loss": 0.752, - "step": 240 - }, - { - "epoch": 1.2422680412371134, - "grad_norm": 0.5145292282104492, - "learning_rate": 8.973685693469423e-05, - "loss": 0.7791, - "step": 241 - }, - { - "epoch": 1.2474226804123711, - "grad_norm": 0.4600600600242615, - "learning_rate": 8.963299494004291e-05, - "loss": 0.7412, - "step": 242 - }, - { - "epoch": 1.2525773195876289, - "grad_norm": 0.3841022849082947, - "learning_rate": 8.952867079794218e-05, - "loss": 0.5547, - "step": 243 - }, - { - "epoch": 1.2577319587628866, - "grad_norm": 0.42398545145988464, - "learning_rate": 8.942388572488187e-05, - "loss": 0.8999, - "step": 244 - }, - { - "epoch": 1.2628865979381443, - "grad_norm": 0.461775004863739, - "learning_rate": 8.931864094272663e-05, - "loss": 0.5883, - "step": 245 - }, - { - "epoch": 1.268041237113402, - "grad_norm": 0.5585225820541382, - "learning_rate": 8.921293767870157e-05, - "loss": 0.7303, - "step": 246 - }, - { - "epoch": 1.2731958762886597, - "grad_norm": 0.45500585436820984, - "learning_rate": 8.910677716537806e-05, - "loss": 0.8066, - "step": 247 - }, - { - "epoch": 1.2783505154639174, - "grad_norm": 0.49855995178222656, - "learning_rate": 8.900016064065923e-05, - "loss": 0.7256, - "step": 248 - }, - { - "epoch": 1.2835051546391751, - "grad_norm": 0.5525998473167419, - "learning_rate": 8.889308934776572e-05, - "loss": 0.8411, - "step": 249 - }, - { - "epoch": 1.2886597938144329, - "grad_norm": 0.49777719378471375, - "learning_rate": 8.8785564535221e-05, - "loss": 0.8178, - "step": 250 - }, - { - "epoch": 1.2938144329896908, - "grad_norm": 0.48288318514823914, - "learning_rate": 8.867758745683687e-05, - "loss": 0.8036, - "step": 251 - }, - { - "epoch": 1.2989690721649485, - "grad_norm": 0.5231608152389526, - "learning_rate": 8.85691593716989e-05, - "loss": 0.8048, - "step": 252 - }, - { - "epoch": 1.3041237113402062, - "grad_norm": 0.4988687038421631, - "learning_rate": 8.84602815441517e-05, - "loss": 0.7114, - "step": 253 - }, - { - "epoch": 1.309278350515464, - "grad_norm": 0.47835099697113037, - "learning_rate": 8.835095524378414e-05, - "loss": 0.8064, - "step": 254 - }, - { - "epoch": 1.3144329896907216, - "grad_norm": 0.4335271716117859, - "learning_rate": 8.824118174541464e-05, - "loss": 0.6845, - "step": 255 - }, - { - "epoch": 1.3195876288659794, - "grad_norm": 0.5307462215423584, - "learning_rate": 8.81309623290762e-05, - "loss": 0.8478, - "step": 256 - }, - { - "epoch": 1.324742268041237, - "grad_norm": 0.4708554446697235, - "learning_rate": 8.802029828000156e-05, - "loss": 0.8098, - "step": 257 - }, - { - "epoch": 1.3298969072164948, - "grad_norm": 0.4892447590827942, - "learning_rate": 8.790919088860814e-05, - "loss": 0.7612, - "step": 258 - }, - { - "epoch": 1.3350515463917525, - "grad_norm": 0.5239393711090088, - "learning_rate": 8.779764145048308e-05, - "loss": 0.7195, - "step": 259 - }, - { - "epoch": 1.3402061855670104, - "grad_norm": 0.5320796966552734, - "learning_rate": 8.768565126636806e-05, - "loss": 0.7106, - "step": 260 - }, - { - "epoch": 1.3453608247422681, - "grad_norm": 0.48917362093925476, - "learning_rate": 8.757322164214413e-05, - "loss": 0.6933, - "step": 261 - }, - { - "epoch": 1.3505154639175259, - "grad_norm": 0.5254843831062317, - "learning_rate": 8.746035388881655e-05, - "loss": 0.8247, - "step": 262 - }, - { - "epoch": 1.3556701030927836, - "grad_norm": 0.4948481619358063, - "learning_rate": 8.734704932249944e-05, - "loss": 0.9323, - "step": 263 - }, - { - "epoch": 1.3608247422680413, - "grad_norm": 0.5369058847427368, - "learning_rate": 8.723330926440045e-05, - "loss": 0.8482, - "step": 264 - }, - { - "epoch": 1.365979381443299, - "grad_norm": 0.48337045311927795, - "learning_rate": 8.711913504080534e-05, - "loss": 0.6482, - "step": 265 - }, - { - "epoch": 1.3711340206185567, - "grad_norm": 0.4217570722103119, - "learning_rate": 8.70045279830626e-05, - "loss": 0.6593, - "step": 266 - }, - { - "epoch": 1.3762886597938144, - "grad_norm": 0.43802469968795776, - "learning_rate": 8.688948942756778e-05, - "loss": 0.6567, - "step": 267 - }, - { - "epoch": 1.3814432989690721, - "grad_norm": 0.5340169072151184, - "learning_rate": 8.677402071574805e-05, - "loss": 0.6976, - "step": 268 - }, - { - "epoch": 1.3865979381443299, - "grad_norm": 0.5338581800460815, - "learning_rate": 8.665812319404643e-05, - "loss": 0.6684, - "step": 269 - }, - { - "epoch": 1.3917525773195876, - "grad_norm": 0.5673936009407043, - "learning_rate": 8.654179821390621e-05, - "loss": 0.8238, - "step": 270 - }, - { - "epoch": 1.3969072164948453, - "grad_norm": 0.3845212459564209, - "learning_rate": 8.642504713175508e-05, - "loss": 0.7176, - "step": 271 - }, - { - "epoch": 1.402061855670103, - "grad_norm": 0.5132887959480286, - "learning_rate": 8.630787130898943e-05, - "loss": 0.819, - "step": 272 - }, - { - "epoch": 1.4072164948453607, - "grad_norm": 0.4670240879058838, - "learning_rate": 8.619027211195836e-05, - "loss": 0.759, - "step": 273 - }, - { - "epoch": 1.4123711340206184, - "grad_norm": 0.4744627773761749, - "learning_rate": 8.607225091194779e-05, - "loss": 0.6824, - "step": 274 - }, - { - "epoch": 1.4175257731958764, - "grad_norm": 0.4141714870929718, - "learning_rate": 8.595380908516454e-05, - "loss": 0.5199, - "step": 275 - }, - { - "epoch": 1.422680412371134, - "grad_norm": 0.4950573444366455, - "learning_rate": 8.583494801272018e-05, - "loss": 0.7358, - "step": 276 - }, - { - "epoch": 1.4278350515463918, - "grad_norm": 0.42697805166244507, - "learning_rate": 8.571566908061497e-05, - "loss": 0.6716, - "step": 277 - }, - { - "epoch": 1.4329896907216495, - "grad_norm": 0.5071122646331787, - "learning_rate": 8.559597367972168e-05, - "loss": 0.7642, - "step": 278 - }, - { - "epoch": 1.4381443298969072, - "grad_norm": 0.5294380187988281, - "learning_rate": 8.547586320576945e-05, - "loss": 0.7594, - "step": 279 - }, - { - "epoch": 1.443298969072165, - "grad_norm": 0.4704250395298004, - "learning_rate": 8.535533905932738e-05, - "loss": 0.6536, - "step": 280 - }, - { - "epoch": 1.4484536082474226, - "grad_norm": 0.4988890290260315, - "learning_rate": 8.52344026457883e-05, - "loss": 0.752, - "step": 281 - }, - { - "epoch": 1.4536082474226804, - "grad_norm": 0.5117019414901733, - "learning_rate": 8.511305537535237e-05, - "loss": 0.6501, - "step": 282 - }, - { - "epoch": 1.458762886597938, - "grad_norm": 0.48722848296165466, - "learning_rate": 8.499129866301057e-05, - "loss": 0.6892, - "step": 283 - }, - { - "epoch": 1.463917525773196, - "grad_norm": 0.4610782563686371, - "learning_rate": 8.48691339285283e-05, - "loss": 0.7259, - "step": 284 - }, - { - "epoch": 1.4690721649484537, - "grad_norm": 0.5337876081466675, - "learning_rate": 8.474656259642873e-05, - "loss": 0.7162, - "step": 285 - }, - { - "epoch": 1.4742268041237114, - "grad_norm": 0.6001775860786438, - "learning_rate": 8.46235860959763e-05, - "loss": 0.8537, - "step": 286 - }, - { - "epoch": 1.4793814432989691, - "grad_norm": 0.5563547015190125, - "learning_rate": 8.450020586115987e-05, - "loss": 0.7413, - "step": 287 - }, - { - "epoch": 1.4845360824742269, - "grad_norm": 0.49136775732040405, - "learning_rate": 8.437642333067625e-05, - "loss": 0.7277, - "step": 288 - }, - { - "epoch": 1.4896907216494846, - "grad_norm": 0.6892454624176025, - "learning_rate": 8.42522399479132e-05, - "loss": 0.7893, - "step": 289 - }, - { - "epoch": 1.4948453608247423, - "grad_norm": 0.41972535848617554, - "learning_rate": 8.412765716093272e-05, - "loss": 0.6497, - "step": 290 - }, - { - "epoch": 1.5, - "grad_norm": 0.5736141800880432, - "learning_rate": 8.40026764224541e-05, - "loss": 0.8443, - "step": 291 - }, - { - "epoch": 1.5, - "eval_loss": 0.775231659412384, - "eval_runtime": 23.1141, - "eval_samples_per_second": 7.095, - "eval_steps_per_second": 1.774, - "step": 291 - }, - { - "epoch": 1.5051546391752577, - "grad_norm": 0.47275158762931824, - "learning_rate": 8.387729918983706e-05, - "loss": 0.5539, - "step": 292 - }, - { - "epoch": 1.5103092783505154, - "grad_norm": 0.5868181586265564, - "learning_rate": 8.375152692506468e-05, - "loss": 0.9621, - "step": 293 - }, - { - "epoch": 1.5154639175257731, - "grad_norm": 0.560245931148529, - "learning_rate": 8.362536109472636e-05, - "loss": 0.8364, - "step": 294 - }, - { - "epoch": 1.5206185567010309, - "grad_norm": 0.5294224619865417, - "learning_rate": 8.349880317000082e-05, - "loss": 0.6673, - "step": 295 - }, - { - "epoch": 1.5257731958762886, - "grad_norm": 0.47846174240112305, - "learning_rate": 8.337185462663878e-05, - "loss": 0.6843, - "step": 296 - }, - { - "epoch": 1.5309278350515463, - "grad_norm": 0.5413031578063965, - "learning_rate": 8.32445169449459e-05, - "loss": 0.9445, - "step": 297 - }, - { - "epoch": 1.536082474226804, - "grad_norm": 0.5185613036155701, - "learning_rate": 8.311679160976539e-05, - "loss": 0.8915, - "step": 298 - }, - { - "epoch": 1.5412371134020617, - "grad_norm": 0.5198041796684265, - "learning_rate": 8.29886801104608e-05, - "loss": 0.7735, - "step": 299 - }, - { - "epoch": 1.5463917525773194, - "grad_norm": 0.5014578104019165, - "learning_rate": 8.286018394089863e-05, - "loss": 0.7663, - "step": 300 - }, - { - "epoch": 1.5515463917525774, - "grad_norm": 0.5157457590103149, - "learning_rate": 8.273130459943086e-05, - "loss": 0.741, - "step": 301 - }, - { - "epoch": 1.556701030927835, - "grad_norm": 0.6345148086547852, - "learning_rate": 8.260204358887754e-05, - "loss": 0.9313, - "step": 302 - }, - { - "epoch": 1.5618556701030928, - "grad_norm": 0.46231186389923096, - "learning_rate": 8.247240241650918e-05, - "loss": 0.6452, - "step": 303 - }, - { - "epoch": 1.5670103092783505, - "grad_norm": 0.5200813412666321, - "learning_rate": 8.234238259402935e-05, - "loss": 0.723, - "step": 304 - }, - { - "epoch": 1.5721649484536082, - "grad_norm": 0.49244582653045654, - "learning_rate": 8.221198563755682e-05, - "loss": 0.6612, - "step": 305 - }, - { - "epoch": 1.577319587628866, - "grad_norm": 0.4823918640613556, - "learning_rate": 8.208121306760805e-05, - "loss": 0.7428, - "step": 306 - }, - { - "epoch": 1.5824742268041239, - "grad_norm": 0.4022305905818939, - "learning_rate": 8.195006640907942e-05, - "loss": 0.5851, - "step": 307 - }, - { - "epoch": 1.5876288659793816, - "grad_norm": 0.5390604734420776, - "learning_rate": 8.181854719122939e-05, - "loss": 0.7401, - "step": 308 - }, - { - "epoch": 1.5927835051546393, - "grad_norm": 0.5512533783912659, - "learning_rate": 8.168665694766073e-05, - "loss": 0.8259, - "step": 309 - }, - { - "epoch": 1.597938144329897, - "grad_norm": 0.4972156882286072, - "learning_rate": 8.155439721630264e-05, - "loss": 0.7784, - "step": 310 - }, - { - "epoch": 1.6030927835051547, - "grad_norm": 0.4425565302371979, - "learning_rate": 8.142176953939279e-05, - "loss": 0.6065, - "step": 311 - }, - { - "epoch": 1.6082474226804124, - "grad_norm": 0.4272889196872711, - "learning_rate": 8.128877546345933e-05, - "loss": 0.5773, - "step": 312 - }, - { - "epoch": 1.6134020618556701, - "grad_norm": 0.5591496229171753, - "learning_rate": 8.115541653930286e-05, - "loss": 0.6459, - "step": 313 - }, - { - "epoch": 1.6185567010309279, - "grad_norm": 0.4658234119415283, - "learning_rate": 8.102169432197842e-05, - "loss": 0.7003, - "step": 314 - }, - { - "epoch": 1.6237113402061856, - "grad_norm": 0.4371519684791565, - "learning_rate": 8.088761037077718e-05, - "loss": 0.8428, - "step": 315 - }, - { - "epoch": 1.6288659793814433, - "grad_norm": 0.40482646226882935, - "learning_rate": 8.075316624920848e-05, - "loss": 0.5668, - "step": 316 - }, - { - "epoch": 1.634020618556701, - "grad_norm": 0.5180542469024658, - "learning_rate": 8.061836352498145e-05, - "loss": 0.7009, - "step": 317 - }, - { - "epoch": 1.6391752577319587, - "grad_norm": 0.5090633630752563, - "learning_rate": 8.048320376998673e-05, - "loss": 0.6369, - "step": 318 - }, - { - "epoch": 1.6443298969072164, - "grad_norm": 0.5216387510299683, - "learning_rate": 8.034768856027826e-05, - "loss": 0.8043, - "step": 319 - }, - { - "epoch": 1.6494845360824741, - "grad_norm": 0.6093303561210632, - "learning_rate": 8.021181947605473e-05, - "loss": 0.793, - "step": 320 - }, - { - "epoch": 1.6546391752577319, - "grad_norm": 0.477040559053421, - "learning_rate": 8.007559810164133e-05, - "loss": 0.7163, - "step": 321 - }, - { - "epoch": 1.6597938144329896, - "grad_norm": 0.5674880146980286, - "learning_rate": 7.993902602547113e-05, - "loss": 0.6498, - "step": 322 - }, - { - "epoch": 1.6649484536082473, - "grad_norm": 0.5792298913002014, - "learning_rate": 7.980210484006666e-05, - "loss": 0.7753, - "step": 323 - }, - { - "epoch": 1.670103092783505, - "grad_norm": 0.4890528619289398, - "learning_rate": 7.966483614202128e-05, - "loss": 0.79, - "step": 324 - }, - { - "epoch": 1.675257731958763, - "grad_norm": 0.4526829719543457, - "learning_rate": 7.952722153198054e-05, - "loss": 0.572, - "step": 325 - }, - { - "epoch": 1.6804123711340206, - "grad_norm": 0.5673860907554626, - "learning_rate": 7.938926261462366e-05, - "loss": 0.706, - "step": 326 - }, - { - "epoch": 1.6855670103092784, - "grad_norm": 0.4477851092815399, - "learning_rate": 7.925096099864464e-05, - "loss": 0.553, - "step": 327 - }, - { - "epoch": 1.690721649484536, - "grad_norm": 0.5353184342384338, - "learning_rate": 7.911231829673356e-05, - "loss": 0.7405, - "step": 328 - }, - { - "epoch": 1.6958762886597938, - "grad_norm": 0.4556489586830139, - "learning_rate": 7.897333612555785e-05, - "loss": 0.5462, - "step": 329 - }, - { - "epoch": 1.7010309278350515, - "grad_norm": 0.5018810033798218, - "learning_rate": 7.883401610574336e-05, - "loss": 0.7072, - "step": 330 - }, - { - "epoch": 1.7061855670103094, - "grad_norm": 0.5862541794776917, - "learning_rate": 7.869435986185547e-05, - "loss": 0.8802, - "step": 331 - }, - { - "epoch": 1.7113402061855671, - "grad_norm": 0.491769939661026, - "learning_rate": 7.855436902238017e-05, - "loss": 0.832, - "step": 332 - }, - { - "epoch": 1.7164948453608249, - "grad_norm": 0.46636489033699036, - "learning_rate": 7.841404521970505e-05, - "loss": 0.5585, - "step": 333 - }, - { - "epoch": 1.7216494845360826, - "grad_norm": 0.4456227123737335, - "learning_rate": 7.82733900901003e-05, - "loss": 0.4713, - "step": 334 - }, - { - "epoch": 1.7268041237113403, - "grad_norm": 0.4480153024196625, - "learning_rate": 7.813240527369959e-05, - "loss": 0.6575, - "step": 335 - }, - { - "epoch": 1.731958762886598, - "grad_norm": 0.4899657368659973, - "learning_rate": 7.799109241448091e-05, - "loss": 0.6507, - "step": 336 - }, - { - "epoch": 1.7371134020618557, - "grad_norm": 0.552197277545929, - "learning_rate": 7.784945316024756e-05, - "loss": 0.7384, - "step": 337 - }, - { - "epoch": 1.7422680412371134, - "grad_norm": 0.4821660816669464, - "learning_rate": 7.770748916260875e-05, - "loss": 0.72, - "step": 338 - }, - { - "epoch": 1.7474226804123711, - "grad_norm": 0.5010591745376587, - "learning_rate": 7.756520207696041e-05, - "loss": 0.7132, - "step": 339 - }, - { - "epoch": 1.7525773195876289, - "grad_norm": 0.4646522104740143, - "learning_rate": 7.742259356246593e-05, - "loss": 0.6314, - "step": 340 - }, - { - "epoch": 1.7577319587628866, - "grad_norm": 0.48994216322898865, - "learning_rate": 7.727966528203678e-05, - "loss": 0.5914, - "step": 341 - }, - { - "epoch": 1.7628865979381443, - "grad_norm": 0.4667370319366455, - "learning_rate": 7.71364189023131e-05, - "loss": 0.7351, - "step": 342 - }, - { - "epoch": 1.768041237113402, - "grad_norm": 0.5273435115814209, - "learning_rate": 7.699285609364424e-05, - "loss": 0.7077, - "step": 343 - }, - { - "epoch": 1.7731958762886597, - "grad_norm": 0.6191979050636292, - "learning_rate": 7.68489785300694e-05, - "loss": 0.8277, - "step": 344 - }, - { - "epoch": 1.7783505154639174, - "grad_norm": 0.5085694789886475, - "learning_rate": 7.670478788929802e-05, - "loss": 0.8608, - "step": 345 - }, - { - "epoch": 1.7835051546391751, - "grad_norm": 0.47608375549316406, - "learning_rate": 7.656028585269018e-05, - "loss": 0.6082, - "step": 346 - }, - { - "epoch": 1.7886597938144329, - "grad_norm": 0.5139414072036743, - "learning_rate": 7.641547410523709e-05, - "loss": 0.6301, - "step": 347 - }, - { - "epoch": 1.7938144329896906, - "grad_norm": 0.45298558473587036, - "learning_rate": 7.627035433554138e-05, - "loss": 0.695, - "step": 348 - }, - { - "epoch": 1.7989690721649485, - "grad_norm": 0.5817694664001465, - "learning_rate": 7.612492823579745e-05, - "loss": 0.7001, - "step": 349 - }, - { - "epoch": 1.8041237113402062, - "grad_norm": 0.481673926115036, - "learning_rate": 7.597919750177168e-05, - "loss": 0.6867, - "step": 350 - }, - { - "epoch": 1.809278350515464, - "grad_norm": 0.5240175724029541, - "learning_rate": 7.583316383278273e-05, - "loss": 0.6443, - "step": 351 - }, - { - "epoch": 1.8144329896907216, - "grad_norm": 0.5637550950050354, - "learning_rate": 7.568682893168164e-05, - "loss": 0.7145, - "step": 352 - }, - { - "epoch": 1.8195876288659794, - "grad_norm": 0.4581276476383209, - "learning_rate": 7.554019450483208e-05, - "loss": 0.7877, - "step": 353 - }, - { - "epoch": 1.824742268041237, - "grad_norm": 0.5422832369804382, - "learning_rate": 7.539326226209031e-05, - "loss": 0.6451, - "step": 354 - }, - { - "epoch": 1.829896907216495, - "grad_norm": 0.45504269003868103, - "learning_rate": 7.524603391678541e-05, - "loss": 0.816, - "step": 355 - }, - { - "epoch": 1.8350515463917527, - "grad_norm": 0.5947327017784119, - "learning_rate": 7.509851118569915e-05, - "loss": 0.674, - "step": 356 - }, - { - "epoch": 1.8402061855670104, - "grad_norm": 0.38084176182746887, - "learning_rate": 7.495069578904608e-05, - "loss": 0.5898, - "step": 357 - }, - { - "epoch": 1.8453608247422681, - "grad_norm": 0.4741254448890686, - "learning_rate": 7.48025894504534e-05, - "loss": 0.6011, - "step": 358 - }, - { - "epoch": 1.8505154639175259, - "grad_norm": 0.5517462491989136, - "learning_rate": 7.465419389694092e-05, - "loss": 0.7738, - "step": 359 - }, - { - "epoch": 1.8556701030927836, - "grad_norm": 0.5621674656867981, - "learning_rate": 7.450551085890087e-05, - "loss": 0.8294, - "step": 360 - }, - { - "epoch": 1.8608247422680413, - "grad_norm": 0.4534311592578888, - "learning_rate": 7.435654207007773e-05, - "loss": 0.635, - "step": 361 - }, - { - "epoch": 1.865979381443299, - "grad_norm": 0.5818427205085754, - "learning_rate": 7.420728926754803e-05, - "loss": 0.824, - "step": 362 - }, - { - "epoch": 1.8711340206185567, - "grad_norm": 0.43529012799263, - "learning_rate": 7.405775419170014e-05, - "loss": 0.5617, - "step": 363 - }, - { - "epoch": 1.8762886597938144, - "grad_norm": 0.39464038610458374, - "learning_rate": 7.390793858621386e-05, - "loss": 0.6568, - "step": 364 - }, - { - "epoch": 1.8814432989690721, - "grad_norm": 0.5439510941505432, - "learning_rate": 7.375784419804019e-05, - "loss": 0.6493, - "step": 365 - }, - { - "epoch": 1.8865979381443299, - "grad_norm": 0.44549572467803955, - "learning_rate": 7.360747277738094e-05, - "loss": 0.6491, - "step": 366 - }, - { - "epoch": 1.8917525773195876, - "grad_norm": 0.5053935050964355, - "learning_rate": 7.345682607766826e-05, - "loss": 0.8553, - "step": 367 - }, - { - "epoch": 1.8969072164948453, - "grad_norm": 0.4746989905834198, - "learning_rate": 7.330590585554428e-05, - "loss": 0.7245, - "step": 368 - }, - { - "epoch": 1.902061855670103, - "grad_norm": 0.3898842930793762, - "learning_rate": 7.315471387084056e-05, - "loss": 0.5421, - "step": 369 - }, - { - "epoch": 1.9072164948453607, - "grad_norm": 0.5751209855079651, - "learning_rate": 7.300325188655761e-05, - "loss": 0.815, - "step": 370 - }, - { - "epoch": 1.9123711340206184, - "grad_norm": 0.5123909115791321, - "learning_rate": 7.285152166884432e-05, - "loss": 0.7358, - "step": 371 - }, - { - "epoch": 1.9175257731958761, - "grad_norm": 0.47217753529548645, - "learning_rate": 7.269952498697734e-05, - "loss": 0.7771, - "step": 372 - }, - { - "epoch": 1.922680412371134, - "grad_norm": 0.5343344807624817, - "learning_rate": 7.25472636133405e-05, - "loss": 0.6233, - "step": 373 - }, - { - "epoch": 1.9278350515463918, - "grad_norm": 0.5458410978317261, - "learning_rate": 7.23947393234041e-05, - "loss": 0.8072, - "step": 374 - }, - { - "epoch": 1.9329896907216495, - "grad_norm": 0.6140435338020325, - "learning_rate": 7.224195389570422e-05, - "loss": 0.7253, - "step": 375 - }, - { - "epoch": 1.9381443298969072, - "grad_norm": 0.6178478598594666, - "learning_rate": 7.208890911182197e-05, - "loss": 0.9358, - "step": 376 - }, - { - "epoch": 1.943298969072165, - "grad_norm": 0.5554368495941162, - "learning_rate": 7.193560675636277e-05, - "loss": 0.8622, - "step": 377 - }, - { - "epoch": 1.9484536082474226, - "grad_norm": 0.5233482718467712, - "learning_rate": 7.178204861693545e-05, - "loss": 0.768, - "step": 378 - }, - { - "epoch": 1.9536082474226806, - "grad_norm": 0.4429621994495392, - "learning_rate": 7.162823648413151e-05, - "loss": 0.6282, - "step": 379 - }, - { - "epoch": 1.9587628865979383, - "grad_norm": 0.44777798652648926, - "learning_rate": 7.14741721515041e-05, - "loss": 0.6284, - "step": 380 - }, - { - "epoch": 1.963917525773196, - "grad_norm": 0.4569181501865387, - "learning_rate": 7.131985741554728e-05, - "loss": 0.6296, - "step": 381 - }, - { - "epoch": 1.9690721649484537, - "grad_norm": 0.5347330570220947, - "learning_rate": 7.116529407567489e-05, - "loss": 0.7547, - "step": 382 - }, - { - "epoch": 1.9742268041237114, - "grad_norm": 0.5041331052780151, - "learning_rate": 7.101048393419977e-05, - "loss": 0.6748, - "step": 383 - }, - { - "epoch": 1.9793814432989691, - "grad_norm": 0.487981915473938, - "learning_rate": 7.085542879631253e-05, - "loss": 0.7449, - "step": 384 - }, - { - "epoch": 1.9845360824742269, - "grad_norm": 0.4471663236618042, - "learning_rate": 7.070013047006068e-05, - "loss": 0.6517, - "step": 385 - }, - { - "epoch": 1.9896907216494846, - "grad_norm": 0.4358983039855957, - "learning_rate": 7.054459076632743e-05, - "loss": 0.7218, - "step": 386 - }, - { - "epoch": 1.9948453608247423, - "grad_norm": 0.4451632797718048, - "learning_rate": 7.038881149881058e-05, - "loss": 0.6756, - "step": 387 - }, - { - "epoch": 2.0, - "grad_norm": 0.6731226444244385, - "learning_rate": 7.02327944840015e-05, - "loss": 0.7974, - "step": 388 - }, - { - "epoch": 2.0, - "eval_loss": 0.7645891904830933, - "eval_runtime": 23.115, - "eval_samples_per_second": 7.095, - "eval_steps_per_second": 1.774, - "step": 388 - }, - { - "epoch": 2.0051546391752577, - "grad_norm": 0.46283644437789917, - "learning_rate": 7.007654154116377e-05, - "loss": 0.6405, - "step": 389 - }, - { - "epoch": 2.0103092783505154, - "grad_norm": 0.4309590756893158, - "learning_rate": 6.992005449231208e-05, - "loss": 0.6401, - "step": 390 - }, - { - "epoch": 2.015463917525773, - "grad_norm": 0.5006699562072754, - "learning_rate": 6.976333516219096e-05, - "loss": 0.6664, - "step": 391 - }, - { - "epoch": 2.020618556701031, - "grad_norm": 0.5041174292564392, - "learning_rate": 6.960638537825352e-05, - "loss": 0.587, - "step": 392 - }, - { - "epoch": 2.0257731958762886, - "grad_norm": 0.4952574670314789, - "learning_rate": 6.944920697064004e-05, - "loss": 0.7003, - "step": 393 - }, - { - "epoch": 2.0309278350515463, - "grad_norm": 0.5484703183174133, - "learning_rate": 6.929180177215678e-05, - "loss": 0.6198, - "step": 394 - }, - { - "epoch": 2.036082474226804, - "grad_norm": 0.4701229929924011, - "learning_rate": 6.91341716182545e-05, - "loss": 0.6247, - "step": 395 - }, - { - "epoch": 2.0412371134020617, - "grad_norm": 0.5044362545013428, - "learning_rate": 6.897631834700709e-05, - "loss": 0.5468, - "step": 396 - }, - { - "epoch": 2.0463917525773194, - "grad_norm": 0.5583397746086121, - "learning_rate": 6.881824379909017e-05, - "loss": 0.6784, - "step": 397 - }, - { - "epoch": 2.051546391752577, - "grad_norm": 0.43878433108329773, - "learning_rate": 6.865994981775957e-05, - "loss": 0.5321, - "step": 398 - }, - { - "epoch": 2.056701030927835, - "grad_norm": 0.5170990228652954, - "learning_rate": 6.850143824882986e-05, - "loss": 0.5962, - "step": 399 - }, - { - "epoch": 2.0618556701030926, - "grad_norm": 0.5732204914093018, - "learning_rate": 6.834271094065283e-05, - "loss": 0.7051, - "step": 400 - }, - { - "epoch": 2.0670103092783507, - "grad_norm": 0.5963968634605408, - "learning_rate": 6.818376974409593e-05, - "loss": 0.6314, - "step": 401 - }, - { - "epoch": 2.0721649484536084, - "grad_norm": 0.5836310386657715, - "learning_rate": 6.802461651252073e-05, - "loss": 0.729, - "step": 402 - }, - { - "epoch": 2.077319587628866, - "grad_norm": 0.49259456992149353, - "learning_rate": 6.786525310176123e-05, - "loss": 0.6406, - "step": 403 - }, - { - "epoch": 2.082474226804124, - "grad_norm": 0.5154966115951538, - "learning_rate": 6.770568137010226e-05, - "loss": 0.6839, - "step": 404 - }, - { - "epoch": 2.0876288659793816, - "grad_norm": 0.5059208869934082, - "learning_rate": 6.754590317825785e-05, - "loss": 0.6115, - "step": 405 - }, - { - "epoch": 2.0927835051546393, - "grad_norm": 0.6629896759986877, - "learning_rate": 6.738592038934946e-05, - "loss": 0.7747, - "step": 406 - }, - { - "epoch": 2.097938144329897, - "grad_norm": 0.5021480321884155, - "learning_rate": 6.722573486888427e-05, - "loss": 0.6272, - "step": 407 - }, - { - "epoch": 2.1030927835051547, - "grad_norm": 0.5166428089141846, - "learning_rate": 6.706534848473352e-05, - "loss": 0.7801, - "step": 408 - }, - { - "epoch": 2.1082474226804124, - "grad_norm": 0.6196724772453308, - "learning_rate": 6.69047631071106e-05, - "loss": 0.7115, - "step": 409 - }, - { - "epoch": 2.11340206185567, - "grad_norm": 0.5208120346069336, - "learning_rate": 6.674398060854931e-05, - "loss": 0.7663, - "step": 410 - }, - { - "epoch": 2.118556701030928, - "grad_norm": 0.5327072739601135, - "learning_rate": 6.658300286388203e-05, - "loss": 0.6416, - "step": 411 - }, - { - "epoch": 2.1237113402061856, - "grad_norm": 0.3883470892906189, - "learning_rate": 6.642183175021779e-05, - "loss": 0.5093, - "step": 412 - }, - { - "epoch": 2.1288659793814433, - "grad_norm": 0.6093575358390808, - "learning_rate": 6.62604691469205e-05, - "loss": 0.7734, - "step": 413 - }, - { - "epoch": 2.134020618556701, - "grad_norm": 0.7094951868057251, - "learning_rate": 6.609891693558692e-05, - "loss": 0.536, - "step": 414 - }, - { - "epoch": 2.1391752577319587, - "grad_norm": 0.6357293725013733, - "learning_rate": 6.59371770000248e-05, - "loss": 0.7157, - "step": 415 - }, - { - "epoch": 2.1443298969072164, - "grad_norm": 0.5292159914970398, - "learning_rate": 6.577525122623084e-05, - "loss": 0.6215, - "step": 416 - }, - { - "epoch": 2.149484536082474, - "grad_norm": 0.5167898535728455, - "learning_rate": 6.561314150236882e-05, - "loss": 0.5474, - "step": 417 - }, - { - "epoch": 2.154639175257732, - "grad_norm": 0.6716488599777222, - "learning_rate": 6.545084971874738e-05, - "loss": 0.6841, - "step": 418 - }, - { - "epoch": 2.1597938144329896, - "grad_norm": 0.6054789423942566, - "learning_rate": 6.528837776779819e-05, - "loss": 0.5964, - "step": 419 - }, - { - "epoch": 2.1649484536082473, - "grad_norm": 0.5557959675788879, - "learning_rate": 6.51257275440538e-05, - "loss": 0.6791, - "step": 420 - }, - { - "epoch": 2.170103092783505, - "grad_norm": 0.4804280400276184, - "learning_rate": 6.496290094412546e-05, - "loss": 0.5954, - "step": 421 - }, - { - "epoch": 2.1752577319587627, - "grad_norm": 0.6319596767425537, - "learning_rate": 6.479989986668118e-05, - "loss": 0.6853, - "step": 422 - }, - { - "epoch": 2.1804123711340204, - "grad_norm": 0.5729424953460693, - "learning_rate": 6.463672621242342e-05, - "loss": 0.5408, - "step": 423 - }, - { - "epoch": 2.1855670103092786, - "grad_norm": 0.5444086790084839, - "learning_rate": 6.447338188406704e-05, - "loss": 0.7406, - "step": 424 - }, - { - "epoch": 2.1907216494845363, - "grad_norm": 0.5574979782104492, - "learning_rate": 6.430986878631707e-05, - "loss": 0.521, - "step": 425 - }, - { - "epoch": 2.195876288659794, - "grad_norm": 0.569279134273529, - "learning_rate": 6.41461888258465e-05, - "loss": 0.5863, - "step": 426 - }, - { - "epoch": 2.2010309278350517, - "grad_norm": 0.6239138245582581, - "learning_rate": 6.398234391127406e-05, - "loss": 0.8244, - "step": 427 - }, - { - "epoch": 2.2061855670103094, - "grad_norm": 0.5632001757621765, - "learning_rate": 6.381833595314195e-05, - "loss": 0.6423, - "step": 428 - }, - { - "epoch": 2.211340206185567, - "grad_norm": 0.6038775444030762, - "learning_rate": 6.365416686389358e-05, - "loss": 0.7034, - "step": 429 - }, - { - "epoch": 2.216494845360825, - "grad_norm": 0.45070335268974304, - "learning_rate": 6.348983855785121e-05, - "loss": 0.5302, - "step": 430 - }, - { - "epoch": 2.2216494845360826, - "grad_norm": 0.5909480452537537, - "learning_rate": 6.332535295119377e-05, - "loss": 0.6037, - "step": 431 - }, - { - "epoch": 2.2268041237113403, - "grad_norm": 0.5120466351509094, - "learning_rate": 6.31607119619343e-05, - "loss": 0.5515, - "step": 432 - }, - { - "epoch": 2.231958762886598, - "grad_norm": 0.5868075489997864, - "learning_rate": 6.299591750989779e-05, - "loss": 0.6891, - "step": 433 - }, - { - "epoch": 2.2371134020618557, - "grad_norm": 0.5526292324066162, - "learning_rate": 6.283097151669869e-05, - "loss": 0.6259, - "step": 434 - }, - { - "epoch": 2.2422680412371134, - "grad_norm": 0.6284964084625244, - "learning_rate": 6.266587590571852e-05, - "loss": 0.8128, - "step": 435 - }, - { - "epoch": 2.247422680412371, - "grad_norm": 0.6394625306129456, - "learning_rate": 6.250063260208346e-05, - "loss": 0.7159, - "step": 436 - }, - { - "epoch": 2.252577319587629, - "grad_norm": 0.4809950590133667, - "learning_rate": 6.233524353264187e-05, - "loss": 0.5191, - "step": 437 - }, - { - "epoch": 2.2577319587628866, - "grad_norm": 0.5848096609115601, - "learning_rate": 6.216971062594179e-05, - "loss": 0.6013, - "step": 438 - }, - { - "epoch": 2.2628865979381443, - "grad_norm": 0.6152583360671997, - "learning_rate": 6.200403581220861e-05, - "loss": 0.5314, - "step": 439 - }, - { - "epoch": 2.268041237113402, - "grad_norm": 0.5509049296379089, - "learning_rate": 6.183822102332234e-05, - "loss": 0.6, - "step": 440 - }, - { - "epoch": 2.2731958762886597, - "grad_norm": 0.611883282661438, - "learning_rate": 6.167226819279528e-05, - "loss": 0.7169, - "step": 441 - }, - { - "epoch": 2.2783505154639174, - "grad_norm": 0.6232669353485107, - "learning_rate": 6.150617925574933e-05, - "loss": 0.7636, - "step": 442 - }, - { - "epoch": 2.283505154639175, - "grad_norm": 0.5470356345176697, - "learning_rate": 6.13399561488935e-05, - "loss": 0.6322, - "step": 443 - }, - { - "epoch": 2.288659793814433, - "grad_norm": 0.6054553389549255, - "learning_rate": 6.117360081050136e-05, - "loss": 0.6398, - "step": 444 - }, - { - "epoch": 2.2938144329896906, - "grad_norm": 0.7440930604934692, - "learning_rate": 6.1007115180388285e-05, - "loss": 0.7202, - "step": 445 - }, - { - "epoch": 2.2989690721649483, - "grad_norm": 0.6226584911346436, - "learning_rate": 6.0840501199889046e-05, - "loss": 0.573, - "step": 446 - }, - { - "epoch": 2.304123711340206, - "grad_norm": 0.6319738030433655, - "learning_rate": 6.067376081183499e-05, - "loss": 0.6071, - "step": 447 - }, - { - "epoch": 2.3092783505154637, - "grad_norm": 0.5800845623016357, - "learning_rate": 6.050689596053151e-05, - "loss": 0.6816, - "step": 448 - }, - { - "epoch": 2.3144329896907214, - "grad_norm": 0.5708872079849243, - "learning_rate": 6.0339908591735296e-05, - "loss": 0.6731, - "step": 449 - }, - { - "epoch": 2.319587628865979, - "grad_norm": 0.5780487060546875, - "learning_rate": 6.01728006526317e-05, - "loss": 0.5966, - "step": 450 - }, - { - "epoch": 2.3247422680412373, - "grad_norm": 0.6469062566757202, - "learning_rate": 6.0005574091811964e-05, - "loss": 0.5754, - "step": 451 - }, - { - "epoch": 2.329896907216495, - "grad_norm": 0.5494647026062012, - "learning_rate": 5.9838230859250586e-05, - "loss": 0.5701, - "step": 452 - }, - { - "epoch": 2.3350515463917527, - "grad_norm": 0.5937216877937317, - "learning_rate": 5.967077290628249e-05, - "loss": 0.6324, - "step": 453 - }, - { - "epoch": 2.3402061855670104, - "grad_norm": 0.5645222663879395, - "learning_rate": 5.950320218558037e-05, - "loss": 0.639, - "step": 454 - }, - { - "epoch": 2.345360824742268, - "grad_norm": 0.7451580762863159, - "learning_rate": 5.9335520651131814e-05, - "loss": 0.607, - "step": 455 - }, - { - "epoch": 2.350515463917526, - "grad_norm": 0.5033421516418457, - "learning_rate": 5.9167730258216627e-05, - "loss": 0.5358, - "step": 456 - }, - { - "epoch": 2.3556701030927836, - "grad_norm": 0.6781536936759949, - "learning_rate": 5.899983296338392e-05, - "loss": 0.6033, - "step": 457 - }, - { - "epoch": 2.3608247422680413, - "grad_norm": 0.6157387495040894, - "learning_rate": 5.8831830724429384e-05, - "loss": 0.5855, - "step": 458 - }, - { - "epoch": 2.365979381443299, - "grad_norm": 0.6350586414337158, - "learning_rate": 5.866372550037242e-05, - "loss": 0.5524, - "step": 459 - }, - { - "epoch": 2.3711340206185567, - "grad_norm": 0.5306968688964844, - "learning_rate": 5.849551925143334e-05, - "loss": 0.6551, - "step": 460 - }, - { - "epoch": 2.3762886597938144, - "grad_norm": 0.6288350820541382, - "learning_rate": 5.8327213939010414e-05, - "loss": 0.6348, - "step": 461 - }, - { - "epoch": 2.381443298969072, - "grad_norm": 0.530822217464447, - "learning_rate": 5.815881152565712e-05, - "loss": 0.6298, - "step": 462 - }, - { - "epoch": 2.38659793814433, - "grad_norm": 0.5995349287986755, - "learning_rate": 5.799031397505913e-05, - "loss": 0.6262, - "step": 463 - }, - { - "epoch": 2.3917525773195876, - "grad_norm": 0.512988805770874, - "learning_rate": 5.782172325201155e-05, - "loss": 0.615, - "step": 464 - }, - { - "epoch": 2.3969072164948453, - "grad_norm": 0.5762256979942322, - "learning_rate": 5.7653041322395895e-05, - "loss": 0.5785, - "step": 465 - }, - { - "epoch": 2.402061855670103, - "grad_norm": 0.6673002243041992, - "learning_rate": 5.748427015315722e-05, - "loss": 0.777, - "step": 466 - }, - { - "epoch": 2.4072164948453607, - "grad_norm": 0.5349386930465698, - "learning_rate": 5.7315411712281186e-05, - "loss": 0.5766, - "step": 467 - }, - { - "epoch": 2.4123711340206184, - "grad_norm": 0.5195199847221375, - "learning_rate": 5.714646796877108e-05, - "loss": 0.5243, - "step": 468 - }, - { - "epoch": 2.417525773195876, - "grad_norm": 0.5453920960426331, - "learning_rate": 5.697744089262491e-05, - "loss": 0.5514, - "step": 469 - }, - { - "epoch": 2.422680412371134, - "grad_norm": 0.669133722782135, - "learning_rate": 5.680833245481234e-05, - "loss": 0.5849, - "step": 470 - }, - { - "epoch": 2.4278350515463916, - "grad_norm": 0.6312981843948364, - "learning_rate": 5.6639144627251816e-05, - "loss": 0.7814, - "step": 471 - }, - { - "epoch": 2.4329896907216497, - "grad_norm": 0.5807788968086243, - "learning_rate": 5.646987938278753e-05, - "loss": 0.561, - "step": 472 - }, - { - "epoch": 2.4381443298969074, - "grad_norm": 0.5659583210945129, - "learning_rate": 5.630053869516635e-05, - "loss": 0.5743, - "step": 473 - }, - { - "epoch": 2.443298969072165, - "grad_norm": 0.5923808217048645, - "learning_rate": 5.6131124539014926e-05, - "loss": 0.5651, - "step": 474 - }, - { - "epoch": 2.448453608247423, - "grad_norm": 0.5530904531478882, - "learning_rate": 5.596163888981656e-05, - "loss": 0.6827, - "step": 475 - }, - { - "epoch": 2.4536082474226806, - "grad_norm": 0.5958693623542786, - "learning_rate": 5.5792083723888225e-05, - "loss": 0.6882, - "step": 476 - }, - { - "epoch": 2.4587628865979383, - "grad_norm": 0.5510756373405457, - "learning_rate": 5.5622461018357486e-05, - "loss": 0.6592, - "step": 477 - }, - { - "epoch": 2.463917525773196, - "grad_norm": 0.4968793988227844, - "learning_rate": 5.5452772751139496e-05, - "loss": 0.4862, - "step": 478 - }, - { - "epoch": 2.4690721649484537, - "grad_norm": 0.7546355724334717, - "learning_rate": 5.5283020900913886e-05, - "loss": 0.6766, - "step": 479 - }, - { - "epoch": 2.4742268041237114, - "grad_norm": 0.7161941528320312, - "learning_rate": 5.511320744710171e-05, - "loss": 0.9015, - "step": 480 - }, - { - "epoch": 2.479381443298969, - "grad_norm": 0.690072238445282, - "learning_rate": 5.494333436984238e-05, - "loss": 0.6505, - "step": 481 - }, - { - "epoch": 2.484536082474227, - "grad_norm": 0.5719749927520752, - "learning_rate": 5.477340364997051e-05, - "loss": 0.6629, - "step": 482 - }, - { - "epoch": 2.4896907216494846, - "grad_norm": 0.5597423911094666, - "learning_rate": 5.460341726899291e-05, - "loss": 0.6062, - "step": 483 - }, - { - "epoch": 2.4948453608247423, - "grad_norm": 0.6454108953475952, - "learning_rate": 5.4433377209065414e-05, - "loss": 0.6228, - "step": 484 - }, - { - "epoch": 2.5, - "grad_norm": 0.5420824289321899, - "learning_rate": 5.4263285452969806e-05, - "loss": 0.5897, - "step": 485 - }, - { - "epoch": 2.5, - "eval_loss": 0.7722232341766357, - "eval_runtime": 23.0614, - "eval_samples_per_second": 7.111, - "eval_steps_per_second": 1.778, - "step": 485 - }, - { - "epoch": 2.5051546391752577, - "grad_norm": 0.6164212226867676, - "learning_rate": 5.409314398409067e-05, - "loss": 0.562, - "step": 486 - }, - { - "epoch": 2.5103092783505154, - "grad_norm": 0.5126146078109741, - "learning_rate": 5.392295478639225e-05, - "loss": 0.697, - "step": 487 - }, - { - "epoch": 2.515463917525773, - "grad_norm": 0.6363269686698914, - "learning_rate": 5.3752719844395405e-05, - "loss": 0.7262, - "step": 488 - }, - { - "epoch": 2.520618556701031, - "grad_norm": 0.5842899084091187, - "learning_rate": 5.358244114315434e-05, - "loss": 0.6244, - "step": 489 - }, - { - "epoch": 2.5257731958762886, - "grad_norm": 0.5761100649833679, - "learning_rate": 5.341212066823355e-05, - "loss": 0.6786, - "step": 490 - }, - { - "epoch": 2.5309278350515463, - "grad_norm": 0.6791864037513733, - "learning_rate": 5.324176040568465e-05, - "loss": 0.6026, - "step": 491 - }, - { - "epoch": 2.536082474226804, - "grad_norm": 0.6826097965240479, - "learning_rate": 5.307136234202318e-05, - "loss": 0.6967, - "step": 492 - }, - { - "epoch": 2.5412371134020617, - "grad_norm": 0.5074894428253174, - "learning_rate": 5.290092846420548e-05, - "loss": 0.6554, - "step": 493 - }, - { - "epoch": 2.5463917525773194, - "grad_norm": 0.5714629292488098, - "learning_rate": 5.27304607596055e-05, - "loss": 0.7671, - "step": 494 - }, - { - "epoch": 2.551546391752577, - "grad_norm": 0.5728024244308472, - "learning_rate": 5.255996121599167e-05, - "loss": 0.6811, - "step": 495 - }, - { - "epoch": 2.556701030927835, - "grad_norm": 0.626042902469635, - "learning_rate": 5.2389431821503606e-05, - "loss": 0.6168, - "step": 496 - }, - { - "epoch": 2.5618556701030926, - "grad_norm": 0.6739702224731445, - "learning_rate": 5.221887456462907e-05, - "loss": 0.7175, - "step": 497 - }, - { - "epoch": 2.5670103092783503, - "grad_norm": 0.6577808856964111, - "learning_rate": 5.2048291434180716e-05, - "loss": 0.6459, - "step": 498 - }, - { - "epoch": 2.572164948453608, - "grad_norm": 0.6304759383201599, - "learning_rate": 5.1877684419272875e-05, - "loss": 0.687, - "step": 499 - }, - { - "epoch": 2.5773195876288657, - "grad_norm": 0.6931384801864624, - "learning_rate": 5.1707055509298396e-05, - "loss": 0.6961, - "step": 500 - }, - { - "epoch": 2.582474226804124, - "grad_norm": 0.6150075793266296, - "learning_rate": 5.153640669390546e-05, - "loss": 0.7175, - "step": 501 - }, - { - "epoch": 2.5876288659793816, - "grad_norm": 0.6940797567367554, - "learning_rate": 5.1365739962974304e-05, - "loss": 0.6951, - "step": 502 - }, - { - "epoch": 2.5927835051546393, - "grad_norm": 0.6109462976455688, - "learning_rate": 5.119505730659413e-05, - "loss": 0.5898, - "step": 503 - }, - { - "epoch": 2.597938144329897, - "grad_norm": 0.5795748829841614, - "learning_rate": 5.102436071503982e-05, - "loss": 0.5456, - "step": 504 - }, - { - "epoch": 2.6030927835051547, - "grad_norm": 0.6374852657318115, - "learning_rate": 5.0853652178748746e-05, - "loss": 0.5659, - "step": 505 - }, - { - "epoch": 2.6082474226804124, - "grad_norm": 0.6344237923622131, - "learning_rate": 5.068293368829755e-05, - "loss": 0.6678, - "step": 506 - }, - { - "epoch": 2.61340206185567, - "grad_norm": 0.6286026835441589, - "learning_rate": 5.0512207234379004e-05, - "loss": 0.5786, - "step": 507 - }, - { - "epoch": 2.618556701030928, - "grad_norm": 0.6388340592384338, - "learning_rate": 5.0341474807778663e-05, - "loss": 0.7377, - "step": 508 - }, - { - "epoch": 2.6237113402061856, - "grad_norm": 0.5540049076080322, - "learning_rate": 5.017073839935178e-05, - "loss": 0.6299, - "step": 509 - }, - { - "epoch": 2.6288659793814433, - "grad_norm": 0.7146233320236206, - "learning_rate": 5e-05, - "loss": 0.64, - "step": 510 - }, - { - "epoch": 2.634020618556701, - "grad_norm": 0.5796692967414856, - "learning_rate": 4.982926160064823e-05, - "loss": 0.5692, - "step": 511 - }, - { - "epoch": 2.6391752577319587, - "grad_norm": 0.6053364872932434, - "learning_rate": 4.965852519222134e-05, - "loss": 0.5449, - "step": 512 - }, - { - "epoch": 2.6443298969072164, - "grad_norm": 0.6368183493614197, - "learning_rate": 4.948779276562101e-05, - "loss": 0.6213, - "step": 513 - }, - { - "epoch": 2.649484536082474, - "grad_norm": 0.6399299502372742, - "learning_rate": 4.9317066311702456e-05, - "loss": 0.7879, - "step": 514 - }, - { - "epoch": 2.654639175257732, - "grad_norm": 0.8016248941421509, - "learning_rate": 4.9146347821251266e-05, - "loss": 0.6358, - "step": 515 - }, - { - "epoch": 2.6597938144329896, - "grad_norm": 0.6955182552337646, - "learning_rate": 4.89756392849602e-05, - "loss": 0.7572, - "step": 516 - }, - { - "epoch": 2.6649484536082473, - "grad_norm": 0.7389543056488037, - "learning_rate": 4.880494269340588e-05, - "loss": 0.6965, - "step": 517 - }, - { - "epoch": 2.670103092783505, - "grad_norm": 0.5461128950119019, - "learning_rate": 4.863426003702572e-05, - "loss": 0.6847, - "step": 518 - }, - { - "epoch": 2.675257731958763, - "grad_norm": 0.578420877456665, - "learning_rate": 4.8463593306094555e-05, - "loss": 0.5675, - "step": 519 - }, - { - "epoch": 2.680412371134021, - "grad_norm": 0.4487128257751465, - "learning_rate": 4.829294449070161e-05, - "loss": 0.5949, - "step": 520 - }, - { - "epoch": 2.6855670103092786, - "grad_norm": 0.5900899171829224, - "learning_rate": 4.8122315580727136e-05, - "loss": 0.5375, - "step": 521 - }, - { - "epoch": 2.6907216494845363, - "grad_norm": 0.6414706110954285, - "learning_rate": 4.795170856581929e-05, - "loss": 0.7276, - "step": 522 - }, - { - "epoch": 2.695876288659794, - "grad_norm": 0.6505663394927979, - "learning_rate": 4.778112543537094e-05, - "loss": 0.7086, - "step": 523 - }, - { - "epoch": 2.7010309278350517, - "grad_norm": 0.5941667556762695, - "learning_rate": 4.7610568178496405e-05, - "loss": 0.5269, - "step": 524 - }, - { - "epoch": 2.7061855670103094, - "grad_norm": 0.6392844319343567, - "learning_rate": 4.744003878400835e-05, - "loss": 0.6261, - "step": 525 - }, - { - "epoch": 2.711340206185567, - "grad_norm": 0.608360767364502, - "learning_rate": 4.726953924039451e-05, - "loss": 0.6867, - "step": 526 - }, - { - "epoch": 2.716494845360825, - "grad_norm": 0.5300198197364807, - "learning_rate": 4.709907153579454e-05, - "loss": 0.5922, - "step": 527 - }, - { - "epoch": 2.7216494845360826, - "grad_norm": 0.6723424792289734, - "learning_rate": 4.692863765797683e-05, - "loss": 0.6799, - "step": 528 - }, - { - "epoch": 2.7268041237113403, - "grad_norm": 0.5378847122192383, - "learning_rate": 4.675823959431535e-05, - "loss": 0.5339, - "step": 529 - }, - { - "epoch": 2.731958762886598, - "grad_norm": 0.6574480533599854, - "learning_rate": 4.658787933176646e-05, - "loss": 0.5919, - "step": 530 - }, - { - "epoch": 2.7371134020618557, - "grad_norm": 0.633269727230072, - "learning_rate": 4.641755885684566e-05, - "loss": 0.8605, - "step": 531 - }, - { - "epoch": 2.7422680412371134, - "grad_norm": 0.6804322004318237, - "learning_rate": 4.624728015560461e-05, - "loss": 0.7026, - "step": 532 - }, - { - "epoch": 2.747422680412371, - "grad_norm": 0.6961231231689453, - "learning_rate": 4.607704521360776e-05, - "loss": 0.7013, - "step": 533 - }, - { - "epoch": 2.752577319587629, - "grad_norm": 0.5691372752189636, - "learning_rate": 4.590685601590936e-05, - "loss": 0.5371, - "step": 534 - }, - { - "epoch": 2.7577319587628866, - "grad_norm": 0.7646403908729553, - "learning_rate": 4.57367145470302e-05, - "loss": 0.6463, - "step": 535 - }, - { - "epoch": 2.7628865979381443, - "grad_norm": 0.6509224772453308, - "learning_rate": 4.5566622790934604e-05, - "loss": 0.66, - "step": 536 - }, - { - "epoch": 2.768041237113402, - "grad_norm": 0.6796517968177795, - "learning_rate": 4.5396582731007095e-05, - "loss": 0.7068, - "step": 537 - }, - { - "epoch": 2.7731958762886597, - "grad_norm": 0.6646586060523987, - "learning_rate": 4.52265963500295e-05, - "loss": 0.6509, - "step": 538 - }, - { - "epoch": 2.7783505154639174, - "grad_norm": 0.713420569896698, - "learning_rate": 4.505666563015763e-05, - "loss": 0.5404, - "step": 539 - }, - { - "epoch": 2.783505154639175, - "grad_norm": 0.6599990129470825, - "learning_rate": 4.4886792552898286e-05, - "loss": 0.8083, - "step": 540 - }, - { - "epoch": 2.788659793814433, - "grad_norm": 0.7151883244514465, - "learning_rate": 4.471697909908613e-05, - "loss": 0.6625, - "step": 541 - }, - { - "epoch": 2.7938144329896906, - "grad_norm": 0.6712698936462402, - "learning_rate": 4.454722724886051e-05, - "loss": 0.773, - "step": 542 - }, - { - "epoch": 2.7989690721649483, - "grad_norm": 0.7360748648643494, - "learning_rate": 4.437753898164254e-05, - "loss": 0.7999, - "step": 543 - }, - { - "epoch": 2.804123711340206, - "grad_norm": 0.6323828101158142, - "learning_rate": 4.420791627611179e-05, - "loss": 0.6039, - "step": 544 - }, - { - "epoch": 2.8092783505154637, - "grad_norm": 0.6517678499221802, - "learning_rate": 4.403836111018346e-05, - "loss": 0.7171, - "step": 545 - }, - { - "epoch": 2.8144329896907214, - "grad_norm": 0.5681766271591187, - "learning_rate": 4.3868875460985085e-05, - "loss": 0.51, - "step": 546 - }, - { - "epoch": 2.819587628865979, - "grad_norm": 0.6580605506896973, - "learning_rate": 4.369946130483364e-05, - "loss": 0.6876, - "step": 547 - }, - { - "epoch": 2.824742268041237, - "grad_norm": 0.6996796131134033, - "learning_rate": 4.353012061721249e-05, - "loss": 0.6901, - "step": 548 - }, - { - "epoch": 2.829896907216495, - "grad_norm": 0.8233026266098022, - "learning_rate": 4.336085537274818e-05, - "loss": 0.7917, - "step": 549 - }, - { - "epoch": 2.8350515463917527, - "grad_norm": 0.5782327651977539, - "learning_rate": 4.319166754518768e-05, - "loss": 0.5233, - "step": 550 - }, - { - "epoch": 2.8402061855670104, - "grad_norm": 0.6856580376625061, - "learning_rate": 4.3022559107375106e-05, - "loss": 0.7887, - "step": 551 - }, - { - "epoch": 2.845360824742268, - "grad_norm": 0.5951405167579651, - "learning_rate": 4.285353203122893e-05, - "loss": 0.6688, - "step": 552 - }, - { - "epoch": 2.850515463917526, - "grad_norm": 0.604046642780304, - "learning_rate": 4.268458828771883e-05, - "loss": 0.6287, - "step": 553 - }, - { - "epoch": 2.8556701030927836, - "grad_norm": 0.6061568856239319, - "learning_rate": 4.251572984684281e-05, - "loss": 0.6911, - "step": 554 - }, - { - "epoch": 2.8608247422680413, - "grad_norm": 0.682518720626831, - "learning_rate": 4.234695867760412e-05, - "loss": 0.604, - "step": 555 - }, - { - "epoch": 2.865979381443299, - "grad_norm": 0.6043644547462463, - "learning_rate": 4.2178276747988446e-05, - "loss": 0.7149, - "step": 556 - }, - { - "epoch": 2.8711340206185567, - "grad_norm": 0.6495589017868042, - "learning_rate": 4.200968602494087e-05, - "loss": 0.6281, - "step": 557 - }, - { - "epoch": 2.8762886597938144, - "grad_norm": 0.5073755383491516, - "learning_rate": 4.18411884743429e-05, - "loss": 0.5349, - "step": 558 - }, - { - "epoch": 2.881443298969072, - "grad_norm": 0.7583365440368652, - "learning_rate": 4.16727860609896e-05, - "loss": 0.6454, - "step": 559 - }, - { - "epoch": 2.88659793814433, - "grad_norm": 0.6484652757644653, - "learning_rate": 4.150448074856667e-05, - "loss": 0.7474, - "step": 560 - }, - { - "epoch": 2.8917525773195876, - "grad_norm": 0.7517526149749756, - "learning_rate": 4.1336274499627596e-05, - "loss": 0.7225, - "step": 561 - }, - { - "epoch": 2.8969072164948453, - "grad_norm": 0.7365071773529053, - "learning_rate": 4.1168169275570635e-05, - "loss": 0.6508, - "step": 562 - }, - { - "epoch": 2.902061855670103, - "grad_norm": 0.5838257670402527, - "learning_rate": 4.1000167036616113e-05, - "loss": 0.6031, - "step": 563 - }, - { - "epoch": 2.9072164948453607, - "grad_norm": 0.6092349886894226, - "learning_rate": 4.083226974178339e-05, - "loss": 0.6745, - "step": 564 - }, - { - "epoch": 2.9123711340206184, - "grad_norm": 0.7184647917747498, - "learning_rate": 4.066447934886819e-05, - "loss": 0.6873, - "step": 565 - }, - { - "epoch": 2.917525773195876, - "grad_norm": 0.7946974039077759, - "learning_rate": 4.049679781441965e-05, - "loss": 0.6382, - "step": 566 - }, - { - "epoch": 2.9226804123711343, - "grad_norm": 0.5720714926719666, - "learning_rate": 4.0329227093717515e-05, - "loss": 0.6175, - "step": 567 - }, - { - "epoch": 2.927835051546392, - "grad_norm": 0.5512979626655579, - "learning_rate": 4.016176914074944e-05, - "loss": 0.6814, - "step": 568 - }, - { - "epoch": 2.9329896907216497, - "grad_norm": 0.6203610301017761, - "learning_rate": 3.999442590818804e-05, - "loss": 0.7352, - "step": 569 - }, - { - "epoch": 2.9381443298969074, - "grad_norm": 0.7232295870780945, - "learning_rate": 3.982719934736832e-05, - "loss": 0.7115, - "step": 570 - }, - { - "epoch": 2.943298969072165, - "grad_norm": 0.8409321904182434, - "learning_rate": 3.9660091408264716e-05, - "loss": 0.7007, - "step": 571 - }, - { - "epoch": 2.948453608247423, - "grad_norm": 0.5926352143287659, - "learning_rate": 3.949310403946849e-05, - "loss": 0.6221, - "step": 572 - }, - { - "epoch": 2.9536082474226806, - "grad_norm": 0.731905996799469, - "learning_rate": 3.9326239188165025e-05, - "loss": 0.7804, - "step": 573 - }, - { - "epoch": 2.9587628865979383, - "grad_norm": 0.6734544038772583, - "learning_rate": 3.915949880011096e-05, - "loss": 0.7285, - "step": 574 - }, - { - "epoch": 2.963917525773196, - "grad_norm": 0.5474951863288879, - "learning_rate": 3.899288481961173e-05, - "loss": 0.5515, - "step": 575 - }, - { - "epoch": 2.9690721649484537, - "grad_norm": 0.8142836093902588, - "learning_rate": 3.8826399189498654e-05, - "loss": 0.7919, - "step": 576 - }, - { - "epoch": 2.9742268041237114, - "grad_norm": 0.7191728949546814, - "learning_rate": 3.86600438511065e-05, - "loss": 0.7446, - "step": 577 - }, - { - "epoch": 2.979381443298969, - "grad_norm": 0.6951670050621033, - "learning_rate": 3.8493820744250685e-05, - "loss": 0.7408, - "step": 578 - }, - { - "epoch": 2.984536082474227, - "grad_norm": 0.48555153608322144, - "learning_rate": 3.832773180720475e-05, - "loss": 0.6051, - "step": 579 - }, - { - "epoch": 2.9896907216494846, - "grad_norm": 0.5975886583328247, - "learning_rate": 3.8161778976677666e-05, - "loss": 0.5546, - "step": 580 - }, - { - "epoch": 2.9948453608247423, - "grad_norm": 0.6971765756607056, - "learning_rate": 3.79959641877914e-05, - "loss": 0.5505, - "step": 581 - }, - { - "epoch": 3.0, - "grad_norm": 0.6793974041938782, - "learning_rate": 3.783028937405821e-05, - "loss": 0.5491, - "step": 582 - }, - { - "epoch": 3.0, - "eval_loss": 0.7669576406478882, - "eval_runtime": 23.0068, - "eval_samples_per_second": 7.128, - "eval_steps_per_second": 1.782, - "step": 582 - }, - { - "epoch": 3.0051546391752577, - "grad_norm": 0.589939296245575, - "learning_rate": 3.766475646735815e-05, - "loss": 0.6386, - "step": 583 - }, - { - "epoch": 3.0103092783505154, - "grad_norm": 0.7189310193061829, - "learning_rate": 3.7499367397916555e-05, - "loss": 0.6847, - "step": 584 - }, - { - "epoch": 3.015463917525773, - "grad_norm": 0.5985479950904846, - "learning_rate": 3.733412409428148e-05, - "loss": 0.5704, - "step": 585 - }, - { - "epoch": 3.020618556701031, - "grad_norm": 0.6598166227340698, - "learning_rate": 3.716902848330133e-05, - "loss": 0.5275, - "step": 586 - }, - { - "epoch": 3.0257731958762886, - "grad_norm": 0.6075910329818726, - "learning_rate": 3.7004082490102226e-05, - "loss": 0.6668, - "step": 587 - }, - { - "epoch": 3.0309278350515463, - "grad_norm": 0.5255468487739563, - "learning_rate": 3.6839288038065734e-05, - "loss": 0.5409, - "step": 588 - }, - { - "epoch": 3.036082474226804, - "grad_norm": 0.6277021765708923, - "learning_rate": 3.667464704880625e-05, - "loss": 0.641, - "step": 589 - }, - { - "epoch": 3.0412371134020617, - "grad_norm": 0.6998080611228943, - "learning_rate": 3.651016144214878e-05, - "loss": 0.5058, - "step": 590 - }, - { - "epoch": 3.0463917525773194, - "grad_norm": 0.7781403064727783, - "learning_rate": 3.634583313610644e-05, - "loss": 0.8171, - "step": 591 - }, - { - "epoch": 3.051546391752577, - "grad_norm": 0.740594208240509, - "learning_rate": 3.618166404685805e-05, - "loss": 0.4836, - "step": 592 - }, - { - "epoch": 3.056701030927835, - "grad_norm": 0.5763426423072815, - "learning_rate": 3.601765608872595e-05, - "loss": 0.636, - "step": 593 - }, - { - "epoch": 3.0618556701030926, - "grad_norm": 0.8265470862388611, - "learning_rate": 3.585381117415349e-05, - "loss": 0.6, - "step": 594 - }, - { - "epoch": 3.0670103092783507, - "grad_norm": 0.6171466112136841, - "learning_rate": 3.5690131213682944e-05, - "loss": 0.4655, - "step": 595 - }, - { - "epoch": 3.0721649484536084, - "grad_norm": 0.6962155103683472, - "learning_rate": 3.5526618115932975e-05, - "loss": 0.811, - "step": 596 - }, - { - "epoch": 3.077319587628866, - "grad_norm": 0.6894726157188416, - "learning_rate": 3.53632737875766e-05, - "loss": 0.4777, - "step": 597 - }, - { - "epoch": 3.082474226804124, - "grad_norm": 0.7357553839683533, - "learning_rate": 3.5200100133318834e-05, - "loss": 0.5643, - "step": 598 - }, - { - "epoch": 3.0876288659793816, - "grad_norm": 0.5785653591156006, - "learning_rate": 3.5037099055874536e-05, - "loss": 0.4896, - "step": 599 - }, - { - "epoch": 3.0927835051546393, - "grad_norm": 0.7625001668930054, - "learning_rate": 3.487427245594622e-05, - "loss": 0.5622, - "step": 600 - }, - { - "epoch": 3.097938144329897, - "grad_norm": 0.692327082157135, - "learning_rate": 3.47116222322018e-05, - "loss": 0.5542, - "step": 601 - }, - { - "epoch": 3.1030927835051547, - "grad_norm": 0.6226824522018433, - "learning_rate": 3.4549150281252636e-05, - "loss": 0.5613, - "step": 602 - }, - { - "epoch": 3.1082474226804124, - "grad_norm": 0.6324117183685303, - "learning_rate": 3.4386858497631205e-05, - "loss": 0.5095, - "step": 603 - }, - { - "epoch": 3.11340206185567, - "grad_norm": 0.6947232484817505, - "learning_rate": 3.422474877376917e-05, - "loss": 0.5573, - "step": 604 - }, - { - "epoch": 3.118556701030928, - "grad_norm": 0.6270745992660522, - "learning_rate": 3.406282299997521e-05, - "loss": 0.6132, - "step": 605 - }, - { - "epoch": 3.1237113402061856, - "grad_norm": 0.7347365021705627, - "learning_rate": 3.3901083064413095e-05, - "loss": 0.6026, - "step": 606 - }, - { - "epoch": 3.1288659793814433, - "grad_norm": 0.5122291445732117, - "learning_rate": 3.3739530853079516e-05, - "loss": 0.4393, - "step": 607 - }, - { - "epoch": 3.134020618556701, - "grad_norm": 0.6253957152366638, - "learning_rate": 3.357816824978222e-05, - "loss": 0.5027, - "step": 608 - }, - { - "epoch": 3.1391752577319587, - "grad_norm": 0.6118744015693665, - "learning_rate": 3.341699713611799e-05, - "loss": 0.5994, - "step": 609 - }, - { - "epoch": 3.1443298969072164, - "grad_norm": 0.7655370235443115, - "learning_rate": 3.325601939145069e-05, - "loss": 0.5523, - "step": 610 - }, - { - "epoch": 3.149484536082474, - "grad_norm": 0.7117406725883484, - "learning_rate": 3.309523689288941e-05, - "loss": 0.5523, - "step": 611 - }, - { - "epoch": 3.154639175257732, - "grad_norm": 0.6880069375038147, - "learning_rate": 3.293465151526649e-05, - "loss": 0.6398, - "step": 612 - }, - { - "epoch": 3.1597938144329896, - "grad_norm": 0.6741546988487244, - "learning_rate": 3.277426513111575e-05, - "loss": 0.6644, - "step": 613 - }, - { - "epoch": 3.1649484536082473, - "grad_norm": 0.6650920510292053, - "learning_rate": 3.261407961065056e-05, - "loss": 0.6558, - "step": 614 - }, - { - "epoch": 3.170103092783505, - "grad_norm": 0.6102427840232849, - "learning_rate": 3.245409682174217e-05, - "loss": 0.4753, - "step": 615 - }, - { - "epoch": 3.1752577319587627, - "grad_norm": 0.6675355434417725, - "learning_rate": 3.229431862989775e-05, - "loss": 0.6975, - "step": 616 - }, - { - "epoch": 3.1804123711340204, - "grad_norm": 0.7167260646820068, - "learning_rate": 3.2134746898238774e-05, - "loss": 0.5593, - "step": 617 - }, - { - "epoch": 3.1855670103092786, - "grad_norm": 0.7053158283233643, - "learning_rate": 3.197538348747927e-05, - "loss": 0.5882, - "step": 618 - }, - { - "epoch": 3.1907216494845363, - "grad_norm": 0.5679628849029541, - "learning_rate": 3.181623025590405e-05, - "loss": 0.4235, - "step": 619 - }, - { - "epoch": 3.195876288659794, - "grad_norm": 0.4699335992336273, - "learning_rate": 3.165728905934718e-05, - "loss": 0.3866, - "step": 620 - }, - { - "epoch": 3.2010309278350517, - "grad_norm": 0.7112507224082947, - "learning_rate": 3.149856175117014e-05, - "loss": 0.4572, - "step": 621 - }, - { - "epoch": 3.2061855670103094, - "grad_norm": 0.7620934247970581, - "learning_rate": 3.134005018224044e-05, - "loss": 0.6401, - "step": 622 - }, - { - "epoch": 3.211340206185567, - "grad_norm": 0.8001824021339417, - "learning_rate": 3.118175620090983e-05, - "loss": 0.5794, - "step": 623 - }, - { - "epoch": 3.216494845360825, - "grad_norm": 0.6476466655731201, - "learning_rate": 3.1023681652992926e-05, - "loss": 0.6406, - "step": 624 - }, - { - "epoch": 3.2216494845360826, - "grad_norm": 0.7699551582336426, - "learning_rate": 3.086582838174551e-05, - "loss": 0.6377, - "step": 625 - }, - { - "epoch": 3.2268041237113403, - "grad_norm": 0.6394461393356323, - "learning_rate": 3.070819822784323e-05, - "loss": 0.562, - "step": 626 - }, - { - "epoch": 3.231958762886598, - "grad_norm": 0.7696752548217773, - "learning_rate": 3.055079302935997e-05, - "loss": 0.541, - "step": 627 - }, - { - "epoch": 3.2371134020618557, - "grad_norm": 0.7611710429191589, - "learning_rate": 3.0393614621746498e-05, - "loss": 0.552, - "step": 628 - }, - { - "epoch": 3.2422680412371134, - "grad_norm": 0.7990087270736694, - "learning_rate": 3.023666483780905e-05, - "loss": 0.6217, - "step": 629 - }, - { - "epoch": 3.247422680412371, - "grad_norm": 0.6985018849372864, - "learning_rate": 3.007994550768793e-05, - "loss": 0.554, - "step": 630 - }, - { - "epoch": 3.252577319587629, - "grad_norm": 0.7422522902488708, - "learning_rate": 2.9923458458836258e-05, - "loss": 0.6494, - "step": 631 - }, - { - "epoch": 3.2577319587628866, - "grad_norm": 0.7043977975845337, - "learning_rate": 2.9767205515998518e-05, - "loss": 0.483, - "step": 632 - }, - { - "epoch": 3.2628865979381443, - "grad_norm": 0.9038345813751221, - "learning_rate": 2.9611188501189435e-05, - "loss": 0.6607, - "step": 633 - }, - { - "epoch": 3.268041237113402, - "grad_norm": 0.7127012014389038, - "learning_rate": 2.9455409233672592e-05, - "loss": 0.5094, - "step": 634 - }, - { - "epoch": 3.2731958762886597, - "grad_norm": 0.6978604197502136, - "learning_rate": 2.929986952993933e-05, - "loss": 0.6598, - "step": 635 - }, - { - "epoch": 3.2783505154639174, - "grad_norm": 0.880826473236084, - "learning_rate": 2.9144571203687476e-05, - "loss": 0.5985, - "step": 636 - }, - { - "epoch": 3.283505154639175, - "grad_norm": 0.7255879044532776, - "learning_rate": 2.8989516065800238e-05, - "loss": 0.5601, - "step": 637 - }, - { - "epoch": 3.288659793814433, - "grad_norm": 0.8017705082893372, - "learning_rate": 2.8834705924325118e-05, - "loss": 0.6538, - "step": 638 - }, - { - "epoch": 3.2938144329896906, - "grad_norm": 0.6248070001602173, - "learning_rate": 2.8680142584452742e-05, - "loss": 0.5287, - "step": 639 - }, - { - "epoch": 3.2989690721649483, - "grad_norm": 0.778263509273529, - "learning_rate": 2.8525827848495913e-05, - "loss": 0.6826, - "step": 640 - }, - { - "epoch": 3.304123711340206, - "grad_norm": 0.7028930187225342, - "learning_rate": 2.83717635158685e-05, - "loss": 0.6151, - "step": 641 - }, - { - "epoch": 3.3092783505154637, - "grad_norm": 0.785013735294342, - "learning_rate": 2.8217951383064544e-05, - "loss": 0.7159, - "step": 642 - }, - { - "epoch": 3.3144329896907214, - "grad_norm": 0.686386227607727, - "learning_rate": 2.8064393243637222e-05, - "loss": 0.5513, - "step": 643 - }, - { - "epoch": 3.319587628865979, - "grad_norm": 0.6582889556884766, - "learning_rate": 2.791109088817803e-05, - "loss": 0.4522, - "step": 644 - }, - { - "epoch": 3.3247422680412373, - "grad_norm": 0.6924684047698975, - "learning_rate": 2.7758046104295797e-05, - "loss": 0.6779, - "step": 645 - }, - { - "epoch": 3.329896907216495, - "grad_norm": 0.8219475746154785, - "learning_rate": 2.760526067659591e-05, - "loss": 0.5771, - "step": 646 - }, - { - "epoch": 3.3350515463917527, - "grad_norm": 0.8043853640556335, - "learning_rate": 2.7452736386659516e-05, - "loss": 0.7175, - "step": 647 - }, - { - "epoch": 3.3402061855670104, - "grad_norm": 0.6423444151878357, - "learning_rate": 2.7300475013022663e-05, - "loss": 0.5696, - "step": 648 - }, - { - "epoch": 3.345360824742268, - "grad_norm": 0.6770009994506836, - "learning_rate": 2.7148478331155702e-05, - "loss": 0.501, - "step": 649 - }, - { - "epoch": 3.350515463917526, - "grad_norm": 0.7062271237373352, - "learning_rate": 2.6996748113442394e-05, - "loss": 0.4872, - "step": 650 - }, - { - "epoch": 3.3556701030927836, - "grad_norm": 0.6703154444694519, - "learning_rate": 2.6845286129159464e-05, - "loss": 0.4888, - "step": 651 - }, - { - "epoch": 3.3608247422680413, - "grad_norm": 0.6826891303062439, - "learning_rate": 2.669409414445574e-05, - "loss": 0.5669, - "step": 652 - }, - { - "epoch": 3.365979381443299, - "grad_norm": 0.8772823810577393, - "learning_rate": 2.6543173922331743e-05, - "loss": 0.7018, - "step": 653 - }, - { - "epoch": 3.3711340206185567, - "grad_norm": 0.6661205887794495, - "learning_rate": 2.639252722261908e-05, - "loss": 0.5039, - "step": 654 - }, - { - "epoch": 3.3762886597938144, - "grad_norm": 0.7768886685371399, - "learning_rate": 2.624215580195981e-05, - "loss": 0.5708, - "step": 655 - }, - { - "epoch": 3.381443298969072, - "grad_norm": 0.7129529118537903, - "learning_rate": 2.6092061413786156e-05, - "loss": 0.5247, - "step": 656 - }, - { - "epoch": 3.38659793814433, - "grad_norm": 0.7244031429290771, - "learning_rate": 2.5942245808299886e-05, - "loss": 0.5497, - "step": 657 - }, - { - "epoch": 3.3917525773195876, - "grad_norm": 0.7318927049636841, - "learning_rate": 2.5792710732451997e-05, - "loss": 0.6757, - "step": 658 - }, - { - "epoch": 3.3969072164948453, - "grad_norm": 0.6742333173751831, - "learning_rate": 2.56434579299223e-05, - "loss": 0.6005, - "step": 659 - }, - { - "epoch": 3.402061855670103, - "grad_norm": 0.82306969165802, - "learning_rate": 2.5494489141099153e-05, - "loss": 0.6058, - "step": 660 - }, - { - "epoch": 3.4072164948453607, - "grad_norm": 0.5822560787200928, - "learning_rate": 2.534580610305909e-05, - "loss": 0.4509, - "step": 661 - }, - { - "epoch": 3.4123711340206184, - "grad_norm": 0.6220210790634155, - "learning_rate": 2.5197410549546595e-05, - "loss": 0.3713, - "step": 662 - }, - { - "epoch": 3.417525773195876, - "grad_norm": 0.6950068473815918, - "learning_rate": 2.5049304210953933e-05, - "loss": 0.532, - "step": 663 - }, - { - "epoch": 3.422680412371134, - "grad_norm": 0.735115647315979, - "learning_rate": 2.4901488814300856e-05, - "loss": 0.5285, - "step": 664 - }, - { - "epoch": 3.4278350515463916, - "grad_norm": 0.7899560928344727, - "learning_rate": 2.4753966083214615e-05, - "loss": 0.5996, - "step": 665 - }, - { - "epoch": 3.4329896907216497, - "grad_norm": 0.724655032157898, - "learning_rate": 2.4606737737909697e-05, - "loss": 0.5587, - "step": 666 - }, - { - "epoch": 3.4381443298969074, - "grad_norm": 0.8545822501182556, - "learning_rate": 2.4459805495167942e-05, - "loss": 0.6505, - "step": 667 - }, - { - "epoch": 3.443298969072165, - "grad_norm": 0.7957090735435486, - "learning_rate": 2.4313171068318357e-05, - "loss": 0.6138, - "step": 668 - }, - { - "epoch": 3.448453608247423, - "grad_norm": 0.9017428159713745, - "learning_rate": 2.4166836167217283e-05, - "loss": 0.6846, - "step": 669 - }, - { - "epoch": 3.4536082474226806, - "grad_norm": 0.6370732188224792, - "learning_rate": 2.4020802498228335e-05, - "loss": 0.5674, - "step": 670 - }, - { - "epoch": 3.4587628865979383, - "grad_norm": 0.8172600269317627, - "learning_rate": 2.3875071764202563e-05, - "loss": 0.606, - "step": 671 - }, - { - "epoch": 3.463917525773196, - "grad_norm": 0.8876442909240723, - "learning_rate": 2.3729645664458638e-05, - "loss": 0.6484, - "step": 672 - }, - { - "epoch": 3.4690721649484537, - "grad_norm": 0.6860740184783936, - "learning_rate": 2.3584525894762928e-05, - "loss": 0.4335, - "step": 673 - }, - { - "epoch": 3.4742268041237114, - "grad_norm": 0.6609134674072266, - "learning_rate": 2.3439714147309845e-05, - "loss": 0.5055, - "step": 674 - }, - { - "epoch": 3.479381443298969, - "grad_norm": 0.768539309501648, - "learning_rate": 2.329521211070199e-05, - "loss": 0.5386, - "step": 675 - }, - { - "epoch": 3.484536082474227, - "grad_norm": 0.7136728763580322, - "learning_rate": 2.3151021469930613e-05, - "loss": 0.6505, - "step": 676 - }, - { - "epoch": 3.4896907216494846, - "grad_norm": 0.7553372979164124, - "learning_rate": 2.3007143906355767e-05, - "loss": 0.5787, - "step": 677 - }, - { - "epoch": 3.4948453608247423, - "grad_norm": 0.9047206044197083, - "learning_rate": 2.2863581097686925e-05, - "loss": 0.8126, - "step": 678 - }, - { - "epoch": 3.5, - "grad_norm": 0.665693461894989, - "learning_rate": 2.2720334717963222e-05, - "loss": 0.5814, - "step": 679 - }, - { - "epoch": 3.5, - "eval_loss": 0.7873027920722961, - "eval_runtime": 23.1508, - "eval_samples_per_second": 7.084, - "eval_steps_per_second": 1.771, - "step": 679 - }, - { - "epoch": 3.5051546391752577, - "grad_norm": 0.6601768136024475, - "learning_rate": 2.2577406437534054e-05, - "loss": 0.5339, - "step": 680 - }, - { - "epoch": 3.5103092783505154, - "grad_norm": 0.7132072448730469, - "learning_rate": 2.2434797923039598e-05, - "loss": 0.5322, - "step": 681 - }, - { - "epoch": 3.515463917525773, - "grad_norm": 0.7644983530044556, - "learning_rate": 2.2292510837391267e-05, - "loss": 0.6137, - "step": 682 - }, - { - "epoch": 3.520618556701031, - "grad_norm": 0.6903513669967651, - "learning_rate": 2.2150546839752438e-05, - "loss": 0.6657, - "step": 683 - }, - { - "epoch": 3.5257731958762886, - "grad_norm": 0.840147852897644, - "learning_rate": 2.2008907585519095e-05, - "loss": 0.5655, - "step": 684 - }, - { - "epoch": 3.5309278350515463, - "grad_norm": 0.7994117736816406, - "learning_rate": 2.186759472630045e-05, - "loss": 0.8075, - "step": 685 - }, - { - "epoch": 3.536082474226804, - "grad_norm": 0.7331596612930298, - "learning_rate": 2.172660990989971e-05, - "loss": 0.5532, - "step": 686 - }, - { - "epoch": 3.5412371134020617, - "grad_norm": 0.7885189652442932, - "learning_rate": 2.1585954780294947e-05, - "loss": 0.6312, - "step": 687 - }, - { - "epoch": 3.5463917525773194, - "grad_norm": 0.8121082186698914, - "learning_rate": 2.144563097761984e-05, - "loss": 0.54, - "step": 688 - }, - { - "epoch": 3.551546391752577, - "grad_norm": 0.7569543123245239, - "learning_rate": 2.130564013814453e-05, - "loss": 0.6202, - "step": 689 - }, - { - "epoch": 3.556701030927835, - "grad_norm": 0.6454798579216003, - "learning_rate": 2.1165983894256647e-05, - "loss": 0.5141, - "step": 690 - }, - { - "epoch": 3.5618556701030926, - "grad_norm": 0.8562419414520264, - "learning_rate": 2.102666387444215e-05, - "loss": 0.6766, - "step": 691 - }, - { - "epoch": 3.5670103092783503, - "grad_norm": 0.6552653908729553, - "learning_rate": 2.0887681703266453e-05, - "loss": 0.4334, - "step": 692 - }, - { - "epoch": 3.572164948453608, - "grad_norm": 0.6466341614723206, - "learning_rate": 2.0749039001355375e-05, - "loss": 0.6025, - "step": 693 - }, - { - "epoch": 3.5773195876288657, - "grad_norm": 0.7250998616218567, - "learning_rate": 2.061073738537635e-05, - "loss": 0.5688, - "step": 694 - }, - { - "epoch": 3.582474226804124, - "grad_norm": 0.7872217297554016, - "learning_rate": 2.0472778468019454e-05, - "loss": 0.5565, - "step": 695 - }, - { - "epoch": 3.5876288659793816, - "grad_norm": 0.6609041690826416, - "learning_rate": 2.0335163857978744e-05, - "loss": 0.4888, - "step": 696 - }, - { - "epoch": 3.5927835051546393, - "grad_norm": 0.9081669449806213, - "learning_rate": 2.019789515993336e-05, - "loss": 0.6244, - "step": 697 - }, - { - "epoch": 3.597938144329897, - "grad_norm": 0.7092580199241638, - "learning_rate": 2.0060973974528874e-05, - "loss": 0.4923, - "step": 698 - }, - { - "epoch": 3.6030927835051547, - "grad_norm": 0.7079344987869263, - "learning_rate": 1.992440189835869e-05, - "loss": 0.7202, - "step": 699 - }, - { - "epoch": 3.6082474226804124, - "grad_norm": 0.813388466835022, - "learning_rate": 1.9788180523945277e-05, - "loss": 0.5914, - "step": 700 - }, - { - "epoch": 3.61340206185567, - "grad_norm": 0.8015619516372681, - "learning_rate": 1.9652311439721764e-05, - "loss": 0.6758, - "step": 701 - }, - { - "epoch": 3.618556701030928, - "grad_norm": 0.6869595050811768, - "learning_rate": 1.9516796230013272e-05, - "loss": 0.5988, - "step": 702 - }, - { - "epoch": 3.6237113402061856, - "grad_norm": 0.6902271509170532, - "learning_rate": 1.9381636475018577e-05, - "loss": 0.5267, - "step": 703 - }, - { - "epoch": 3.6288659793814433, - "grad_norm": 0.6685857176780701, - "learning_rate": 1.9246833750791526e-05, - "loss": 0.4817, - "step": 704 - }, - { - "epoch": 3.634020618556701, - "grad_norm": 0.6419634222984314, - "learning_rate": 1.9112389629222823e-05, - "loss": 0.6316, - "step": 705 - }, - { - "epoch": 3.6391752577319587, - "grad_norm": 0.8290701508522034, - "learning_rate": 1.8978305678021595e-05, - "loss": 0.6548, - "step": 706 - }, - { - "epoch": 3.6443298969072164, - "grad_norm": 0.6211696267127991, - "learning_rate": 1.884458346069713e-05, - "loss": 0.5943, - "step": 707 - }, - { - "epoch": 3.649484536082474, - "grad_norm": 0.7741536498069763, - "learning_rate": 1.8711224536540678e-05, - "loss": 0.5249, - "step": 708 - }, - { - "epoch": 3.654639175257732, - "grad_norm": 0.7374269962310791, - "learning_rate": 1.857823046060722e-05, - "loss": 0.6726, - "step": 709 - }, - { - "epoch": 3.6597938144329896, - "grad_norm": 0.6983458995819092, - "learning_rate": 1.8445602783697374e-05, - "loss": 0.5565, - "step": 710 - }, - { - "epoch": 3.6649484536082473, - "grad_norm": 0.7474353313446045, - "learning_rate": 1.831334305233928e-05, - "loss": 0.5496, - "step": 711 - }, - { - "epoch": 3.670103092783505, - "grad_norm": 0.6772493720054626, - "learning_rate": 1.8181452808770637e-05, - "loss": 0.5827, - "step": 712 - }, - { - "epoch": 3.675257731958763, - "grad_norm": 0.6143507957458496, - "learning_rate": 1.804993359092059e-05, - "loss": 0.5753, - "step": 713 - }, - { - "epoch": 3.680412371134021, - "grad_norm": 0.6349582076072693, - "learning_rate": 1.7918786932391944e-05, - "loss": 0.5396, - "step": 714 - }, - { - "epoch": 3.6855670103092786, - "grad_norm": 0.7443505525588989, - "learning_rate": 1.778801436244319e-05, - "loss": 0.6715, - "step": 715 - }, - { - "epoch": 3.6907216494845363, - "grad_norm": 0.7070426940917969, - "learning_rate": 1.765761740597065e-05, - "loss": 0.6967, - "step": 716 - }, - { - "epoch": 3.695876288659794, - "grad_norm": 0.7999926805496216, - "learning_rate": 1.7527597583490822e-05, - "loss": 0.6274, - "step": 717 - }, - { - "epoch": 3.7010309278350517, - "grad_norm": 0.571897566318512, - "learning_rate": 1.739795641112248e-05, - "loss": 0.4957, - "step": 718 - }, - { - "epoch": 3.7061855670103094, - "grad_norm": 0.8360907435417175, - "learning_rate": 1.726869540056915e-05, - "loss": 0.6236, - "step": 719 - }, - { - "epoch": 3.711340206185567, - "grad_norm": 0.689716100692749, - "learning_rate": 1.713981605910137e-05, - "loss": 0.6277, - "step": 720 - }, - { - "epoch": 3.716494845360825, - "grad_norm": 0.7487063407897949, - "learning_rate": 1.70113198895392e-05, - "loss": 0.528, - "step": 721 - }, - { - "epoch": 3.7216494845360826, - "grad_norm": 0.84012770652771, - "learning_rate": 1.6883208390234628e-05, - "loss": 0.5405, - "step": 722 - }, - { - "epoch": 3.7268041237113403, - "grad_norm": 0.6260896921157837, - "learning_rate": 1.6755483055054105e-05, - "loss": 0.4328, - "step": 723 - }, - { - "epoch": 3.731958762886598, - "grad_norm": 0.8109961152076721, - "learning_rate": 1.662814537336122e-05, - "loss": 0.5839, - "step": 724 - }, - { - "epoch": 3.7371134020618557, - "grad_norm": 0.9204162955284119, - "learning_rate": 1.650119682999918e-05, - "loss": 0.7058, - "step": 725 - }, - { - "epoch": 3.7422680412371134, - "grad_norm": 0.5918567776679993, - "learning_rate": 1.6374638905273643e-05, - "loss": 0.4917, - "step": 726 - }, - { - "epoch": 3.747422680412371, - "grad_norm": 0.7899376749992371, - "learning_rate": 1.624847307493534e-05, - "loss": 0.5678, - "step": 727 - }, - { - "epoch": 3.752577319587629, - "grad_norm": 0.8229061961174011, - "learning_rate": 1.6122700810162966e-05, - "loss": 0.6471, - "step": 728 - }, - { - "epoch": 3.7577319587628866, - "grad_norm": 0.737942636013031, - "learning_rate": 1.5997323577545915e-05, - "loss": 0.6076, - "step": 729 - }, - { - "epoch": 3.7628865979381443, - "grad_norm": 0.7289573550224304, - "learning_rate": 1.5872342839067306e-05, - "loss": 0.5477, - "step": 730 - }, - { - "epoch": 3.768041237113402, - "grad_norm": 0.7133212685585022, - "learning_rate": 1.5747760052086803e-05, - "loss": 0.7274, - "step": 731 - }, - { - "epoch": 3.7731958762886597, - "grad_norm": 0.7502733469009399, - "learning_rate": 1.5623576669323743e-05, - "loss": 0.6026, - "step": 732 - }, - { - "epoch": 3.7783505154639174, - "grad_norm": 0.620846152305603, - "learning_rate": 1.5499794138840122e-05, - "loss": 0.5394, - "step": 733 - }, - { - "epoch": 3.783505154639175, - "grad_norm": 0.6744952201843262, - "learning_rate": 1.5376413904023722e-05, - "loss": 0.4315, - "step": 734 - }, - { - "epoch": 3.788659793814433, - "grad_norm": 0.6577085256576538, - "learning_rate": 1.525343740357128e-05, - "loss": 0.5313, - "step": 735 - }, - { - "epoch": 3.7938144329896906, - "grad_norm": 0.8090968132019043, - "learning_rate": 1.5130866071471717e-05, - "loss": 0.573, - "step": 736 - }, - { - "epoch": 3.7989690721649483, - "grad_norm": 0.8299245834350586, - "learning_rate": 1.500870133698945e-05, - "loss": 0.624, - "step": 737 - }, - { - "epoch": 3.804123711340206, - "grad_norm": 0.7030582427978516, - "learning_rate": 1.4886944624647647e-05, - "loss": 0.5036, - "step": 738 - }, - { - "epoch": 3.8092783505154637, - "grad_norm": 0.6806750297546387, - "learning_rate": 1.4765597354211713e-05, - "loss": 0.497, - "step": 739 - }, - { - "epoch": 3.8144329896907214, - "grad_norm": 0.7690914273262024, - "learning_rate": 1.4644660940672627e-05, - "loss": 0.5217, - "step": 740 - }, - { - "epoch": 3.819587628865979, - "grad_norm": 0.6966343522071838, - "learning_rate": 1.4524136794230547e-05, - "loss": 0.5966, - "step": 741 - }, - { - "epoch": 3.824742268041237, - "grad_norm": 0.6488124132156372, - "learning_rate": 1.4404026320278318e-05, - "loss": 0.454, - "step": 742 - }, - { - "epoch": 3.829896907216495, - "grad_norm": 0.7203002572059631, - "learning_rate": 1.4284330919385036e-05, - "loss": 0.6012, - "step": 743 - }, - { - "epoch": 3.8350515463917527, - "grad_norm": 0.8698861598968506, - "learning_rate": 1.4165051987279831e-05, - "loss": 0.7224, - "step": 744 - }, - { - "epoch": 3.8402061855670104, - "grad_norm": 0.7975336909294128, - "learning_rate": 1.404619091483546e-05, - "loss": 0.7323, - "step": 745 - }, - { - "epoch": 3.845360824742268, - "grad_norm": 0.697154700756073, - "learning_rate": 1.3927749088052217e-05, - "loss": 0.5375, - "step": 746 - }, - { - "epoch": 3.850515463917526, - "grad_norm": 0.857922375202179, - "learning_rate": 1.3809727888041668e-05, - "loss": 0.6702, - "step": 747 - }, - { - "epoch": 3.8556701030927836, - "grad_norm": 0.7249358296394348, - "learning_rate": 1.3692128691010592e-05, - "loss": 0.578, - "step": 748 - }, - { - "epoch": 3.8608247422680413, - "grad_norm": 1.0248481035232544, - "learning_rate": 1.3574952868244922e-05, - "loss": 0.6964, - "step": 749 - }, - { - "epoch": 3.865979381443299, - "grad_norm": 0.8700373768806458, - "learning_rate": 1.3458201786093794e-05, - "loss": 0.686, - "step": 750 - }, - { - "epoch": 3.8711340206185567, - "grad_norm": 0.7813423871994019, - "learning_rate": 1.334187680595358e-05, - "loss": 0.5817, - "step": 751 - }, - { - "epoch": 3.8762886597938144, - "grad_norm": 0.6536322832107544, - "learning_rate": 1.3225979284251954e-05, - "loss": 0.5981, - "step": 752 - }, - { - "epoch": 3.881443298969072, - "grad_norm": 0.8255348205566406, - "learning_rate": 1.3110510572432221e-05, - "loss": 0.7288, - "step": 753 - }, - { - "epoch": 3.88659793814433, - "grad_norm": 0.7284218668937683, - "learning_rate": 1.2995472016937404e-05, - "loss": 0.5522, - "step": 754 - }, - { - "epoch": 3.8917525773195876, - "grad_norm": 0.7565793395042419, - "learning_rate": 1.2880864959194665e-05, - "loss": 0.6455, - "step": 755 - }, - { - "epoch": 3.8969072164948453, - "grad_norm": 0.7215232253074646, - "learning_rate": 1.2766690735599568e-05, - "loss": 0.5919, - "step": 756 - }, - { - "epoch": 3.902061855670103, - "grad_norm": 0.5804430246353149, - "learning_rate": 1.2652950677500574e-05, - "loss": 0.5657, - "step": 757 - }, - { - "epoch": 3.9072164948453607, - "grad_norm": 0.6634606122970581, - "learning_rate": 1.253964611118345e-05, - "loss": 0.4888, - "step": 758 - }, - { - "epoch": 3.9123711340206184, - "grad_norm": 0.5982066988945007, - "learning_rate": 1.2426778357855873e-05, - "loss": 0.539, - "step": 759 - }, - { - "epoch": 3.917525773195876, - "grad_norm": 0.7859188914299011, - "learning_rate": 1.2314348733631959e-05, - "loss": 0.6816, - "step": 760 - }, - { - "epoch": 3.9226804123711343, - "grad_norm": 0.7802892327308655, - "learning_rate": 1.2202358549516923e-05, - "loss": 0.6872, - "step": 761 - }, - { - "epoch": 3.927835051546392, - "grad_norm": 0.699567437171936, - "learning_rate": 1.209080911139187e-05, - "loss": 0.5798, - "step": 762 - }, - { - "epoch": 3.9329896907216497, - "grad_norm": 0.777010440826416, - "learning_rate": 1.1979701719998453e-05, - "loss": 0.6204, - "step": 763 - }, - { - "epoch": 3.9381443298969074, - "grad_norm": 0.7714626789093018, - "learning_rate": 1.1869037670923815e-05, - "loss": 0.7335, - "step": 764 - }, - { - "epoch": 3.943298969072165, - "grad_norm": 0.7658986449241638, - "learning_rate": 1.1758818254585369e-05, - "loss": 0.6169, - "step": 765 - }, - { - "epoch": 3.948453608247423, - "grad_norm": 0.7642006278038025, - "learning_rate": 1.164904475621587e-05, - "loss": 0.6777, - "step": 766 - }, - { - "epoch": 3.9536082474226806, - "grad_norm": 0.8413475751876831, - "learning_rate": 1.1539718455848309e-05, - "loss": 0.4705, - "step": 767 - }, - { - "epoch": 3.9587628865979383, - "grad_norm": 0.7557904720306396, - "learning_rate": 1.1430840628301093e-05, - "loss": 0.5588, - "step": 768 - }, - { - "epoch": 3.963917525773196, - "grad_norm": 0.7613516449928284, - "learning_rate": 1.1322412543163135e-05, - "loss": 0.6615, - "step": 769 - }, - { - "epoch": 3.9690721649484537, - "grad_norm": 0.7262148261070251, - "learning_rate": 1.1214435464779006e-05, - "loss": 0.6986, - "step": 770 - }, - { - "epoch": 3.9742268041237114, - "grad_norm": 0.6863129734992981, - "learning_rate": 1.1106910652234276e-05, - "loss": 0.5653, - "step": 771 - }, - { - "epoch": 3.979381443298969, - "grad_norm": 0.7940157055854797, - "learning_rate": 1.099983935934077e-05, - "loss": 0.6754, - "step": 772 - }, - { - "epoch": 3.984536082474227, - "grad_norm": 0.8549270629882812, - "learning_rate": 1.089322283462197e-05, - "loss": 0.6918, - "step": 773 - }, - { - "epoch": 3.9896907216494846, - "grad_norm": 0.6270641088485718, - "learning_rate": 1.0787062321298442e-05, - "loss": 0.5433, - "step": 774 - }, - { - "epoch": 3.9948453608247423, - "grad_norm": 0.758388340473175, - "learning_rate": 1.0681359057273388e-05, - "loss": 0.6541, - "step": 775 - }, - { - "epoch": 4.0, - "grad_norm": 1.1503102779388428, - "learning_rate": 1.0576114275118131e-05, - "loss": 0.6927, - "step": 776 - }, - { - "epoch": 4.0, - "eval_loss": 0.7827529907226562, - "eval_runtime": 23.077, - "eval_samples_per_second": 7.107, - "eval_steps_per_second": 1.777, - "step": 776 - }, - { - "epoch": 4.005154639175258, - "grad_norm": 0.6961793899536133, - "learning_rate": 1.0471329202057823e-05, - "loss": 0.4523, - "step": 777 - }, - { - "epoch": 4.010309278350515, - "grad_norm": 0.5918589234352112, - "learning_rate": 1.0367005059957096e-05, - "loss": 0.4715, - "step": 778 - }, - { - "epoch": 4.015463917525773, - "grad_norm": 0.7088159322738647, - "learning_rate": 1.0263143065305769e-05, - "loss": 0.4329, - "step": 779 - }, - { - "epoch": 4.020618556701031, - "grad_norm": 0.6903660893440247, - "learning_rate": 1.0159744429204777e-05, - "loss": 0.5347, - "step": 780 - }, - { - "epoch": 4.025773195876289, - "grad_norm": 0.6392103433609009, - "learning_rate": 1.005681035735192e-05, - "loss": 0.5343, - "step": 781 - }, - { - "epoch": 4.030927835051546, - "grad_norm": 0.6996196508407593, - "learning_rate": 9.954342050027921e-06, - "loss": 0.5407, - "step": 782 - }, - { - "epoch": 4.036082474226804, - "grad_norm": 0.6675930619239807, - "learning_rate": 9.852340702082318e-06, - "loss": 0.4305, - "step": 783 - }, - { - "epoch": 4.041237113402062, - "grad_norm": 0.6556795239448547, - "learning_rate": 9.750807502919652e-06, - "loss": 0.5781, - "step": 784 - }, - { - "epoch": 4.046391752577319, - "grad_norm": 0.7473887205123901, - "learning_rate": 9.64974363648548e-06, - "loss": 0.733, - "step": 785 - }, - { - "epoch": 4.051546391752577, - "grad_norm": 0.774716317653656, - "learning_rate": 9.549150281252633e-06, - "loss": 0.5335, - "step": 786 - }, - { - "epoch": 4.056701030927835, - "grad_norm": 0.6763889193534851, - "learning_rate": 9.449028610207494e-06, - "loss": 0.5087, - "step": 787 - }, - { - "epoch": 4.061855670103093, - "grad_norm": 0.8375513553619385, - "learning_rate": 9.349379790836243e-06, - "loss": 0.6384, - "step": 788 - }, - { - "epoch": 4.06701030927835, - "grad_norm": 0.7478683590888977, - "learning_rate": 9.25020498511135e-06, - "loss": 0.6164, - "step": 789 - }, - { - "epoch": 4.072164948453608, - "grad_norm": 0.7629753947257996, - "learning_rate": 9.151505349477902e-06, - "loss": 0.5738, - "step": 790 - }, - { - "epoch": 4.077319587628866, - "grad_norm": 0.6709279417991638, - "learning_rate": 9.053282034840238e-06, - "loss": 0.5869, - "step": 791 - }, - { - "epoch": 4.082474226804123, - "grad_norm": 0.6376342177391052, - "learning_rate": 8.955536186548425e-06, - "loss": 0.5019, - "step": 792 - }, - { - "epoch": 4.087628865979381, - "grad_norm": 0.7179467082023621, - "learning_rate": 8.858268944384995e-06, - "loss": 0.5092, - "step": 793 - }, - { - "epoch": 4.092783505154639, - "grad_norm": 0.789688229560852, - "learning_rate": 8.761481442551573e-06, - "loss": 0.6966, - "step": 794 - }, - { - "epoch": 4.097938144329897, - "grad_norm": 0.8050559163093567, - "learning_rate": 8.665174809655708e-06, - "loss": 0.6005, - "step": 795 - }, - { - "epoch": 4.103092783505154, - "grad_norm": 0.7821665406227112, - "learning_rate": 8.569350168697704e-06, - "loss": 0.6106, - "step": 796 - }, - { - "epoch": 4.108247422680412, - "grad_norm": 0.778394341468811, - "learning_rate": 8.474008637057478e-06, - "loss": 0.5891, - "step": 797 - }, - { - "epoch": 4.11340206185567, - "grad_norm": 0.9033699035644531, - "learning_rate": 8.379151326481587e-06, - "loss": 0.5541, - "step": 798 - }, - { - "epoch": 4.118556701030927, - "grad_norm": 0.8777570724487305, - "learning_rate": 8.284779343070265e-06, - "loss": 0.5675, - "step": 799 - }, - { - "epoch": 4.123711340206185, - "grad_norm": 0.6416880488395691, - "learning_rate": 8.19089378726447e-06, - "loss": 0.4717, - "step": 800 - }, - { - "epoch": 4.128865979381443, - "grad_norm": 0.7612876296043396, - "learning_rate": 8.097495753833078e-06, - "loss": 0.6074, - "step": 801 - }, - { - "epoch": 4.134020618556701, - "grad_norm": 0.8172696828842163, - "learning_rate": 8.004586331860175e-06, - "loss": 0.6051, - "step": 802 - }, - { - "epoch": 4.139175257731959, - "grad_norm": 0.7682762145996094, - "learning_rate": 7.91216660473228e-06, - "loss": 0.5949, - "step": 803 - }, - { - "epoch": 4.144329896907217, - "grad_norm": 0.7049791812896729, - "learning_rate": 7.820237650125712e-06, - "loss": 0.6494, - "step": 804 - }, - { - "epoch": 4.149484536082475, - "grad_norm": 0.9258434176445007, - "learning_rate": 7.728800539994113e-06, - "loss": 0.7203, - "step": 805 - }, - { - "epoch": 4.154639175257732, - "grad_norm": 0.7941154837608337, - "learning_rate": 7.637856340555822e-06, - "loss": 0.5589, - "step": 806 - }, - { - "epoch": 4.15979381443299, - "grad_norm": 0.8220565319061279, - "learning_rate": 7.547406112281557e-06, - "loss": 0.5536, - "step": 807 - }, - { - "epoch": 4.164948453608248, - "grad_norm": 0.7357336282730103, - "learning_rate": 7.457450909881969e-06, - "loss": 0.5021, - "step": 808 - }, - { - "epoch": 4.170103092783505, - "grad_norm": 0.8894550800323486, - "learning_rate": 7.367991782295391e-06, - "loss": 0.4791, - "step": 809 - }, - { - "epoch": 4.175257731958763, - "grad_norm": 0.8629480004310608, - "learning_rate": 7.2790297726755716e-06, - "loss": 0.4588, - "step": 810 - }, - { - "epoch": 4.180412371134021, - "grad_norm": 0.8441846370697021, - "learning_rate": 7.190565918379549e-06, - "loss": 0.6378, - "step": 811 - }, - { - "epoch": 4.185567010309279, - "grad_norm": 0.8104022145271301, - "learning_rate": 7.1026012509555265e-06, - "loss": 0.5777, - "step": 812 - }, - { - "epoch": 4.190721649484536, - "grad_norm": 0.855922281742096, - "learning_rate": 7.015136796130828e-06, - "loss": 0.6161, - "step": 813 - }, - { - "epoch": 4.195876288659794, - "grad_norm": 0.7620590329170227, - "learning_rate": 6.928173573800006e-06, - "loss": 0.5267, - "step": 814 - }, - { - "epoch": 4.201030927835052, - "grad_norm": 0.8558603525161743, - "learning_rate": 6.8417125980128675e-06, - "loss": 0.6221, - "step": 815 - }, - { - "epoch": 4.206185567010309, - "grad_norm": 0.7612592577934265, - "learning_rate": 6.755754876962711e-06, - "loss": 0.5259, - "step": 816 - }, - { - "epoch": 4.211340206185567, - "grad_norm": 0.6571881771087646, - "learning_rate": 6.670301412974511e-06, - "loss": 0.4846, - "step": 817 - }, - { - "epoch": 4.216494845360825, - "grad_norm": 0.8638062477111816, - "learning_rate": 6.585353202493322e-06, - "loss": 0.5727, - "step": 818 - }, - { - "epoch": 4.221649484536083, - "grad_norm": 0.7832995653152466, - "learning_rate": 6.500911236072532e-06, - "loss": 0.6475, - "step": 819 - }, - { - "epoch": 4.22680412371134, - "grad_norm": 0.7984924912452698, - "learning_rate": 6.416976498362432e-06, - "loss": 0.4848, - "step": 820 - }, - { - "epoch": 4.231958762886598, - "grad_norm": 0.743242084980011, - "learning_rate": 6.333549968098684e-06, - "loss": 0.5729, - "step": 821 - }, - { - "epoch": 4.237113402061856, - "grad_norm": 0.8120692372322083, - "learning_rate": 6.250632618090868e-06, - "loss": 0.4599, - "step": 822 - }, - { - "epoch": 4.242268041237113, - "grad_norm": 0.8082096576690674, - "learning_rate": 6.168225415211226e-06, - "loss": 0.6201, - "step": 823 - }, - { - "epoch": 4.247422680412371, - "grad_norm": 0.8432015180587769, - "learning_rate": 6.0863293203833105e-06, - "loss": 0.6398, - "step": 824 - }, - { - "epoch": 4.252577319587629, - "grad_norm": 0.7766333818435669, - "learning_rate": 6.004945288570813e-06, - "loss": 0.517, - "step": 825 } ], "logging_steps": 1, @@ -5873,7 +279,7 @@ "attributes": {} } }, - "total_flos": 3.1905813118530355e+17, + "total_flos": 1.354811597389824e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null