{ "best_metric": 0.8083848357200623, "best_model_checkpoint": "./kaggle/working/eGTZANplus/checkpoint-220", "epoch": 20.0, "eval_steps": 10, "global_step": 1080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.19, "grad_norm": 1.4207828044891357, "learning_rate": 0.00019814814814814814, "loss": 2.4003, "step": 10 }, { "epoch": 0.19, "eval_accuracy": 0.19576719576719576, "eval_loss": 2.282846689224243, "eval_runtime": 1.8415, "eval_samples_per_second": 102.636, "eval_steps_per_second": 6.517, "step": 10 }, { "epoch": 0.37, "grad_norm": 1.2310184240341187, "learning_rate": 0.0001962962962962963, "loss": 2.1703, "step": 20 }, { "epoch": 0.37, "eval_accuracy": 0.35978835978835977, "eval_loss": 1.9852432012557983, "eval_runtime": 1.793, "eval_samples_per_second": 105.41, "eval_steps_per_second": 6.693, "step": 20 }, { "epoch": 0.56, "grad_norm": 2.0627870559692383, "learning_rate": 0.00019444444444444446, "loss": 1.9696, "step": 30 }, { "epoch": 0.56, "eval_accuracy": 0.3915343915343915, "eval_loss": 1.8232808113098145, "eval_runtime": 1.786, "eval_samples_per_second": 105.821, "eval_steps_per_second": 6.719, "step": 30 }, { "epoch": 0.74, "grad_norm": 1.5207180976867676, "learning_rate": 0.0001925925925925926, "loss": 1.8051, "step": 40 }, { "epoch": 0.74, "eval_accuracy": 0.48677248677248675, "eval_loss": 1.6591798067092896, "eval_runtime": 1.7501, "eval_samples_per_second": 107.997, "eval_steps_per_second": 6.857, "step": 40 }, { "epoch": 0.93, "grad_norm": 2.221734046936035, "learning_rate": 0.00019074074074074075, "loss": 1.6692, "step": 50 }, { "epoch": 0.93, "eval_accuracy": 0.582010582010582, "eval_loss": 1.5287415981292725, "eval_runtime": 1.7993, "eval_samples_per_second": 105.039, "eval_steps_per_second": 6.669, "step": 50 }, { "epoch": 1.11, "grad_norm": 1.5369292497634888, "learning_rate": 0.0001890740740740741, "loss": 1.5283, "step": 60 }, { "epoch": 1.11, "eval_accuracy": 0.5608465608465608, "eval_loss": 1.4252889156341553, "eval_runtime": 1.7582, "eval_samples_per_second": 107.493, "eval_steps_per_second": 6.825, "step": 60 }, { "epoch": 1.3, "grad_norm": 1.9959373474121094, "learning_rate": 0.00018722222222222222, "loss": 1.3981, "step": 70 }, { "epoch": 1.3, "eval_accuracy": 0.5925925925925926, "eval_loss": 1.3883891105651855, "eval_runtime": 1.7749, "eval_samples_per_second": 106.485, "eval_steps_per_second": 6.761, "step": 70 }, { "epoch": 1.48, "grad_norm": 2.101576805114746, "learning_rate": 0.00018537037037037038, "loss": 1.3047, "step": 80 }, { "epoch": 1.48, "eval_accuracy": 0.5767195767195767, "eval_loss": 1.356843113899231, "eval_runtime": 1.7875, "eval_samples_per_second": 105.735, "eval_steps_per_second": 6.713, "step": 80 }, { "epoch": 1.67, "grad_norm": 1.9240992069244385, "learning_rate": 0.00018351851851851854, "loss": 1.1325, "step": 90 }, { "epoch": 1.67, "eval_accuracy": 0.6349206349206349, "eval_loss": 1.2104465961456299, "eval_runtime": 1.7741, "eval_samples_per_second": 106.533, "eval_steps_per_second": 6.764, "step": 90 }, { "epoch": 1.85, "grad_norm": 1.6294556856155396, "learning_rate": 0.00018166666666666667, "loss": 1.2004, "step": 100 }, { "epoch": 1.85, "eval_accuracy": 0.6137566137566137, "eval_loss": 1.263272762298584, "eval_runtime": 1.8419, "eval_samples_per_second": 102.609, "eval_steps_per_second": 6.515, "step": 100 }, { "epoch": 2.04, "grad_norm": 4.842734336853027, "learning_rate": 0.0001798148148148148, "loss": 1.0475, "step": 110 }, { "epoch": 2.04, "eval_accuracy": 0.5555555555555556, "eval_loss": 1.3616496324539185, "eval_runtime": 1.7824, "eval_samples_per_second": 106.036, "eval_steps_per_second": 6.732, "step": 110 }, { "epoch": 2.22, "grad_norm": 1.8519538640975952, "learning_rate": 0.00017796296296296296, "loss": 0.9801, "step": 120 }, { "epoch": 2.22, "eval_accuracy": 0.671957671957672, "eval_loss": 1.1471754312515259, "eval_runtime": 1.796, "eval_samples_per_second": 105.234, "eval_steps_per_second": 6.682, "step": 120 }, { "epoch": 2.41, "grad_norm": 3.018026351928711, "learning_rate": 0.00017611111111111112, "loss": 0.862, "step": 130 }, { "epoch": 2.41, "eval_accuracy": 0.6984126984126984, "eval_loss": 1.0452642440795898, "eval_runtime": 1.7578, "eval_samples_per_second": 107.521, "eval_steps_per_second": 6.827, "step": 130 }, { "epoch": 2.59, "grad_norm": 2.8672127723693848, "learning_rate": 0.00017425925925925928, "loss": 0.8905, "step": 140 }, { "epoch": 2.59, "eval_accuracy": 0.6825396825396826, "eval_loss": 0.9718140363693237, "eval_runtime": 1.8323, "eval_samples_per_second": 103.148, "eval_steps_per_second": 6.549, "step": 140 }, { "epoch": 2.78, "grad_norm": 3.5106003284454346, "learning_rate": 0.00017240740740740742, "loss": 0.7839, "step": 150 }, { "epoch": 2.78, "eval_accuracy": 0.6666666666666666, "eval_loss": 1.0531541109085083, "eval_runtime": 1.7655, "eval_samples_per_second": 107.049, "eval_steps_per_second": 6.797, "step": 150 }, { "epoch": 2.96, "grad_norm": 2.7532589435577393, "learning_rate": 0.00017055555555555555, "loss": 0.8304, "step": 160 }, { "epoch": 2.96, "eval_accuracy": 0.6878306878306878, "eval_loss": 0.96842360496521, "eval_runtime": 1.8371, "eval_samples_per_second": 102.881, "eval_steps_per_second": 6.532, "step": 160 }, { "epoch": 3.15, "grad_norm": 2.1222331523895264, "learning_rate": 0.0001687037037037037, "loss": 0.883, "step": 170 }, { "epoch": 3.15, "eval_accuracy": 0.6931216931216931, "eval_loss": 0.9298208951950073, "eval_runtime": 1.7867, "eval_samples_per_second": 105.782, "eval_steps_per_second": 6.716, "step": 170 }, { "epoch": 3.33, "grad_norm": 2.5858914852142334, "learning_rate": 0.00016685185185185187, "loss": 0.5714, "step": 180 }, { "epoch": 3.33, "eval_accuracy": 0.6772486772486772, "eval_loss": 0.9491019248962402, "eval_runtime": 1.7856, "eval_samples_per_second": 105.846, "eval_steps_per_second": 6.72, "step": 180 }, { "epoch": 3.52, "grad_norm": 1.7296024560928345, "learning_rate": 0.000165, "loss": 0.5209, "step": 190 }, { "epoch": 3.52, "eval_accuracy": 0.6984126984126984, "eval_loss": 0.914806604385376, "eval_runtime": 1.7453, "eval_samples_per_second": 108.289, "eval_steps_per_second": 6.875, "step": 190 }, { "epoch": 3.7, "grad_norm": 4.235101699829102, "learning_rate": 0.00016314814814814816, "loss": 0.5404, "step": 200 }, { "epoch": 3.7, "eval_accuracy": 0.671957671957672, "eval_loss": 1.0290465354919434, "eval_runtime": 1.8123, "eval_samples_per_second": 104.285, "eval_steps_per_second": 6.621, "step": 200 }, { "epoch": 3.89, "grad_norm": 3.8817615509033203, "learning_rate": 0.0001612962962962963, "loss": 0.6133, "step": 210 }, { "epoch": 3.89, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.9116460680961609, "eval_runtime": 1.7735, "eval_samples_per_second": 106.57, "eval_steps_per_second": 6.766, "step": 210 }, { "epoch": 4.07, "grad_norm": 1.743445634841919, "learning_rate": 0.00015944444444444445, "loss": 0.4347, "step": 220 }, { "epoch": 4.07, "eval_accuracy": 0.7354497354497355, "eval_loss": 0.8083848357200623, "eval_runtime": 1.8193, "eval_samples_per_second": 103.884, "eval_steps_per_second": 6.596, "step": 220 }, { "epoch": 4.26, "grad_norm": 1.8867310285568237, "learning_rate": 0.0001575925925925926, "loss": 0.3659, "step": 230 }, { "epoch": 4.26, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.890904426574707, "eval_runtime": 1.7392, "eval_samples_per_second": 108.672, "eval_steps_per_second": 6.9, "step": 230 }, { "epoch": 4.44, "grad_norm": 2.56878399848938, "learning_rate": 0.00015574074074074074, "loss": 0.4439, "step": 240 }, { "epoch": 4.44, "eval_accuracy": 0.6825396825396826, "eval_loss": 0.9554860591888428, "eval_runtime": 1.7559, "eval_samples_per_second": 107.64, "eval_steps_per_second": 6.834, "step": 240 }, { "epoch": 4.63, "grad_norm": 1.9487425088882446, "learning_rate": 0.0001538888888888889, "loss": 0.3335, "step": 250 }, { "epoch": 4.63, "eval_accuracy": 0.708994708994709, "eval_loss": 0.931969404220581, "eval_runtime": 1.8636, "eval_samples_per_second": 101.417, "eval_steps_per_second": 6.439, "step": 250 }, { "epoch": 4.81, "grad_norm": 2.4911906719207764, "learning_rate": 0.00015203703703703703, "loss": 0.3695, "step": 260 }, { "epoch": 4.81, "eval_accuracy": 0.7037037037037037, "eval_loss": 0.9643996357917786, "eval_runtime": 1.743, "eval_samples_per_second": 108.437, "eval_steps_per_second": 6.885, "step": 260 }, { "epoch": 5.0, "grad_norm": 0.4799601137638092, "learning_rate": 0.0001501851851851852, "loss": 0.3018, "step": 270 }, { "epoch": 5.0, "eval_accuracy": 0.6455026455026455, "eval_loss": 1.1127641201019287, "eval_runtime": 1.8057, "eval_samples_per_second": 104.667, "eval_steps_per_second": 6.646, "step": 270 }, { "epoch": 5.19, "grad_norm": 0.8545930981636047, "learning_rate": 0.00014833333333333335, "loss": 0.2418, "step": 280 }, { "epoch": 5.19, "eval_accuracy": 0.7301587301587301, "eval_loss": 0.8752605319023132, "eval_runtime": 1.7714, "eval_samples_per_second": 106.698, "eval_steps_per_second": 6.774, "step": 280 }, { "epoch": 5.37, "grad_norm": 2.0490822792053223, "learning_rate": 0.00014648148148148148, "loss": 0.2305, "step": 290 }, { "epoch": 5.37, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.9517038464546204, "eval_runtime": 1.7422, "eval_samples_per_second": 108.483, "eval_steps_per_second": 6.888, "step": 290 }, { "epoch": 5.56, "grad_norm": 1.5348315238952637, "learning_rate": 0.00014462962962962962, "loss": 0.238, "step": 300 }, { "epoch": 5.56, "eval_accuracy": 0.7248677248677249, "eval_loss": 0.9478802680969238, "eval_runtime": 1.7999, "eval_samples_per_second": 105.006, "eval_steps_per_second": 6.667, "step": 300 }, { "epoch": 5.74, "grad_norm": 2.6169273853302, "learning_rate": 0.00014277777777777778, "loss": 0.2099, "step": 310 }, { "epoch": 5.74, "eval_accuracy": 0.671957671957672, "eval_loss": 1.103389024734497, "eval_runtime": 1.8453, "eval_samples_per_second": 102.42, "eval_steps_per_second": 6.503, "step": 310 }, { "epoch": 5.93, "grad_norm": 2.5781023502349854, "learning_rate": 0.00014092592592592594, "loss": 0.2284, "step": 320 }, { "epoch": 5.93, "eval_accuracy": 0.6825396825396826, "eval_loss": 1.031624674797058, "eval_runtime": 1.7579, "eval_samples_per_second": 107.517, "eval_steps_per_second": 6.826, "step": 320 }, { "epoch": 6.11, "grad_norm": 3.042239189147949, "learning_rate": 0.0001390740740740741, "loss": 0.1694, "step": 330 }, { "epoch": 6.11, "eval_accuracy": 0.6613756613756614, "eval_loss": 1.1174468994140625, "eval_runtime": 1.7854, "eval_samples_per_second": 105.856, "eval_steps_per_second": 6.721, "step": 330 }, { "epoch": 6.3, "grad_norm": 0.8211657404899597, "learning_rate": 0.00013722222222222223, "loss": 0.1715, "step": 340 }, { "epoch": 6.3, "eval_accuracy": 0.6772486772486772, "eval_loss": 1.1067023277282715, "eval_runtime": 1.8157, "eval_samples_per_second": 104.091, "eval_steps_per_second": 6.609, "step": 340 }, { "epoch": 6.48, "grad_norm": 1.9425742626190186, "learning_rate": 0.00013537037037037036, "loss": 0.123, "step": 350 }, { "epoch": 6.48, "eval_accuracy": 0.7142857142857143, "eval_loss": 1.0037899017333984, "eval_runtime": 1.786, "eval_samples_per_second": 105.821, "eval_steps_per_second": 6.719, "step": 350 }, { "epoch": 6.67, "grad_norm": 2.7061989307403564, "learning_rate": 0.00013351851851851852, "loss": 0.1297, "step": 360 }, { "epoch": 6.67, "eval_accuracy": 0.6772486772486772, "eval_loss": 1.1142699718475342, "eval_runtime": 1.7368, "eval_samples_per_second": 108.818, "eval_steps_per_second": 6.909, "step": 360 }, { "epoch": 6.85, "grad_norm": 2.478459358215332, "learning_rate": 0.00013166666666666668, "loss": 0.2191, "step": 370 }, { "epoch": 6.85, "eval_accuracy": 0.7354497354497355, "eval_loss": 0.9896882176399231, "eval_runtime": 1.7802, "eval_samples_per_second": 106.167, "eval_steps_per_second": 6.741, "step": 370 }, { "epoch": 7.04, "grad_norm": 1.6921576261520386, "learning_rate": 0.0001298148148148148, "loss": 0.1206, "step": 380 }, { "epoch": 7.04, "eval_accuracy": 0.7407407407407407, "eval_loss": 0.962655782699585, "eval_runtime": 1.7667, "eval_samples_per_second": 106.982, "eval_steps_per_second": 6.793, "step": 380 }, { "epoch": 7.22, "grad_norm": 0.8060858845710754, "learning_rate": 0.00012796296296296297, "loss": 0.1071, "step": 390 }, { "epoch": 7.22, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.0495431423187256, "eval_runtime": 1.7473, "eval_samples_per_second": 108.168, "eval_steps_per_second": 6.868, "step": 390 }, { "epoch": 7.41, "grad_norm": 0.38671812415122986, "learning_rate": 0.0001261111111111111, "loss": 0.1102, "step": 400 }, { "epoch": 7.41, "eval_accuracy": 0.7301587301587301, "eval_loss": 1.0441887378692627, "eval_runtime": 1.7747, "eval_samples_per_second": 106.496, "eval_steps_per_second": 6.762, "step": 400 }, { "epoch": 7.59, "grad_norm": 1.2801034450531006, "learning_rate": 0.0001242592592592593, "loss": 0.1269, "step": 410 }, { "epoch": 7.59, "eval_accuracy": 0.7407407407407407, "eval_loss": 1.0281165838241577, "eval_runtime": 1.811, "eval_samples_per_second": 104.363, "eval_steps_per_second": 6.626, "step": 410 }, { "epoch": 7.78, "grad_norm": 0.92644864320755, "learning_rate": 0.00012240740740740742, "loss": 0.0694, "step": 420 }, { "epoch": 7.78, "eval_accuracy": 0.7354497354497355, "eval_loss": 1.0361741781234741, "eval_runtime": 1.7423, "eval_samples_per_second": 108.479, "eval_steps_per_second": 6.888, "step": 420 }, { "epoch": 7.96, "grad_norm": 0.8203582167625427, "learning_rate": 0.00012055555555555555, "loss": 0.0548, "step": 430 }, { "epoch": 7.96, "eval_accuracy": 0.746031746031746, "eval_loss": 1.071204423904419, "eval_runtime": 1.7384, "eval_samples_per_second": 108.723, "eval_steps_per_second": 6.903, "step": 430 }, { "epoch": 8.15, "grad_norm": 4.490820407867432, "learning_rate": 0.00011870370370370371, "loss": 0.062, "step": 440 }, { "epoch": 8.15, "eval_accuracy": 0.7301587301587301, "eval_loss": 1.035632610321045, "eval_runtime": 1.8141, "eval_samples_per_second": 104.182, "eval_steps_per_second": 6.615, "step": 440 }, { "epoch": 8.33, "grad_norm": 1.979749321937561, "learning_rate": 0.00011685185185185186, "loss": 0.0542, "step": 450 }, { "epoch": 8.33, "eval_accuracy": 0.6984126984126984, "eval_loss": 1.2573037147521973, "eval_runtime": 1.7824, "eval_samples_per_second": 106.034, "eval_steps_per_second": 6.732, "step": 450 }, { "epoch": 8.52, "grad_norm": 4.157647609710693, "learning_rate": 0.00011499999999999999, "loss": 0.0823, "step": 460 }, { "epoch": 8.52, "eval_accuracy": 0.7195767195767195, "eval_loss": 1.1037700176239014, "eval_runtime": 1.7489, "eval_samples_per_second": 108.066, "eval_steps_per_second": 6.861, "step": 460 }, { "epoch": 8.7, "grad_norm": 0.08767159283161163, "learning_rate": 0.00011314814814814816, "loss": 0.1354, "step": 470 }, { "epoch": 8.7, "eval_accuracy": 0.7407407407407407, "eval_loss": 1.0803223848342896, "eval_runtime": 1.7889, "eval_samples_per_second": 105.654, "eval_steps_per_second": 6.708, "step": 470 }, { "epoch": 8.89, "grad_norm": 0.6974061131477356, "learning_rate": 0.0001112962962962963, "loss": 0.0798, "step": 480 }, { "epoch": 8.89, "eval_accuracy": 0.671957671957672, "eval_loss": 1.2207469940185547, "eval_runtime": 1.7456, "eval_samples_per_second": 108.27, "eval_steps_per_second": 6.874, "step": 480 }, { "epoch": 9.07, "grad_norm": 2.0027213096618652, "learning_rate": 0.00010944444444444445, "loss": 0.0963, "step": 490 }, { "epoch": 9.07, "eval_accuracy": 0.656084656084656, "eval_loss": 1.337466835975647, "eval_runtime": 1.7654, "eval_samples_per_second": 107.06, "eval_steps_per_second": 6.797, "step": 490 }, { "epoch": 9.26, "grad_norm": 0.14471650123596191, "learning_rate": 0.0001075925925925926, "loss": 0.0557, "step": 500 }, { "epoch": 9.26, "eval_accuracy": 0.6984126984126984, "eval_loss": 1.2044044733047485, "eval_runtime": 1.9948, "eval_samples_per_second": 94.745, "eval_steps_per_second": 6.016, "step": 500 }, { "epoch": 9.44, "grad_norm": 0.07393530756235123, "learning_rate": 0.00010574074074074075, "loss": 0.0491, "step": 510 }, { "epoch": 9.44, "eval_accuracy": 0.7248677248677249, "eval_loss": 1.18802809715271, "eval_runtime": 1.8204, "eval_samples_per_second": 103.822, "eval_steps_per_second": 6.592, "step": 510 }, { "epoch": 9.63, "grad_norm": 0.12744389474391937, "learning_rate": 0.0001038888888888889, "loss": 0.0502, "step": 520 }, { "epoch": 9.63, "eval_accuracy": 0.746031746031746, "eval_loss": 1.098527193069458, "eval_runtime": 1.7601, "eval_samples_per_second": 107.378, "eval_steps_per_second": 6.818, "step": 520 }, { "epoch": 9.81, "grad_norm": 0.07471567392349243, "learning_rate": 0.00010203703703703704, "loss": 0.0396, "step": 530 }, { "epoch": 9.81, "eval_accuracy": 0.708994708994709, "eval_loss": 1.214396595954895, "eval_runtime": 1.7884, "eval_samples_per_second": 105.68, "eval_steps_per_second": 6.71, "step": 530 }, { "epoch": 10.0, "grad_norm": 0.16710619628429413, "learning_rate": 0.00010018518518518518, "loss": 0.0717, "step": 540 }, { "epoch": 10.0, "eval_accuracy": 0.7037037037037037, "eval_loss": 1.2163357734680176, "eval_runtime": 1.7401, "eval_samples_per_second": 108.615, "eval_steps_per_second": 6.896, "step": 540 }, { "epoch": 10.19, "grad_norm": 0.07553374022245407, "learning_rate": 9.833333333333333e-05, "loss": 0.0279, "step": 550 }, { "epoch": 10.19, "eval_accuracy": 0.7142857142857143, "eval_loss": 1.119241714477539, "eval_runtime": 1.766, "eval_samples_per_second": 107.023, "eval_steps_per_second": 6.795, "step": 550 }, { "epoch": 10.37, "grad_norm": 0.07353632897138596, "learning_rate": 9.648148148148149e-05, "loss": 0.0329, "step": 560 }, { "epoch": 10.37, "eval_accuracy": 0.7354497354497355, "eval_loss": 1.1961112022399902, "eval_runtime": 1.8216, "eval_samples_per_second": 103.758, "eval_steps_per_second": 6.588, "step": 560 }, { "epoch": 10.56, "grad_norm": 0.5441647171974182, "learning_rate": 9.462962962962963e-05, "loss": 0.028, "step": 570 }, { "epoch": 10.56, "eval_accuracy": 0.6984126984126984, "eval_loss": 1.1282387971878052, "eval_runtime": 1.7883, "eval_samples_per_second": 105.689, "eval_steps_per_second": 6.71, "step": 570 }, { "epoch": 10.74, "grad_norm": 0.07243653386831284, "learning_rate": 9.277777777777778e-05, "loss": 0.0373, "step": 580 }, { "epoch": 10.74, "eval_accuracy": 0.7195767195767195, "eval_loss": 1.0716224908828735, "eval_runtime": 1.736, "eval_samples_per_second": 108.873, "eval_steps_per_second": 6.913, "step": 580 }, { "epoch": 10.93, "grad_norm": 0.04851379618048668, "learning_rate": 9.092592592592593e-05, "loss": 0.0368, "step": 590 }, { "epoch": 10.93, "eval_accuracy": 0.7142857142857143, "eval_loss": 1.1750774383544922, "eval_runtime": 1.7848, "eval_samples_per_second": 105.895, "eval_steps_per_second": 6.723, "step": 590 }, { "epoch": 11.11, "grad_norm": 0.05160636082291603, "learning_rate": 8.907407407407407e-05, "loss": 0.0485, "step": 600 }, { "epoch": 11.11, "eval_accuracy": 0.7354497354497355, "eval_loss": 1.0984432697296143, "eval_runtime": 1.7772, "eval_samples_per_second": 106.345, "eval_steps_per_second": 6.752, "step": 600 }, { "epoch": 11.3, "grad_norm": 0.054380565881729126, "learning_rate": 8.722222222222223e-05, "loss": 0.0234, "step": 610 }, { "epoch": 11.3, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0418734550476074, "eval_runtime": 1.7977, "eval_samples_per_second": 105.132, "eval_steps_per_second": 6.675, "step": 610 }, { "epoch": 11.48, "grad_norm": 0.32195061445236206, "learning_rate": 8.537037037037038e-05, "loss": 0.028, "step": 620 }, { "epoch": 11.48, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0536975860595703, "eval_runtime": 1.7586, "eval_samples_per_second": 107.47, "eval_steps_per_second": 6.823, "step": 620 }, { "epoch": 11.67, "grad_norm": 1.8460614681243896, "learning_rate": 8.351851851851852e-05, "loss": 0.0237, "step": 630 }, { "epoch": 11.67, "eval_accuracy": 0.746031746031746, "eval_loss": 1.0571786165237427, "eval_runtime": 1.7901, "eval_samples_per_second": 105.578, "eval_steps_per_second": 6.703, "step": 630 }, { "epoch": 11.85, "grad_norm": 1.7614848613739014, "learning_rate": 8.166666666666667e-05, "loss": 0.0198, "step": 640 }, { "epoch": 11.85, "eval_accuracy": 0.746031746031746, "eval_loss": 1.0192136764526367, "eval_runtime": 1.7683, "eval_samples_per_second": 106.885, "eval_steps_per_second": 6.786, "step": 640 }, { "epoch": 12.04, "grad_norm": 0.22871683537960052, "learning_rate": 7.981481481481481e-05, "loss": 0.02, "step": 650 }, { "epoch": 12.04, "eval_accuracy": 0.7195767195767195, "eval_loss": 1.244175672531128, "eval_runtime": 1.8603, "eval_samples_per_second": 101.595, "eval_steps_per_second": 6.45, "step": 650 }, { "epoch": 12.22, "grad_norm": 0.03752712532877922, "learning_rate": 7.796296296296297e-05, "loss": 0.0216, "step": 660 }, { "epoch": 12.22, "eval_accuracy": 0.7407407407407407, "eval_loss": 1.1395213603973389, "eval_runtime": 1.7992, "eval_samples_per_second": 105.048, "eval_steps_per_second": 6.67, "step": 660 }, { "epoch": 12.41, "grad_norm": 0.09251418709754944, "learning_rate": 7.61111111111111e-05, "loss": 0.0309, "step": 670 }, { "epoch": 12.41, "eval_accuracy": 0.7354497354497355, "eval_loss": 1.1767151355743408, "eval_runtime": 1.8204, "eval_samples_per_second": 103.823, "eval_steps_per_second": 6.592, "step": 670 }, { "epoch": 12.59, "grad_norm": 0.03858701139688492, "learning_rate": 7.425925925925927e-05, "loss": 0.0315, "step": 680 }, { "epoch": 12.59, "eval_accuracy": 0.7248677248677249, "eval_loss": 1.1881897449493408, "eval_runtime": 1.7853, "eval_samples_per_second": 105.862, "eval_steps_per_second": 6.721, "step": 680 }, { "epoch": 12.78, "grad_norm": 0.2986956536769867, "learning_rate": 7.240740740740741e-05, "loss": 0.017, "step": 690 }, { "epoch": 12.78, "eval_accuracy": 0.7354497354497355, "eval_loss": 1.1652072668075562, "eval_runtime": 1.8006, "eval_samples_per_second": 104.965, "eval_steps_per_second": 6.664, "step": 690 }, { "epoch": 12.96, "grad_norm": 0.23789283633232117, "learning_rate": 7.055555555555556e-05, "loss": 0.02, "step": 700 }, { "epoch": 12.96, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.1011323928833008, "eval_runtime": 1.7393, "eval_samples_per_second": 108.665, "eval_steps_per_second": 6.899, "step": 700 }, { "epoch": 13.15, "grad_norm": 0.0361974723637104, "learning_rate": 6.87037037037037e-05, "loss": 0.0174, "step": 710 }, { "epoch": 13.15, "eval_accuracy": 0.7354497354497355, "eval_loss": 1.092558741569519, "eval_runtime": 1.8005, "eval_samples_per_second": 104.97, "eval_steps_per_second": 6.665, "step": 710 }, { "epoch": 13.33, "grad_norm": 0.04739515110850334, "learning_rate": 6.685185185185185e-05, "loss": 0.012, "step": 720 }, { "epoch": 13.33, "eval_accuracy": 0.746031746031746, "eval_loss": 1.0852241516113281, "eval_runtime": 1.787, "eval_samples_per_second": 105.766, "eval_steps_per_second": 6.715, "step": 720 }, { "epoch": 13.52, "grad_norm": 0.035977743566036224, "learning_rate": 6.500000000000001e-05, "loss": 0.0296, "step": 730 }, { "epoch": 13.52, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.0534002780914307, "eval_runtime": 1.7706, "eval_samples_per_second": 106.746, "eval_steps_per_second": 6.778, "step": 730 }, { "epoch": 13.7, "grad_norm": 0.3354228436946869, "learning_rate": 6.314814814814815e-05, "loss": 0.0142, "step": 740 }, { "epoch": 13.7, "eval_accuracy": 0.746031746031746, "eval_loss": 1.0607830286026, "eval_runtime": 1.8039, "eval_samples_per_second": 104.775, "eval_steps_per_second": 6.652, "step": 740 }, { "epoch": 13.89, "grad_norm": 0.031177503988146782, "learning_rate": 6.12962962962963e-05, "loss": 0.0199, "step": 750 }, { "epoch": 13.89, "eval_accuracy": 0.746031746031746, "eval_loss": 1.0850036144256592, "eval_runtime": 1.7472, "eval_samples_per_second": 108.174, "eval_steps_per_second": 6.868, "step": 750 }, { "epoch": 14.07, "grad_norm": 0.2141834944486618, "learning_rate": 5.9444444444444445e-05, "loss": 0.0169, "step": 760 }, { "epoch": 14.07, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0736693143844604, "eval_runtime": 1.7821, "eval_samples_per_second": 106.054, "eval_steps_per_second": 6.734, "step": 760 }, { "epoch": 14.26, "grad_norm": 0.028399189934134483, "learning_rate": 5.75925925925926e-05, "loss": 0.0139, "step": 770 }, { "epoch": 14.26, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0717233419418335, "eval_runtime": 1.8135, "eval_samples_per_second": 104.221, "eval_steps_per_second": 6.617, "step": 770 }, { "epoch": 14.44, "grad_norm": 0.03289506584405899, "learning_rate": 5.574074074074075e-05, "loss": 0.0173, "step": 780 }, { "epoch": 14.44, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0707134008407593, "eval_runtime": 1.7856, "eval_samples_per_second": 105.849, "eval_steps_per_second": 6.721, "step": 780 }, { "epoch": 14.63, "grad_norm": 0.032911308109760284, "learning_rate": 5.388888888888889e-05, "loss": 0.0101, "step": 790 }, { "epoch": 14.63, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.070402979850769, "eval_runtime": 1.7933, "eval_samples_per_second": 105.391, "eval_steps_per_second": 6.691, "step": 790 }, { "epoch": 14.81, "grad_norm": 0.43361806869506836, "learning_rate": 5.203703703703704e-05, "loss": 0.0286, "step": 800 }, { "epoch": 14.81, "eval_accuracy": 0.7671957671957672, "eval_loss": 1.0845017433166504, "eval_runtime": 1.7994, "eval_samples_per_second": 105.033, "eval_steps_per_second": 6.669, "step": 800 }, { "epoch": 15.0, "grad_norm": 0.05939367786049843, "learning_rate": 5.018518518518519e-05, "loss": 0.0135, "step": 810 }, { "epoch": 15.0, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.0972745418548584, "eval_runtime": 1.7785, "eval_samples_per_second": 106.271, "eval_steps_per_second": 6.747, "step": 810 }, { "epoch": 15.19, "grad_norm": 0.030746394768357277, "learning_rate": 4.8333333333333334e-05, "loss": 0.0129, "step": 820 }, { "epoch": 15.19, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.0909744501113892, "eval_runtime": 1.7304, "eval_samples_per_second": 109.222, "eval_steps_per_second": 6.935, "step": 820 }, { "epoch": 15.37, "grad_norm": 0.026390748098492622, "learning_rate": 4.648148148148148e-05, "loss": 0.0117, "step": 830 }, { "epoch": 15.37, "eval_accuracy": 0.7671957671957672, "eval_loss": 1.0890551805496216, "eval_runtime": 1.8164, "eval_samples_per_second": 104.051, "eval_steps_per_second": 6.606, "step": 830 }, { "epoch": 15.56, "grad_norm": 0.028341053053736687, "learning_rate": 4.462962962962963e-05, "loss": 0.014, "step": 840 }, { "epoch": 15.56, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0884122848510742, "eval_runtime": 1.8336, "eval_samples_per_second": 103.079, "eval_steps_per_second": 6.545, "step": 840 }, { "epoch": 15.74, "grad_norm": 0.027172435075044632, "learning_rate": 4.277777777777778e-05, "loss": 0.0093, "step": 850 }, { "epoch": 15.74, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.0879539251327515, "eval_runtime": 1.7368, "eval_samples_per_second": 108.818, "eval_steps_per_second": 6.909, "step": 850 }, { "epoch": 15.93, "grad_norm": 0.4558853209018707, "learning_rate": 4.092592592592593e-05, "loss": 0.0264, "step": 860 }, { "epoch": 15.93, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0861279964447021, "eval_runtime": 1.8295, "eval_samples_per_second": 103.306, "eval_steps_per_second": 6.559, "step": 860 }, { "epoch": 16.11, "grad_norm": 0.023086287081241608, "learning_rate": 3.9074074074074076e-05, "loss": 0.0117, "step": 870 }, { "epoch": 16.11, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.0812128782272339, "eval_runtime": 1.783, "eval_samples_per_second": 106.0, "eval_steps_per_second": 6.73, "step": 870 }, { "epoch": 16.3, "grad_norm": 0.16555258631706238, "learning_rate": 3.722222222222222e-05, "loss": 0.0131, "step": 880 }, { "epoch": 16.3, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.084083080291748, "eval_runtime": 1.7979, "eval_samples_per_second": 105.125, "eval_steps_per_second": 6.675, "step": 880 }, { "epoch": 16.48, "grad_norm": 0.1985342651605606, "learning_rate": 3.537037037037037e-05, "loss": 0.0107, "step": 890 }, { "epoch": 16.48, "eval_accuracy": 0.7513227513227513, "eval_loss": 1.0908081531524658, "eval_runtime": 1.8371, "eval_samples_per_second": 102.877, "eval_steps_per_second": 6.532, "step": 890 }, { "epoch": 16.67, "grad_norm": 0.023619532585144043, "learning_rate": 3.351851851851852e-05, "loss": 0.0253, "step": 900 }, { "epoch": 16.67, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0818437337875366, "eval_runtime": 1.8128, "eval_samples_per_second": 104.258, "eval_steps_per_second": 6.62, "step": 900 }, { "epoch": 16.85, "grad_norm": 0.031866107136011124, "learning_rate": 3.1666666666666666e-05, "loss": 0.0113, "step": 910 }, { "epoch": 16.85, "eval_accuracy": 0.7671957671957672, "eval_loss": 1.0804176330566406, "eval_runtime": 1.7557, "eval_samples_per_second": 107.647, "eval_steps_per_second": 6.835, "step": 910 }, { "epoch": 17.04, "grad_norm": 0.027054764330387115, "learning_rate": 2.981481481481482e-05, "loss": 0.0117, "step": 920 }, { "epoch": 17.04, "eval_accuracy": 0.7671957671957672, "eval_loss": 1.0813896656036377, "eval_runtime": 1.8358, "eval_samples_per_second": 102.952, "eval_steps_per_second": 6.537, "step": 920 }, { "epoch": 17.22, "grad_norm": 0.025050414726138115, "learning_rate": 2.7962962962962965e-05, "loss": 0.0158, "step": 930 }, { "epoch": 17.22, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0813225507736206, "eval_runtime": 1.7643, "eval_samples_per_second": 107.126, "eval_steps_per_second": 6.802, "step": 930 }, { "epoch": 17.41, "grad_norm": 0.024830004200339317, "learning_rate": 2.6111111111111114e-05, "loss": 0.011, "step": 940 }, { "epoch": 17.41, "eval_accuracy": 0.7671957671957672, "eval_loss": 1.080676794052124, "eval_runtime": 1.759, "eval_samples_per_second": 107.45, "eval_steps_per_second": 6.822, "step": 940 }, { "epoch": 17.59, "grad_norm": 0.024760620668530464, "learning_rate": 2.425925925925926e-05, "loss": 0.0137, "step": 950 }, { "epoch": 17.59, "eval_accuracy": 0.7671957671957672, "eval_loss": 1.0803221464157104, "eval_runtime": 1.7971, "eval_samples_per_second": 105.168, "eval_steps_per_second": 6.677, "step": 950 }, { "epoch": 17.78, "grad_norm": 0.025229470804333687, "learning_rate": 2.240740740740741e-05, "loss": 0.0112, "step": 960 }, { "epoch": 17.78, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0807117223739624, "eval_runtime": 1.7675, "eval_samples_per_second": 106.93, "eval_steps_per_second": 6.789, "step": 960 }, { "epoch": 17.96, "grad_norm": 0.02313585951924324, "learning_rate": 2.0555555555555555e-05, "loss": 0.0172, "step": 970 }, { "epoch": 17.96, "eval_accuracy": 0.7566137566137566, "eval_loss": 1.0821946859359741, "eval_runtime": 1.8179, "eval_samples_per_second": 103.964, "eval_steps_per_second": 6.601, "step": 970 }, { "epoch": 18.15, "grad_norm": 0.024956317618489265, "learning_rate": 1.8703703703703704e-05, "loss": 0.0132, "step": 980 }, { "epoch": 18.15, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0860090255737305, "eval_runtime": 1.7729, "eval_samples_per_second": 106.607, "eval_steps_per_second": 6.769, "step": 980 }, { "epoch": 18.33, "grad_norm": 0.02182234823703766, "learning_rate": 1.6851851851851853e-05, "loss": 0.0127, "step": 990 }, { "epoch": 18.33, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0875723361968994, "eval_runtime": 1.7863, "eval_samples_per_second": 105.804, "eval_steps_per_second": 6.718, "step": 990 }, { "epoch": 18.52, "grad_norm": 0.024420464411377907, "learning_rate": 1.5e-05, "loss": 0.0152, "step": 1000 }, { "epoch": 18.52, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0873754024505615, "eval_runtime": 1.7723, "eval_samples_per_second": 106.644, "eval_steps_per_second": 6.771, "step": 1000 }, { "epoch": 18.7, "grad_norm": 0.18311668932437897, "learning_rate": 1.3148148148148148e-05, "loss": 0.0096, "step": 1010 }, { "epoch": 18.7, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.088024377822876, "eval_runtime": 1.8979, "eval_samples_per_second": 99.583, "eval_steps_per_second": 6.323, "step": 1010 }, { "epoch": 18.89, "grad_norm": 0.023139068856835365, "learning_rate": 1.1296296296296297e-05, "loss": 0.0107, "step": 1020 }, { "epoch": 18.89, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.08987557888031, "eval_runtime": 1.8132, "eval_samples_per_second": 104.237, "eval_steps_per_second": 6.618, "step": 1020 }, { "epoch": 19.07, "grad_norm": 0.024323537945747375, "learning_rate": 9.444444444444445e-06, "loss": 0.0124, "step": 1030 }, { "epoch": 19.07, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0899451971054077, "eval_runtime": 1.7841, "eval_samples_per_second": 105.934, "eval_steps_per_second": 6.726, "step": 1030 }, { "epoch": 19.26, "grad_norm": 0.20473988354206085, "learning_rate": 7.592592592592593e-06, "loss": 0.0187, "step": 1040 }, { "epoch": 19.26, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0915277004241943, "eval_runtime": 1.7828, "eval_samples_per_second": 106.015, "eval_steps_per_second": 6.731, "step": 1040 }, { "epoch": 19.44, "grad_norm": 0.021954894065856934, "learning_rate": 5.740740740740741e-06, "loss": 0.0159, "step": 1050 }, { "epoch": 19.44, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0916674137115479, "eval_runtime": 1.7554, "eval_samples_per_second": 107.665, "eval_steps_per_second": 6.836, "step": 1050 }, { "epoch": 19.63, "grad_norm": 0.02447775937616825, "learning_rate": 3.888888888888889e-06, "loss": 0.0107, "step": 1060 }, { "epoch": 19.63, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.091030240058899, "eval_runtime": 1.7566, "eval_samples_per_second": 107.597, "eval_steps_per_second": 6.832, "step": 1060 }, { "epoch": 19.81, "grad_norm": 0.02190612629055977, "learning_rate": 2.0370370370370375e-06, "loss": 0.0105, "step": 1070 }, { "epoch": 19.81, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0911825895309448, "eval_runtime": 1.7879, "eval_samples_per_second": 105.71, "eval_steps_per_second": 6.712, "step": 1070 }, { "epoch": 20.0, "grad_norm": 0.0392175130546093, "learning_rate": 1.851851851851852e-07, "loss": 0.0076, "step": 1080 }, { "epoch": 20.0, "eval_accuracy": 0.7619047619047619, "eval_loss": 1.0909953117370605, "eval_runtime": 1.7898, "eval_samples_per_second": 105.596, "eval_steps_per_second": 6.704, "step": 1080 }, { "epoch": 20.0, "step": 1080, "total_flos": 2.681093741830963e+18, "train_loss": 0.29336747460895113, "train_runtime": 795.2059, "train_samples_per_second": 42.681, "train_steps_per_second": 1.358 } ], "logging_steps": 10, "max_steps": 1080, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10, "total_flos": 2.681093741830963e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }