{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24991175432403812, "eval_steps": 500, "global_step": 531, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004706436051300153, "grad_norm": 0.6595008969306946, "learning_rate": 2.5000000000000004e-07, "loss": 9.6515, "step": 1 }, { "epoch": 0.0009412872102600306, "grad_norm": 0.7391405701637268, "learning_rate": 5.000000000000001e-07, "loss": 9.5434, "step": 2 }, { "epoch": 0.0014119308153900459, "grad_norm": 0.8721428513526917, "learning_rate": 7.5e-07, "loss": 9.0645, "step": 3 }, { "epoch": 0.0018825744205200612, "grad_norm": 0.9540417790412903, "learning_rate": 1.0000000000000002e-06, "loss": 9.0978, "step": 4 }, { "epoch": 0.0023532180256500765, "grad_norm": 1.0068703889846802, "learning_rate": 1.25e-06, "loss": 8.8096, "step": 5 }, { "epoch": 0.0028238616307800918, "grad_norm": 0.7046281695365906, "learning_rate": 1.5e-06, "loss": 9.5863, "step": 6 }, { "epoch": 0.003294505235910107, "grad_norm": 1.027761459350586, "learning_rate": 1.7500000000000002e-06, "loss": 9.3746, "step": 7 }, { "epoch": 0.0037651488410401224, "grad_norm": 0.7785173058509827, "learning_rate": 2.0000000000000003e-06, "loss": 9.1443, "step": 8 }, { "epoch": 0.004235792446170138, "grad_norm": 0.8485608696937561, "learning_rate": 2.25e-06, "loss": 9.2293, "step": 9 }, { "epoch": 0.004706436051300153, "grad_norm": 0.8275871872901917, "learning_rate": 2.5e-06, "loss": 8.7838, "step": 10 }, { "epoch": 0.005177079656430168, "grad_norm": 0.5895422101020813, "learning_rate": 2.7500000000000004e-06, "loss": 9.5706, "step": 11 }, { "epoch": 0.0056477232615601836, "grad_norm": 0.9113247394561768, "learning_rate": 3e-06, "loss": 8.9198, "step": 12 }, { "epoch": 0.006118366866690199, "grad_norm": 0.7459664940834045, "learning_rate": 3.2500000000000002e-06, "loss": 9.2372, "step": 13 }, { "epoch": 0.006589010471820214, "grad_norm": 0.6556370854377747, "learning_rate": 3.5000000000000004e-06, "loss": 9.1809, "step": 14 }, { "epoch": 0.0070596540769502295, "grad_norm": 0.719078540802002, "learning_rate": 3.75e-06, "loss": 9.2422, "step": 15 }, { "epoch": 0.007530297682080245, "grad_norm": 0.8138344287872314, "learning_rate": 4.000000000000001e-06, "loss": 9.271, "step": 16 }, { "epoch": 0.00800094128721026, "grad_norm": 0.7246189713478088, "learning_rate": 4.250000000000001e-06, "loss": 9.5405, "step": 17 }, { "epoch": 0.008471584892340275, "grad_norm": 0.8132815361022949, "learning_rate": 4.5e-06, "loss": 9.7983, "step": 18 }, { "epoch": 0.00894222849747029, "grad_norm": 0.5946951508522034, "learning_rate": 4.75e-06, "loss": 9.733, "step": 19 }, { "epoch": 0.009412872102600306, "grad_norm": 0.5157704949378967, "learning_rate": 5e-06, "loss": 9.6086, "step": 20 }, { "epoch": 0.009883515707730321, "grad_norm": 0.5629891157150269, "learning_rate": 5.25e-06, "loss": 9.2102, "step": 21 }, { "epoch": 0.010354159312860337, "grad_norm": 0.48590287566185, "learning_rate": 5.500000000000001e-06, "loss": 9.7732, "step": 22 }, { "epoch": 0.010824802917990352, "grad_norm": 0.5960127711296082, "learning_rate": 5.750000000000001e-06, "loss": 9.3421, "step": 23 }, { "epoch": 0.011295446523120367, "grad_norm": 0.48235076665878296, "learning_rate": 6e-06, "loss": 9.5374, "step": 24 }, { "epoch": 0.011766090128250382, "grad_norm": 0.4856416881084442, "learning_rate": 6.25e-06, "loss": 9.2162, "step": 25 }, { "epoch": 0.012236733733380398, "grad_norm": 0.45604783296585083, "learning_rate": 6.5000000000000004e-06, "loss": 9.3802, "step": 26 }, { "epoch": 0.012707377338510413, "grad_norm": 0.4940997064113617, "learning_rate": 6.750000000000001e-06, "loss": 8.9352, "step": 27 }, { "epoch": 0.013178020943640428, "grad_norm": 0.5067102909088135, "learning_rate": 7.000000000000001e-06, "loss": 9.6871, "step": 28 }, { "epoch": 0.013648664548770444, "grad_norm": 0.5070438385009766, "learning_rate": 7.25e-06, "loss": 9.1244, "step": 29 }, { "epoch": 0.014119308153900459, "grad_norm": 0.47256559133529663, "learning_rate": 7.5e-06, "loss": 9.6139, "step": 30 }, { "epoch": 0.014589951759030474, "grad_norm": 0.6668869853019714, "learning_rate": 7.75e-06, "loss": 8.9173, "step": 31 }, { "epoch": 0.01506059536416049, "grad_norm": 0.7926103472709656, "learning_rate": 8.000000000000001e-06, "loss": 8.8604, "step": 32 }, { "epoch": 0.015531238969290505, "grad_norm": 0.4389215409755707, "learning_rate": 8.25e-06, "loss": 9.42, "step": 33 }, { "epoch": 0.01600188257442052, "grad_norm": 0.527125895023346, "learning_rate": 8.500000000000002e-06, "loss": 9.5552, "step": 34 }, { "epoch": 0.016472526179550535, "grad_norm": 0.5376142263412476, "learning_rate": 8.75e-06, "loss": 9.1412, "step": 35 }, { "epoch": 0.01694316978468055, "grad_norm": 0.4762144386768341, "learning_rate": 9e-06, "loss": 9.2153, "step": 36 }, { "epoch": 0.017413813389810566, "grad_norm": 0.46567338705062866, "learning_rate": 9.25e-06, "loss": 9.3836, "step": 37 }, { "epoch": 0.01788445699494058, "grad_norm": 0.4322827458381653, "learning_rate": 9.5e-06, "loss": 8.9984, "step": 38 }, { "epoch": 0.018355100600070597, "grad_norm": 0.42570286989212036, "learning_rate": 9.750000000000002e-06, "loss": 9.0916, "step": 39 }, { "epoch": 0.018825744205200612, "grad_norm": 0.43363815546035767, "learning_rate": 1e-05, "loss": 9.0663, "step": 40 }, { "epoch": 0.019296387810330627, "grad_norm": 0.3969482481479645, "learning_rate": 1.025e-05, "loss": 9.4064, "step": 41 }, { "epoch": 0.019767031415460642, "grad_norm": 0.4335750639438629, "learning_rate": 1.05e-05, "loss": 9.262, "step": 42 }, { "epoch": 0.020237675020590658, "grad_norm": 0.4210178852081299, "learning_rate": 1.075e-05, "loss": 9.4898, "step": 43 }, { "epoch": 0.020708318625720673, "grad_norm": 0.39311668276786804, "learning_rate": 1.1000000000000001e-05, "loss": 9.7063, "step": 44 }, { "epoch": 0.02117896223085069, "grad_norm": 0.39521753787994385, "learning_rate": 1.125e-05, "loss": 9.3065, "step": 45 }, { "epoch": 0.021649605835980704, "grad_norm": 0.42978909611701965, "learning_rate": 1.1500000000000002e-05, "loss": 8.9722, "step": 46 }, { "epoch": 0.02212024944111072, "grad_norm": 0.47351160645484924, "learning_rate": 1.175e-05, "loss": 8.9028, "step": 47 }, { "epoch": 0.022590893046240734, "grad_norm": 0.4192260801792145, "learning_rate": 1.2e-05, "loss": 8.913, "step": 48 }, { "epoch": 0.02306153665137075, "grad_norm": 0.42306703329086304, "learning_rate": 1.225e-05, "loss": 9.3223, "step": 49 }, { "epoch": 0.023532180256500765, "grad_norm": 0.40158239006996155, "learning_rate": 1.25e-05, "loss": 9.5922, "step": 50 }, { "epoch": 0.02400282386163078, "grad_norm": 0.5165021419525146, "learning_rate": 1.2750000000000002e-05, "loss": 9.24, "step": 51 }, { "epoch": 0.024473467466760795, "grad_norm": 0.3930136263370514, "learning_rate": 1.3000000000000001e-05, "loss": 8.7955, "step": 52 }, { "epoch": 0.02494411107189081, "grad_norm": 0.3975488543510437, "learning_rate": 1.3250000000000002e-05, "loss": 8.7474, "step": 53 }, { "epoch": 0.025414754677020826, "grad_norm": 0.46201732754707336, "learning_rate": 1.3500000000000001e-05, "loss": 9.1239, "step": 54 }, { "epoch": 0.02588539828215084, "grad_norm": 0.42599615454673767, "learning_rate": 1.3750000000000002e-05, "loss": 9.2889, "step": 55 }, { "epoch": 0.026356041887280857, "grad_norm": 0.3889259994029999, "learning_rate": 1.4000000000000001e-05, "loss": 9.5315, "step": 56 }, { "epoch": 0.026826685492410872, "grad_norm": 0.3762259781360626, "learning_rate": 1.4249999999999999e-05, "loss": 9.3968, "step": 57 }, { "epoch": 0.027297329097540887, "grad_norm": 0.4486519396305084, "learning_rate": 1.45e-05, "loss": 9.2345, "step": 58 }, { "epoch": 0.027767972702670903, "grad_norm": 0.43613263964653015, "learning_rate": 1.475e-05, "loss": 9.293, "step": 59 }, { "epoch": 0.028238616307800918, "grad_norm": 0.40770891308784485, "learning_rate": 1.5e-05, "loss": 8.9544, "step": 60 }, { "epoch": 0.028709259912930933, "grad_norm": 0.36603429913520813, "learning_rate": 1.525e-05, "loss": 9.5768, "step": 61 }, { "epoch": 0.02917990351806095, "grad_norm": 0.41165047883987427, "learning_rate": 1.55e-05, "loss": 9.0203, "step": 62 }, { "epoch": 0.029650547123190964, "grad_norm": 0.4514125883579254, "learning_rate": 1.575e-05, "loss": 9.2653, "step": 63 }, { "epoch": 0.03012119072832098, "grad_norm": 0.41333243250846863, "learning_rate": 1.6000000000000003e-05, "loss": 8.9577, "step": 64 }, { "epoch": 0.030591834333450994, "grad_norm": 0.42950087785720825, "learning_rate": 1.6250000000000002e-05, "loss": 9.341, "step": 65 }, { "epoch": 0.03106247793858101, "grad_norm": 0.4158640205860138, "learning_rate": 1.65e-05, "loss": 9.6118, "step": 66 }, { "epoch": 0.031533121543711025, "grad_norm": 0.39954355359077454, "learning_rate": 1.675e-05, "loss": 9.0818, "step": 67 }, { "epoch": 0.03200376514884104, "grad_norm": 0.38233450055122375, "learning_rate": 1.7000000000000003e-05, "loss": 9.3953, "step": 68 }, { "epoch": 0.032474408753971055, "grad_norm": 0.37950408458709717, "learning_rate": 1.725e-05, "loss": 9.3594, "step": 69 }, { "epoch": 0.03294505235910107, "grad_norm": 0.475953608751297, "learning_rate": 1.75e-05, "loss": 9.0956, "step": 70 }, { "epoch": 0.033415695964231086, "grad_norm": 0.4252181947231293, "learning_rate": 1.775e-05, "loss": 9.1928, "step": 71 }, { "epoch": 0.0338863395693611, "grad_norm": 0.3946019411087036, "learning_rate": 1.8e-05, "loss": 9.1933, "step": 72 }, { "epoch": 0.03435698317449112, "grad_norm": 0.4342809021472931, "learning_rate": 1.825e-05, "loss": 9.2859, "step": 73 }, { "epoch": 0.03482762677962113, "grad_norm": 0.3921419084072113, "learning_rate": 1.85e-05, "loss": 9.1214, "step": 74 }, { "epoch": 0.03529827038475115, "grad_norm": 0.3992595374584198, "learning_rate": 1.8750000000000002e-05, "loss": 9.332, "step": 75 }, { "epoch": 0.03576891398988116, "grad_norm": 0.40269696712493896, "learning_rate": 1.9e-05, "loss": 9.4244, "step": 76 }, { "epoch": 0.03623955759501118, "grad_norm": 0.41852205991744995, "learning_rate": 1.925e-05, "loss": 9.3765, "step": 77 }, { "epoch": 0.03671020120014119, "grad_norm": 0.5162649750709534, "learning_rate": 1.9500000000000003e-05, "loss": 8.3471, "step": 78 }, { "epoch": 0.03718084480527121, "grad_norm": 0.4802299737930298, "learning_rate": 1.9750000000000002e-05, "loss": 9.3251, "step": 79 }, { "epoch": 0.037651488410401224, "grad_norm": 0.4261873960494995, "learning_rate": 2e-05, "loss": 9.5181, "step": 80 }, { "epoch": 0.03812213201553124, "grad_norm": 0.4193435311317444, "learning_rate": 2.025e-05, "loss": 9.4217, "step": 81 }, { "epoch": 0.038592775620661254, "grad_norm": 0.4148464798927307, "learning_rate": 2.05e-05, "loss": 8.7618, "step": 82 }, { "epoch": 0.03906341922579127, "grad_norm": 0.4396406412124634, "learning_rate": 2.075e-05, "loss": 9.4059, "step": 83 }, { "epoch": 0.039534062830921285, "grad_norm": 0.43215858936309814, "learning_rate": 2.1e-05, "loss": 9.0061, "step": 84 }, { "epoch": 0.0400047064360513, "grad_norm": 0.4347785711288452, "learning_rate": 2.125e-05, "loss": 8.5384, "step": 85 }, { "epoch": 0.040475350041181316, "grad_norm": 0.47068068385124207, "learning_rate": 2.15e-05, "loss": 9.2299, "step": 86 }, { "epoch": 0.04094599364631133, "grad_norm": 0.44863706827163696, "learning_rate": 2.175e-05, "loss": 8.7932, "step": 87 }, { "epoch": 0.041416637251441346, "grad_norm": 0.4525277316570282, "learning_rate": 2.2000000000000003e-05, "loss": 9.1699, "step": 88 }, { "epoch": 0.04188728085657136, "grad_norm": 0.41207849979400635, "learning_rate": 2.2250000000000002e-05, "loss": 9.4979, "step": 89 }, { "epoch": 0.04235792446170138, "grad_norm": 0.4179534912109375, "learning_rate": 2.25e-05, "loss": 9.1519, "step": 90 }, { "epoch": 0.04282856806683139, "grad_norm": 0.472789466381073, "learning_rate": 2.275e-05, "loss": 9.1048, "step": 91 }, { "epoch": 0.04329921167196141, "grad_norm": 0.44435739517211914, "learning_rate": 2.3000000000000003e-05, "loss": 9.2816, "step": 92 }, { "epoch": 0.04376985527709142, "grad_norm": 0.41012299060821533, "learning_rate": 2.3250000000000003e-05, "loss": 9.4546, "step": 93 }, { "epoch": 0.04424049888222144, "grad_norm": 0.4100490212440491, "learning_rate": 2.35e-05, "loss": 9.4397, "step": 94 }, { "epoch": 0.04471114248735145, "grad_norm": 0.4229314923286438, "learning_rate": 2.375e-05, "loss": 8.9033, "step": 95 }, { "epoch": 0.04518178609248147, "grad_norm": 0.39841172099113464, "learning_rate": 2.4e-05, "loss": 9.3391, "step": 96 }, { "epoch": 0.045652429697611484, "grad_norm": 0.4041540324687958, "learning_rate": 2.425e-05, "loss": 9.3347, "step": 97 }, { "epoch": 0.0461230733027415, "grad_norm": 0.4046013653278351, "learning_rate": 2.45e-05, "loss": 9.4645, "step": 98 }, { "epoch": 0.046593716907871514, "grad_norm": 0.3989504277706146, "learning_rate": 2.4750000000000002e-05, "loss": 9.2343, "step": 99 }, { "epoch": 0.04706436051300153, "grad_norm": 0.41768062114715576, "learning_rate": 2.5e-05, "loss": 9.6114, "step": 100 }, { "epoch": 0.047535004118131545, "grad_norm": 0.4360901713371277, "learning_rate": 2.525e-05, "loss": 9.3584, "step": 101 }, { "epoch": 0.04800564772326156, "grad_norm": 0.5093626976013184, "learning_rate": 2.5500000000000003e-05, "loss": 9.3969, "step": 102 }, { "epoch": 0.048476291328391576, "grad_norm": 0.5148160457611084, "learning_rate": 2.5750000000000002e-05, "loss": 9.3607, "step": 103 }, { "epoch": 0.04894693493352159, "grad_norm": 0.4556065797805786, "learning_rate": 2.6000000000000002e-05, "loss": 8.6494, "step": 104 }, { "epoch": 0.049417578538651606, "grad_norm": 0.48136287927627563, "learning_rate": 2.625e-05, "loss": 8.8816, "step": 105 }, { "epoch": 0.04988822214378162, "grad_norm": 0.4007977247238159, "learning_rate": 2.6500000000000004e-05, "loss": 9.0173, "step": 106 }, { "epoch": 0.05035886574891164, "grad_norm": 0.5088827610015869, "learning_rate": 2.6750000000000003e-05, "loss": 9.4898, "step": 107 }, { "epoch": 0.05082950935404165, "grad_norm": 0.4222247898578644, "learning_rate": 2.7000000000000002e-05, "loss": 9.5039, "step": 108 }, { "epoch": 0.05130015295917167, "grad_norm": 0.42676958441734314, "learning_rate": 2.725e-05, "loss": 9.3007, "step": 109 }, { "epoch": 0.05177079656430168, "grad_norm": 0.4315201938152313, "learning_rate": 2.7500000000000004e-05, "loss": 9.1473, "step": 110 }, { "epoch": 0.0522414401694317, "grad_norm": 0.5586130619049072, "learning_rate": 2.7750000000000004e-05, "loss": 9.486, "step": 111 }, { "epoch": 0.05271208377456171, "grad_norm": 0.4153185486793518, "learning_rate": 2.8000000000000003e-05, "loss": 9.2632, "step": 112 }, { "epoch": 0.05318272737969173, "grad_norm": 0.47736650705337524, "learning_rate": 2.825e-05, "loss": 8.9582, "step": 113 }, { "epoch": 0.053653370984821744, "grad_norm": 0.4127710163593292, "learning_rate": 2.8499999999999998e-05, "loss": 9.3019, "step": 114 }, { "epoch": 0.05412401458995176, "grad_norm": 0.44509121775627136, "learning_rate": 2.8749999999999997e-05, "loss": 9.1081, "step": 115 }, { "epoch": 0.054594658195081774, "grad_norm": 0.4519471526145935, "learning_rate": 2.9e-05, "loss": 9.4795, "step": 116 }, { "epoch": 0.05506530180021179, "grad_norm": 0.4292161464691162, "learning_rate": 2.925e-05, "loss": 9.2027, "step": 117 }, { "epoch": 0.055535945405341805, "grad_norm": 0.46465009450912476, "learning_rate": 2.95e-05, "loss": 9.081, "step": 118 }, { "epoch": 0.05600658901047182, "grad_norm": 0.4395250976085663, "learning_rate": 2.975e-05, "loss": 9.4345, "step": 119 }, { "epoch": 0.056477232615601836, "grad_norm": 0.4673008918762207, "learning_rate": 3e-05, "loss": 9.3435, "step": 120 }, { "epoch": 0.05694787622073185, "grad_norm": 0.4328051209449768, "learning_rate": 3.025e-05, "loss": 8.7147, "step": 121 }, { "epoch": 0.057418519825861866, "grad_norm": 0.444002240896225, "learning_rate": 3.05e-05, "loss": 8.8049, "step": 122 }, { "epoch": 0.05788916343099188, "grad_norm": 0.4078370928764343, "learning_rate": 3.075e-05, "loss": 9.1032, "step": 123 }, { "epoch": 0.0583598070361219, "grad_norm": 0.4445233941078186, "learning_rate": 3.1e-05, "loss": 9.279, "step": 124 }, { "epoch": 0.05883045064125191, "grad_norm": 0.4282757639884949, "learning_rate": 3.125e-05, "loss": 9.4163, "step": 125 }, { "epoch": 0.05930109424638193, "grad_norm": 0.41878628730773926, "learning_rate": 3.15e-05, "loss": 8.9876, "step": 126 }, { "epoch": 0.05977173785151194, "grad_norm": 0.6357080340385437, "learning_rate": 3.175e-05, "loss": 8.4245, "step": 127 }, { "epoch": 0.06024238145664196, "grad_norm": 0.4595104455947876, "learning_rate": 3.2000000000000005e-05, "loss": 9.1227, "step": 128 }, { "epoch": 0.06071302506177197, "grad_norm": 1.0947221517562866, "learning_rate": 3.2250000000000005e-05, "loss": 8.6819, "step": 129 }, { "epoch": 0.06118366866690199, "grad_norm": 0.43211594223976135, "learning_rate": 3.2500000000000004e-05, "loss": 9.1862, "step": 130 }, { "epoch": 0.061654312272032004, "grad_norm": 0.4080043137073517, "learning_rate": 3.275e-05, "loss": 9.0489, "step": 131 }, { "epoch": 0.06212495587716202, "grad_norm": 0.48265427350997925, "learning_rate": 3.3e-05, "loss": 9.257, "step": 132 }, { "epoch": 0.06259559948229203, "grad_norm": 0.45756152272224426, "learning_rate": 3.325e-05, "loss": 8.9598, "step": 133 }, { "epoch": 0.06306624308742205, "grad_norm": 0.3848661780357361, "learning_rate": 3.35e-05, "loss": 9.5542, "step": 134 }, { "epoch": 0.06353688669255206, "grad_norm": 0.43142908811569214, "learning_rate": 3.375000000000001e-05, "loss": 9.0434, "step": 135 }, { "epoch": 0.06400753029768208, "grad_norm": 0.39845573902130127, "learning_rate": 3.4000000000000007e-05, "loss": 9.7228, "step": 136 }, { "epoch": 0.06447817390281209, "grad_norm": 0.4854653775691986, "learning_rate": 3.4250000000000006e-05, "loss": 8.9226, "step": 137 }, { "epoch": 0.06494881750794211, "grad_norm": 0.41691291332244873, "learning_rate": 3.45e-05, "loss": 9.4588, "step": 138 }, { "epoch": 0.06541946111307212, "grad_norm": 0.41709139943122864, "learning_rate": 3.475e-05, "loss": 8.9146, "step": 139 }, { "epoch": 0.06589010471820214, "grad_norm": 0.3843998312950134, "learning_rate": 3.5e-05, "loss": 8.9889, "step": 140 }, { "epoch": 0.06636074832333215, "grad_norm": 0.4418933391571045, "learning_rate": 3.525e-05, "loss": 9.3688, "step": 141 }, { "epoch": 0.06683139192846217, "grad_norm": 0.3844826817512512, "learning_rate": 3.55e-05, "loss": 9.2518, "step": 142 }, { "epoch": 0.06730203553359218, "grad_norm": 0.4951348900794983, "learning_rate": 3.575e-05, "loss": 8.9785, "step": 143 }, { "epoch": 0.0677726791387222, "grad_norm": 0.475685179233551, "learning_rate": 3.6e-05, "loss": 9.0013, "step": 144 }, { "epoch": 0.06824332274385221, "grad_norm": 0.5578158497810364, "learning_rate": 3.625e-05, "loss": 8.9177, "step": 145 }, { "epoch": 0.06871396634898223, "grad_norm": 0.6955916881561279, "learning_rate": 3.65e-05, "loss": 8.9298, "step": 146 }, { "epoch": 0.06918460995411224, "grad_norm": 0.4071875810623169, "learning_rate": 3.675e-05, "loss": 9.1422, "step": 147 }, { "epoch": 0.06965525355924226, "grad_norm": 0.49543336033821106, "learning_rate": 3.7e-05, "loss": 9.4138, "step": 148 }, { "epoch": 0.07012589716437227, "grad_norm": 0.4391457438468933, "learning_rate": 3.7250000000000004e-05, "loss": 9.3566, "step": 149 }, { "epoch": 0.0705965407695023, "grad_norm": 0.4311358630657196, "learning_rate": 3.7500000000000003e-05, "loss": 8.6678, "step": 150 }, { "epoch": 0.0710671843746323, "grad_norm": 0.4233754873275757, "learning_rate": 3.775e-05, "loss": 8.9541, "step": 151 }, { "epoch": 0.07153782797976233, "grad_norm": 0.4653347432613373, "learning_rate": 3.8e-05, "loss": 8.953, "step": 152 }, { "epoch": 0.07200847158489233, "grad_norm": 0.4828343689441681, "learning_rate": 3.825e-05, "loss": 8.9577, "step": 153 }, { "epoch": 0.07247911519002236, "grad_norm": 0.43757960200309753, "learning_rate": 3.85e-05, "loss": 9.2349, "step": 154 }, { "epoch": 0.07294975879515236, "grad_norm": 0.4094442129135132, "learning_rate": 3.875e-05, "loss": 9.424, "step": 155 }, { "epoch": 0.07342040240028239, "grad_norm": 0.536808967590332, "learning_rate": 3.9000000000000006e-05, "loss": 8.9437, "step": 156 }, { "epoch": 0.0738910460054124, "grad_norm": 0.4084169268608093, "learning_rate": 3.9250000000000005e-05, "loss": 9.5204, "step": 157 }, { "epoch": 0.07436168961054242, "grad_norm": 0.4906410574913025, "learning_rate": 3.9500000000000005e-05, "loss": 9.0682, "step": 158 }, { "epoch": 0.07483233321567243, "grad_norm": 0.42850637435913086, "learning_rate": 3.9750000000000004e-05, "loss": 9.0241, "step": 159 }, { "epoch": 0.07530297682080245, "grad_norm": 0.3832900822162628, "learning_rate": 4e-05, "loss": 9.4956, "step": 160 }, { "epoch": 0.07577362042593246, "grad_norm": 0.39132505655288696, "learning_rate": 4.025e-05, "loss": 9.4623, "step": 161 }, { "epoch": 0.07624426403106248, "grad_norm": 0.44959893822669983, "learning_rate": 4.05e-05, "loss": 9.0518, "step": 162 }, { "epoch": 0.07671490763619249, "grad_norm": 0.41552799940109253, "learning_rate": 4.075e-05, "loss": 9.1268, "step": 163 }, { "epoch": 0.07718555124132251, "grad_norm": 0.42259296774864197, "learning_rate": 4.1e-05, "loss": 9.1533, "step": 164 }, { "epoch": 0.07765619484645252, "grad_norm": 0.4441682994365692, "learning_rate": 4.125e-05, "loss": 8.7568, "step": 165 }, { "epoch": 0.07812683845158254, "grad_norm": 0.42241615056991577, "learning_rate": 4.15e-05, "loss": 9.3366, "step": 166 }, { "epoch": 0.07859748205671255, "grad_norm": 0.3997664153575897, "learning_rate": 4.175e-05, "loss": 8.855, "step": 167 }, { "epoch": 0.07906812566184257, "grad_norm": 0.4293980300426483, "learning_rate": 4.2e-05, "loss": 8.9744, "step": 168 }, { "epoch": 0.07953876926697258, "grad_norm": 0.4279899001121521, "learning_rate": 4.2250000000000004e-05, "loss": 9.0692, "step": 169 }, { "epoch": 0.0800094128721026, "grad_norm": 0.4207955002784729, "learning_rate": 4.25e-05, "loss": 8.8506, "step": 170 }, { "epoch": 0.08048005647723261, "grad_norm": 0.41057008504867554, "learning_rate": 4.275e-05, "loss": 9.2402, "step": 171 }, { "epoch": 0.08095070008236263, "grad_norm": 0.4556719660758972, "learning_rate": 4.3e-05, "loss": 9.3806, "step": 172 }, { "epoch": 0.08142134368749264, "grad_norm": 0.4468841850757599, "learning_rate": 4.325e-05, "loss": 9.0331, "step": 173 }, { "epoch": 0.08189198729262266, "grad_norm": 0.4206986725330353, "learning_rate": 4.35e-05, "loss": 8.6767, "step": 174 }, { "epoch": 0.08236263089775267, "grad_norm": 0.42576491832733154, "learning_rate": 4.375e-05, "loss": 8.7183, "step": 175 }, { "epoch": 0.08283327450288269, "grad_norm": 0.4180700182914734, "learning_rate": 4.4000000000000006e-05, "loss": 8.8461, "step": 176 }, { "epoch": 0.0833039181080127, "grad_norm": 0.3981553614139557, "learning_rate": 4.4250000000000005e-05, "loss": 8.9324, "step": 177 }, { "epoch": 0.08377456171314272, "grad_norm": 0.4038431942462921, "learning_rate": 4.4500000000000004e-05, "loss": 8.7611, "step": 178 }, { "epoch": 0.08424520531827273, "grad_norm": 0.4555639326572418, "learning_rate": 4.4750000000000004e-05, "loss": 8.4839, "step": 179 }, { "epoch": 0.08471584892340275, "grad_norm": 0.39343494176864624, "learning_rate": 4.5e-05, "loss": 9.0263, "step": 180 }, { "epoch": 0.08518649252853276, "grad_norm": 0.4226400852203369, "learning_rate": 4.525e-05, "loss": 8.9829, "step": 181 }, { "epoch": 0.08565713613366278, "grad_norm": 0.3735749125480652, "learning_rate": 4.55e-05, "loss": 9.6609, "step": 182 }, { "epoch": 0.08612777973879279, "grad_norm": 0.4413192868232727, "learning_rate": 4.575e-05, "loss": 9.0126, "step": 183 }, { "epoch": 0.08659842334392281, "grad_norm": 0.3925839364528656, "learning_rate": 4.600000000000001e-05, "loss": 9.2048, "step": 184 }, { "epoch": 0.08706906694905282, "grad_norm": 0.3941839933395386, "learning_rate": 4.6250000000000006e-05, "loss": 9.2662, "step": 185 }, { "epoch": 0.08753971055418285, "grad_norm": 0.47577032446861267, "learning_rate": 4.6500000000000005e-05, "loss": 8.9474, "step": 186 }, { "epoch": 0.08801035415931285, "grad_norm": 0.4306804835796356, "learning_rate": 4.6750000000000005e-05, "loss": 8.8199, "step": 187 }, { "epoch": 0.08848099776444288, "grad_norm": 0.4680851995944977, "learning_rate": 4.7e-05, "loss": 8.7651, "step": 188 }, { "epoch": 0.08895164136957288, "grad_norm": 0.4325461983680725, "learning_rate": 4.7249999999999997e-05, "loss": 9.1391, "step": 189 }, { "epoch": 0.0894222849747029, "grad_norm": 0.7051356434822083, "learning_rate": 4.75e-05, "loss": 8.8018, "step": 190 }, { "epoch": 0.08989292857983291, "grad_norm": 0.37214136123657227, "learning_rate": 4.775e-05, "loss": 9.4374, "step": 191 }, { "epoch": 0.09036357218496294, "grad_norm": 0.4161190688610077, "learning_rate": 4.8e-05, "loss": 9.0213, "step": 192 }, { "epoch": 0.09083421579009295, "grad_norm": 0.39017942547798157, "learning_rate": 4.825e-05, "loss": 9.4081, "step": 193 }, { "epoch": 0.09130485939522297, "grad_norm": 0.3661479353904724, "learning_rate": 4.85e-05, "loss": 9.5162, "step": 194 }, { "epoch": 0.09177550300035298, "grad_norm": 0.4220457077026367, "learning_rate": 4.875e-05, "loss": 8.8268, "step": 195 }, { "epoch": 0.092246146605483, "grad_norm": 0.4123201370239258, "learning_rate": 4.9e-05, "loss": 9.1464, "step": 196 }, { "epoch": 0.092716790210613, "grad_norm": 0.3835439383983612, "learning_rate": 4.9250000000000004e-05, "loss": 9.2391, "step": 197 }, { "epoch": 0.09318743381574303, "grad_norm": 0.3718632459640503, "learning_rate": 4.9500000000000004e-05, "loss": 9.2759, "step": 198 }, { "epoch": 0.09365807742087304, "grad_norm": 0.5267420411109924, "learning_rate": 4.975e-05, "loss": 9.0097, "step": 199 }, { "epoch": 0.09412872102600306, "grad_norm": 0.3542408049106598, "learning_rate": 5e-05, "loss": 9.5282, "step": 200 }, { "epoch": 0.09459936463113307, "grad_norm": 0.40344443917274475, "learning_rate": 4.999999247114854e-05, "loss": 9.3784, "step": 201 }, { "epoch": 0.09507000823626309, "grad_norm": 0.41083309054374695, "learning_rate": 4.999996988459869e-05, "loss": 9.4365, "step": 202 }, { "epoch": 0.0955406518413931, "grad_norm": 0.369400292634964, "learning_rate": 4.9999932240364054e-05, "loss": 9.3167, "step": 203 }, { "epoch": 0.09601129544652312, "grad_norm": 0.36150887608528137, "learning_rate": 4.9999879538467306e-05, "loss": 9.5957, "step": 204 }, { "epoch": 0.09648193905165313, "grad_norm": 0.44035205245018005, "learning_rate": 4.99998117789402e-05, "loss": 8.8501, "step": 205 }, { "epoch": 0.09695258265678315, "grad_norm": 0.42898210883140564, "learning_rate": 4.999972896182352e-05, "loss": 8.8283, "step": 206 }, { "epoch": 0.09742322626191316, "grad_norm": 0.3809720277786255, "learning_rate": 4.999963108716718e-05, "loss": 9.3219, "step": 207 }, { "epoch": 0.09789386986704318, "grad_norm": 0.38228464126586914, "learning_rate": 4.999951815503011e-05, "loss": 9.2669, "step": 208 }, { "epoch": 0.09836451347217319, "grad_norm": 0.3908674120903015, "learning_rate": 4.9999390165480335e-05, "loss": 8.9417, "step": 209 }, { "epoch": 0.09883515707730321, "grad_norm": 0.34623146057128906, "learning_rate": 4.999924711859495e-05, "loss": 9.6014, "step": 210 }, { "epoch": 0.09930580068243322, "grad_norm": 0.3909365236759186, "learning_rate": 4.99990890144601e-05, "loss": 9.1546, "step": 211 }, { "epoch": 0.09977644428756324, "grad_norm": 0.3888709843158722, "learning_rate": 4.999891585317103e-05, "loss": 9.3649, "step": 212 }, { "epoch": 0.10024708789269325, "grad_norm": 0.45398378372192383, "learning_rate": 4.9998727634832024e-05, "loss": 8.9172, "step": 213 }, { "epoch": 0.10071773149782327, "grad_norm": 0.36648306250572205, "learning_rate": 4.9998524359556445e-05, "loss": 9.0638, "step": 214 }, { "epoch": 0.10118837510295328, "grad_norm": 0.37433892488479614, "learning_rate": 4.999830602746673e-05, "loss": 9.3322, "step": 215 }, { "epoch": 0.1016590187080833, "grad_norm": 0.38904431462287903, "learning_rate": 4.99980726386944e-05, "loss": 9.322, "step": 216 }, { "epoch": 0.10212966231321331, "grad_norm": 0.38138681650161743, "learning_rate": 4.9997824193380004e-05, "loss": 9.6177, "step": 217 }, { "epoch": 0.10260030591834333, "grad_norm": 0.39529645442962646, "learning_rate": 4.9997560691673194e-05, "loss": 9.054, "step": 218 }, { "epoch": 0.10307094952347334, "grad_norm": 0.4126908481121063, "learning_rate": 4.999728213373267e-05, "loss": 9.4406, "step": 219 }, { "epoch": 0.10354159312860337, "grad_norm": 0.4137309491634369, "learning_rate": 4.999698851972622e-05, "loss": 9.0403, "step": 220 }, { "epoch": 0.10401223673373337, "grad_norm": 0.4086442291736603, "learning_rate": 4.999667984983069e-05, "loss": 9.3006, "step": 221 }, { "epoch": 0.1044828803388634, "grad_norm": 0.5080444812774658, "learning_rate": 4.999635612423198e-05, "loss": 9.1856, "step": 222 }, { "epoch": 0.1049535239439934, "grad_norm": 0.36199596524238586, "learning_rate": 4.9996017343125085e-05, "loss": 9.3119, "step": 223 }, { "epoch": 0.10542416754912343, "grad_norm": 0.4086923897266388, "learning_rate": 4.9995663506714054e-05, "loss": 9.1335, "step": 224 }, { "epoch": 0.10589481115425343, "grad_norm": 0.42041823267936707, "learning_rate": 4.9995294615212006e-05, "loss": 8.9113, "step": 225 }, { "epoch": 0.10636545475938346, "grad_norm": 0.35369089245796204, "learning_rate": 4.999491066884113e-05, "loss": 9.4732, "step": 226 }, { "epoch": 0.10683609836451347, "grad_norm": 0.8479387164115906, "learning_rate": 4.9994511667832665e-05, "loss": 9.1135, "step": 227 }, { "epoch": 0.10730674196964349, "grad_norm": 0.38847988843917847, "learning_rate": 4.999409761242696e-05, "loss": 9.3632, "step": 228 }, { "epoch": 0.1077773855747735, "grad_norm": 0.43660977482795715, "learning_rate": 4.999366850287337e-05, "loss": 8.6279, "step": 229 }, { "epoch": 0.10824802917990352, "grad_norm": 0.6459296345710754, "learning_rate": 4.999322433943038e-05, "loss": 9.1736, "step": 230 }, { "epoch": 0.10871867278503353, "grad_norm": 0.453952819108963, "learning_rate": 4.99927651223655e-05, "loss": 8.7847, "step": 231 }, { "epoch": 0.10918931639016355, "grad_norm": 0.3641432821750641, "learning_rate": 4.9992290851955325e-05, "loss": 9.1591, "step": 232 }, { "epoch": 0.10965995999529356, "grad_norm": 0.43097686767578125, "learning_rate": 4.999180152848551e-05, "loss": 8.8475, "step": 233 }, { "epoch": 0.11013060360042358, "grad_norm": 0.40101760625839233, "learning_rate": 4.999129715225077e-05, "loss": 9.3003, "step": 234 }, { "epoch": 0.11060124720555359, "grad_norm": 0.38456395268440247, "learning_rate": 4.99907777235549e-05, "loss": 9.0397, "step": 235 }, { "epoch": 0.11107189081068361, "grad_norm": 0.3518768846988678, "learning_rate": 4.9990243242710764e-05, "loss": 9.3619, "step": 236 }, { "epoch": 0.11154253441581362, "grad_norm": 0.43492040038108826, "learning_rate": 4.9989693710040284e-05, "loss": 8.9691, "step": 237 }, { "epoch": 0.11201317802094364, "grad_norm": 0.4434773325920105, "learning_rate": 4.998912912587444e-05, "loss": 8.6355, "step": 238 }, { "epoch": 0.11248382162607365, "grad_norm": 0.4103478193283081, "learning_rate": 4.998854949055328e-05, "loss": 9.0966, "step": 239 }, { "epoch": 0.11295446523120367, "grad_norm": 0.409065842628479, "learning_rate": 4.998795480442595e-05, "loss": 8.9825, "step": 240 }, { "epoch": 0.11342510883633368, "grad_norm": 0.3709560036659241, "learning_rate": 4.9987345067850596e-05, "loss": 9.383, "step": 241 }, { "epoch": 0.1138957524414637, "grad_norm": 0.4049656391143799, "learning_rate": 4.9986720281194496e-05, "loss": 8.8382, "step": 242 }, { "epoch": 0.11436639604659371, "grad_norm": 0.40016597509384155, "learning_rate": 4.998608044483396e-05, "loss": 9.0227, "step": 243 }, { "epoch": 0.11483703965172373, "grad_norm": 0.41628897190093994, "learning_rate": 4.998542555915435e-05, "loss": 9.1208, "step": 244 }, { "epoch": 0.11530768325685374, "grad_norm": 0.37839028239250183, "learning_rate": 4.998475562455013e-05, "loss": 9.2952, "step": 245 }, { "epoch": 0.11577832686198376, "grad_norm": 0.37010782957077026, "learning_rate": 4.99840706414248e-05, "loss": 8.8903, "step": 246 }, { "epoch": 0.11624897046711377, "grad_norm": 0.40624648332595825, "learning_rate": 4.998337061019092e-05, "loss": 9.1322, "step": 247 }, { "epoch": 0.1167196140722438, "grad_norm": 0.330285906791687, "learning_rate": 4.998265553127013e-05, "loss": 9.3509, "step": 248 }, { "epoch": 0.1171902576773738, "grad_norm": 0.4315396249294281, "learning_rate": 4.9981925405093146e-05, "loss": 8.5941, "step": 249 }, { "epoch": 0.11766090128250382, "grad_norm": 0.46557149291038513, "learning_rate": 4.99811802320997e-05, "loss": 8.7841, "step": 250 }, { "epoch": 0.11813154488763383, "grad_norm": 0.40763556957244873, "learning_rate": 4.998042001273864e-05, "loss": 9.0945, "step": 251 }, { "epoch": 0.11860218849276385, "grad_norm": 0.38328826427459717, "learning_rate": 4.9979644747467835e-05, "loss": 9.5115, "step": 252 }, { "epoch": 0.11907283209789386, "grad_norm": 0.3737850487232208, "learning_rate": 4.997885443675424e-05, "loss": 8.6629, "step": 253 }, { "epoch": 0.11954347570302389, "grad_norm": 0.38939982652664185, "learning_rate": 4.997804908107387e-05, "loss": 9.1315, "step": 254 }, { "epoch": 0.1200141193081539, "grad_norm": 0.41033586859703064, "learning_rate": 4.997722868091179e-05, "loss": 8.9948, "step": 255 }, { "epoch": 0.12048476291328392, "grad_norm": 0.4496087431907654, "learning_rate": 4.997639323676214e-05, "loss": 8.7967, "step": 256 }, { "epoch": 0.12095540651841392, "grad_norm": 0.4463037848472595, "learning_rate": 4.997554274912811e-05, "loss": 8.6575, "step": 257 }, { "epoch": 0.12142605012354395, "grad_norm": 0.447477251291275, "learning_rate": 4.997467721852196e-05, "loss": 9.4086, "step": 258 }, { "epoch": 0.12189669372867395, "grad_norm": 0.40504494309425354, "learning_rate": 4.9973796645465e-05, "loss": 9.6567, "step": 259 }, { "epoch": 0.12236733733380398, "grad_norm": 0.4193851351737976, "learning_rate": 4.9972901030487616e-05, "loss": 9.415, "step": 260 }, { "epoch": 0.12283798093893399, "grad_norm": 0.37490740418434143, "learning_rate": 4.997199037412923e-05, "loss": 9.094, "step": 261 }, { "epoch": 0.12330862454406401, "grad_norm": 0.4043318033218384, "learning_rate": 4.997106467693835e-05, "loss": 9.1566, "step": 262 }, { "epoch": 0.12377926814919402, "grad_norm": 0.3795372247695923, "learning_rate": 4.997012393947253e-05, "loss": 9.5975, "step": 263 }, { "epoch": 0.12424991175432404, "grad_norm": 0.38997772336006165, "learning_rate": 4.996916816229837e-05, "loss": 9.3275, "step": 264 }, { "epoch": 0.12472055535945405, "grad_norm": 0.41787171363830566, "learning_rate": 4.9968197345991565e-05, "loss": 8.9184, "step": 265 }, { "epoch": 0.12519119896458406, "grad_norm": 0.4403538703918457, "learning_rate": 4.996721149113682e-05, "loss": 9.0055, "step": 266 }, { "epoch": 0.12566184256971408, "grad_norm": 0.44756266474723816, "learning_rate": 4.996621059832795e-05, "loss": 9.0517, "step": 267 }, { "epoch": 0.1261324861748441, "grad_norm": 0.3958662748336792, "learning_rate": 4.996519466816778e-05, "loss": 9.1983, "step": 268 }, { "epoch": 0.12660312977997412, "grad_norm": 0.5548920035362244, "learning_rate": 4.9964163701268224e-05, "loss": 9.0239, "step": 269 }, { "epoch": 0.12707377338510412, "grad_norm": 0.38231074810028076, "learning_rate": 4.996311769825024e-05, "loss": 9.4057, "step": 270 }, { "epoch": 0.12754441699023414, "grad_norm": 0.37411412596702576, "learning_rate": 4.996205665974384e-05, "loss": 9.147, "step": 271 }, { "epoch": 0.12801506059536416, "grad_norm": 0.36638572812080383, "learning_rate": 4.996098058638809e-05, "loss": 9.3312, "step": 272 }, { "epoch": 0.12848570420049418, "grad_norm": 0.36364972591400146, "learning_rate": 4.995988947883114e-05, "loss": 9.4873, "step": 273 }, { "epoch": 0.12895634780562418, "grad_norm": 0.415054053068161, "learning_rate": 4.9958783337730156e-05, "loss": 9.0241, "step": 274 }, { "epoch": 0.1294269914107542, "grad_norm": 0.616145133972168, "learning_rate": 4.995766216375137e-05, "loss": 9.1209, "step": 275 }, { "epoch": 0.12989763501588422, "grad_norm": 0.3728233575820923, "learning_rate": 4.9956525957570086e-05, "loss": 9.5214, "step": 276 }, { "epoch": 0.13036827862101424, "grad_norm": 0.4377942681312561, "learning_rate": 4.995537471987066e-05, "loss": 8.7668, "step": 277 }, { "epoch": 0.13083892222614424, "grad_norm": 0.4865539073944092, "learning_rate": 4.9954208451346465e-05, "loss": 8.8752, "step": 278 }, { "epoch": 0.13130956583127426, "grad_norm": 0.4728136658668518, "learning_rate": 4.995302715269997e-05, "loss": 9.0947, "step": 279 }, { "epoch": 0.13178020943640428, "grad_norm": 0.40794286131858826, "learning_rate": 4.995183082464269e-05, "loss": 8.9566, "step": 280 }, { "epoch": 0.1322508530415343, "grad_norm": 0.35321590304374695, "learning_rate": 4.995061946789516e-05, "loss": 9.4166, "step": 281 }, { "epoch": 0.1327214966466643, "grad_norm": 0.41053611040115356, "learning_rate": 4.9949393083187005e-05, "loss": 9.0913, "step": 282 }, { "epoch": 0.13319214025179432, "grad_norm": 0.4475056231021881, "learning_rate": 4.9948151671256883e-05, "loss": 8.422, "step": 283 }, { "epoch": 0.13366278385692434, "grad_norm": 0.34866318106651306, "learning_rate": 4.994689523285251e-05, "loss": 9.2168, "step": 284 }, { "epoch": 0.13413342746205437, "grad_norm": 0.4374255836009979, "learning_rate": 4.994562376873064e-05, "loss": 8.9508, "step": 285 }, { "epoch": 0.13460407106718436, "grad_norm": 0.38839930295944214, "learning_rate": 4.9944337279657106e-05, "loss": 8.8695, "step": 286 }, { "epoch": 0.13507471467231438, "grad_norm": 0.4352591335773468, "learning_rate": 4.994303576640674e-05, "loss": 8.7637, "step": 287 }, { "epoch": 0.1355453582774444, "grad_norm": 0.36577296257019043, "learning_rate": 4.994171922976348e-05, "loss": 9.4622, "step": 288 }, { "epoch": 0.13601600188257443, "grad_norm": 0.3764691650867462, "learning_rate": 4.994038767052028e-05, "loss": 9.3536, "step": 289 }, { "epoch": 0.13648664548770442, "grad_norm": 0.3795958161354065, "learning_rate": 4.993904108947914e-05, "loss": 8.9066, "step": 290 }, { "epoch": 0.13695728909283444, "grad_norm": 0.42235082387924194, "learning_rate": 4.993767948745113e-05, "loss": 9.168, "step": 291 }, { "epoch": 0.13742793269796447, "grad_norm": 0.41240936517715454, "learning_rate": 4.993630286525634e-05, "loss": 8.8015, "step": 292 }, { "epoch": 0.1378985763030945, "grad_norm": 0.40508440136909485, "learning_rate": 4.993491122372394e-05, "loss": 8.9218, "step": 293 }, { "epoch": 0.13836921990822448, "grad_norm": 0.44761571288108826, "learning_rate": 4.99335045636921e-05, "loss": 8.9542, "step": 294 }, { "epoch": 0.1388398635133545, "grad_norm": 0.35136064887046814, "learning_rate": 4.993208288600808e-05, "loss": 9.0036, "step": 295 }, { "epoch": 0.13931050711848453, "grad_norm": 0.3560550808906555, "learning_rate": 4.9930646191528175e-05, "loss": 9.5513, "step": 296 }, { "epoch": 0.13978115072361455, "grad_norm": 0.40760746598243713, "learning_rate": 4.99291944811177e-05, "loss": 9.1574, "step": 297 }, { "epoch": 0.14025179432874454, "grad_norm": 0.4152514338493347, "learning_rate": 4.992772775565104e-05, "loss": 8.9221, "step": 298 }, { "epoch": 0.14072243793387457, "grad_norm": 0.36200031638145447, "learning_rate": 4.992624601601162e-05, "loss": 9.2766, "step": 299 }, { "epoch": 0.1411930815390046, "grad_norm": 0.3931048512458801, "learning_rate": 4.992474926309191e-05, "loss": 9.0796, "step": 300 }, { "epoch": 0.1416637251441346, "grad_norm": 0.3852521777153015, "learning_rate": 4.992323749779339e-05, "loss": 8.9804, "step": 301 }, { "epoch": 0.1421343687492646, "grad_norm": 0.42558741569519043, "learning_rate": 4.992171072102663e-05, "loss": 8.6188, "step": 302 }, { "epoch": 0.14260501235439463, "grad_norm": 0.40560707449913025, "learning_rate": 4.992016893371122e-05, "loss": 9.2215, "step": 303 }, { "epoch": 0.14307565595952465, "grad_norm": 0.3654381334781647, "learning_rate": 4.9918612136775776e-05, "loss": 9.6141, "step": 304 }, { "epoch": 0.14354629956465467, "grad_norm": 0.3547174632549286, "learning_rate": 4.9917040331157986e-05, "loss": 9.4322, "step": 305 }, { "epoch": 0.14401694316978467, "grad_norm": 0.3975953161716461, "learning_rate": 4.9915453517804554e-05, "loss": 9.0455, "step": 306 }, { "epoch": 0.1444875867749147, "grad_norm": 0.4045639932155609, "learning_rate": 4.991385169767123e-05, "loss": 8.6646, "step": 307 }, { "epoch": 0.1449582303800447, "grad_norm": 0.39949241280555725, "learning_rate": 4.9912234871722805e-05, "loss": 8.9656, "step": 308 }, { "epoch": 0.14542887398517473, "grad_norm": 0.38490548729896545, "learning_rate": 4.9910603040933116e-05, "loss": 9.2289, "step": 309 }, { "epoch": 0.14589951759030473, "grad_norm": 0.38393279910087585, "learning_rate": 4.9908956206285e-05, "loss": 9.5308, "step": 310 }, { "epoch": 0.14637016119543475, "grad_norm": 0.41801533102989197, "learning_rate": 4.990729436877038e-05, "loss": 9.179, "step": 311 }, { "epoch": 0.14684080480056477, "grad_norm": 0.3734685182571411, "learning_rate": 4.9905617529390203e-05, "loss": 9.4323, "step": 312 }, { "epoch": 0.1473114484056948, "grad_norm": 0.38498827815055847, "learning_rate": 4.9903925689154425e-05, "loss": 8.7253, "step": 313 }, { "epoch": 0.1477820920108248, "grad_norm": 0.4148082435131073, "learning_rate": 4.990221884908206e-05, "loss": 9.5291, "step": 314 }, { "epoch": 0.1482527356159548, "grad_norm": 0.3645360469818115, "learning_rate": 4.990049701020115e-05, "loss": 9.3854, "step": 315 }, { "epoch": 0.14872337922108483, "grad_norm": 0.39119553565979004, "learning_rate": 4.989876017354878e-05, "loss": 8.8417, "step": 316 }, { "epoch": 0.14919402282621486, "grad_norm": 0.40799564123153687, "learning_rate": 4.989700834017105e-05, "loss": 9.1028, "step": 317 }, { "epoch": 0.14966466643134485, "grad_norm": 0.36694031953811646, "learning_rate": 4.9895241511123114e-05, "loss": 9.26, "step": 318 }, { "epoch": 0.15013531003647487, "grad_norm": 0.4914778769016266, "learning_rate": 4.989345968746914e-05, "loss": 9.3256, "step": 319 }, { "epoch": 0.1506059536416049, "grad_norm": 0.43579304218292236, "learning_rate": 4.989166287028234e-05, "loss": 8.7753, "step": 320 }, { "epoch": 0.15107659724673492, "grad_norm": 0.37302032113075256, "learning_rate": 4.988985106064495e-05, "loss": 9.3832, "step": 321 }, { "epoch": 0.1515472408518649, "grad_norm": 0.3695763945579529, "learning_rate": 4.988802425964824e-05, "loss": 8.7549, "step": 322 }, { "epoch": 0.15201788445699493, "grad_norm": 0.4146966338157654, "learning_rate": 4.98861824683925e-05, "loss": 8.8819, "step": 323 }, { "epoch": 0.15248852806212496, "grad_norm": 0.36729514598846436, "learning_rate": 4.9884325687987056e-05, "loss": 8.9922, "step": 324 }, { "epoch": 0.15295917166725498, "grad_norm": 0.3997980058193207, "learning_rate": 4.9882453919550264e-05, "loss": 9.0574, "step": 325 }, { "epoch": 0.15342981527238497, "grad_norm": 0.31628280878067017, "learning_rate": 4.9880567164209515e-05, "loss": 9.7555, "step": 326 }, { "epoch": 0.153900458877515, "grad_norm": 0.3956843316555023, "learning_rate": 4.98786654231012e-05, "loss": 9.2441, "step": 327 }, { "epoch": 0.15437110248264502, "grad_norm": 0.399984747171402, "learning_rate": 4.987674869737077e-05, "loss": 9.0811, "step": 328 }, { "epoch": 0.15484174608777504, "grad_norm": 0.40124884247779846, "learning_rate": 4.987481698817268e-05, "loss": 8.7801, "step": 329 }, { "epoch": 0.15531238969290503, "grad_norm": 0.36277976632118225, "learning_rate": 4.98728702966704e-05, "loss": 9.1685, "step": 330 }, { "epoch": 0.15578303329803506, "grad_norm": 0.4415287375450134, "learning_rate": 4.987090862403646e-05, "loss": 8.6159, "step": 331 }, { "epoch": 0.15625367690316508, "grad_norm": 0.4005844295024872, "learning_rate": 4.986893197145237e-05, "loss": 8.7962, "step": 332 }, { "epoch": 0.1567243205082951, "grad_norm": 0.4147176742553711, "learning_rate": 4.9866940340108704e-05, "loss": 9.1667, "step": 333 }, { "epoch": 0.1571949641134251, "grad_norm": 0.5922366976737976, "learning_rate": 4.986493373120502e-05, "loss": 9.1685, "step": 334 }, { "epoch": 0.15766560771855512, "grad_norm": 0.42389023303985596, "learning_rate": 4.986291214594992e-05, "loss": 8.9005, "step": 335 }, { "epoch": 0.15813625132368514, "grad_norm": 3.3356659412384033, "learning_rate": 4.986087558556104e-05, "loss": 8.8868, "step": 336 }, { "epoch": 0.15860689492881516, "grad_norm": 0.3584047853946686, "learning_rate": 4.9858824051264985e-05, "loss": 9.3012, "step": 337 }, { "epoch": 0.15907753853394516, "grad_norm": 0.432365357875824, "learning_rate": 4.985675754429744e-05, "loss": 8.6683, "step": 338 }, { "epoch": 0.15954818213907518, "grad_norm": 0.4141758680343628, "learning_rate": 4.985467606590305e-05, "loss": 8.8902, "step": 339 }, { "epoch": 0.1600188257442052, "grad_norm": 0.5318158268928528, "learning_rate": 4.985257961733553e-05, "loss": 9.3213, "step": 340 }, { "epoch": 0.16048946934933522, "grad_norm": 0.4039144814014435, "learning_rate": 4.985046819985758e-05, "loss": 9.3521, "step": 341 }, { "epoch": 0.16096011295446522, "grad_norm": 0.4055419862270355, "learning_rate": 4.984834181474093e-05, "loss": 9.032, "step": 342 }, { "epoch": 0.16143075655959524, "grad_norm": 0.47234630584716797, "learning_rate": 4.9846200463266304e-05, "loss": 8.9415, "step": 343 }, { "epoch": 0.16190140016472526, "grad_norm": 0.3458828628063202, "learning_rate": 4.984404414672346e-05, "loss": 9.3418, "step": 344 }, { "epoch": 0.16237204376985528, "grad_norm": 0.4208340048789978, "learning_rate": 4.9841872866411175e-05, "loss": 8.5468, "step": 345 }, { "epoch": 0.16284268737498528, "grad_norm": 0.4632960855960846, "learning_rate": 4.983968662363723e-05, "loss": 8.357, "step": 346 }, { "epoch": 0.1633133309801153, "grad_norm": 0.3957667946815491, "learning_rate": 4.98374854197184e-05, "loss": 9.5873, "step": 347 }, { "epoch": 0.16378397458524532, "grad_norm": 0.45077890157699585, "learning_rate": 4.98352692559805e-05, "loss": 8.6973, "step": 348 }, { "epoch": 0.16425461819037535, "grad_norm": 0.36463478207588196, "learning_rate": 4.983303813375833e-05, "loss": 9.1421, "step": 349 }, { "epoch": 0.16472526179550534, "grad_norm": 0.4010748565196991, "learning_rate": 4.983079205439574e-05, "loss": 9.1377, "step": 350 }, { "epoch": 0.16519590540063536, "grad_norm": 0.39440232515335083, "learning_rate": 4.982853101924554e-05, "loss": 8.9753, "step": 351 }, { "epoch": 0.16566654900576538, "grad_norm": 0.4520394504070282, "learning_rate": 4.9826255029669577e-05, "loss": 8.7352, "step": 352 }, { "epoch": 0.1661371926108954, "grad_norm": 0.4330653250217438, "learning_rate": 4.98239640870387e-05, "loss": 9.0555, "step": 353 }, { "epoch": 0.1666078362160254, "grad_norm": 0.47660115361213684, "learning_rate": 4.982165819273275e-05, "loss": 8.6404, "step": 354 }, { "epoch": 0.16707847982115542, "grad_norm": 0.4233279228210449, "learning_rate": 4.98193373481406e-05, "loss": 8.9099, "step": 355 }, { "epoch": 0.16754912342628545, "grad_norm": 0.43518248200416565, "learning_rate": 4.98170015546601e-05, "loss": 8.6882, "step": 356 }, { "epoch": 0.16801976703141547, "grad_norm": 0.3644963800907135, "learning_rate": 4.981465081369814e-05, "loss": 9.2448, "step": 357 }, { "epoch": 0.16849041063654546, "grad_norm": 0.38815975189208984, "learning_rate": 4.981228512667057e-05, "loss": 9.558, "step": 358 }, { "epoch": 0.16896105424167548, "grad_norm": 0.4271330237388611, "learning_rate": 4.980990449500227e-05, "loss": 8.4688, "step": 359 }, { "epoch": 0.1694316978468055, "grad_norm": 0.4300340712070465, "learning_rate": 4.980750892012711e-05, "loss": 8.5112, "step": 360 }, { "epoch": 0.16990234145193553, "grad_norm": 0.3674795627593994, "learning_rate": 4.980509840348796e-05, "loss": 9.1979, "step": 361 }, { "epoch": 0.17037298505706552, "grad_norm": 0.39522647857666016, "learning_rate": 4.980267294653671e-05, "loss": 9.3743, "step": 362 }, { "epoch": 0.17084362866219555, "grad_norm": 0.4358430504798889, "learning_rate": 4.980023255073422e-05, "loss": 9.1216, "step": 363 }, { "epoch": 0.17131427226732557, "grad_norm": 0.40390607714653015, "learning_rate": 4.9797777217550367e-05, "loss": 8.9767, "step": 364 }, { "epoch": 0.1717849158724556, "grad_norm": 0.3644031584262848, "learning_rate": 4.9795306948464e-05, "loss": 9.2284, "step": 365 }, { "epoch": 0.17225555947758558, "grad_norm": 0.41837140917778015, "learning_rate": 4.979282174496302e-05, "loss": 8.8997, "step": 366 }, { "epoch": 0.1727262030827156, "grad_norm": 0.38197219371795654, "learning_rate": 4.979032160854424e-05, "loss": 9.1135, "step": 367 }, { "epoch": 0.17319684668784563, "grad_norm": 0.3703914284706116, "learning_rate": 4.9787806540713546e-05, "loss": 9.499, "step": 368 }, { "epoch": 0.17366749029297565, "grad_norm": 0.5900145769119263, "learning_rate": 4.978527654298576e-05, "loss": 9.6679, "step": 369 }, { "epoch": 0.17413813389810565, "grad_norm": 0.4443458318710327, "learning_rate": 4.9782731616884736e-05, "loss": 8.4039, "step": 370 }, { "epoch": 0.17460877750323567, "grad_norm": 0.31717589497566223, "learning_rate": 4.978017176394331e-05, "loss": 9.7594, "step": 371 }, { "epoch": 0.1750794211083657, "grad_norm": 0.3682294189929962, "learning_rate": 4.977759698570328e-05, "loss": 9.3738, "step": 372 }, { "epoch": 0.1755500647134957, "grad_norm": 0.36333027482032776, "learning_rate": 4.977500728371547e-05, "loss": 9.4728, "step": 373 }, { "epoch": 0.1760207083186257, "grad_norm": 0.38923901319503784, "learning_rate": 4.9772402659539674e-05, "loss": 9.0362, "step": 374 }, { "epoch": 0.17649135192375573, "grad_norm": 0.3548789620399475, "learning_rate": 4.9769783114744686e-05, "loss": 9.4734, "step": 375 }, { "epoch": 0.17696199552888575, "grad_norm": 0.3727724552154541, "learning_rate": 4.976714865090827e-05, "loss": 8.9019, "step": 376 }, { "epoch": 0.17743263913401577, "grad_norm": 0.3825220763683319, "learning_rate": 4.976449926961719e-05, "loss": 9.4008, "step": 377 }, { "epoch": 0.17790328273914577, "grad_norm": 0.36432167887687683, "learning_rate": 4.9761834972467185e-05, "loss": 9.4614, "step": 378 }, { "epoch": 0.1783739263442758, "grad_norm": 0.4360719621181488, "learning_rate": 4.975915576106299e-05, "loss": 8.9864, "step": 379 }, { "epoch": 0.1788445699494058, "grad_norm": 0.36198675632476807, "learning_rate": 4.975646163701831e-05, "loss": 9.3858, "step": 380 }, { "epoch": 0.17931521355453583, "grad_norm": 0.3615058362483978, "learning_rate": 4.9753752601955836e-05, "loss": 9.4513, "step": 381 }, { "epoch": 0.17978585715966583, "grad_norm": 0.38385000824928284, "learning_rate": 4.975102865750725e-05, "loss": 9.0129, "step": 382 }, { "epoch": 0.18025650076479585, "grad_norm": 0.42161351442337036, "learning_rate": 4.9748289805313196e-05, "loss": 8.8066, "step": 383 }, { "epoch": 0.18072714436992587, "grad_norm": 0.3863692879676819, "learning_rate": 4.9745536047023324e-05, "loss": 9.0613, "step": 384 }, { "epoch": 0.1811977879750559, "grad_norm": 0.35685333609580994, "learning_rate": 4.9742767384296216e-05, "loss": 9.1823, "step": 385 }, { "epoch": 0.1816684315801859, "grad_norm": 0.4146454930305481, "learning_rate": 4.973998381879949e-05, "loss": 9.0627, "step": 386 }, { "epoch": 0.1821390751853159, "grad_norm": 0.40701958537101746, "learning_rate": 4.973718535220969e-05, "loss": 9.4653, "step": 387 }, { "epoch": 0.18260971879044594, "grad_norm": 0.5105063915252686, "learning_rate": 4.973437198621237e-05, "loss": 9.1349, "step": 388 }, { "epoch": 0.18308036239557596, "grad_norm": 0.3464662730693817, "learning_rate": 4.973154372250203e-05, "loss": 9.3152, "step": 389 }, { "epoch": 0.18355100600070595, "grad_norm": 0.3519923985004425, "learning_rate": 4.972870056278216e-05, "loss": 9.6833, "step": 390 }, { "epoch": 0.18402164960583597, "grad_norm": 0.3777810037136078, "learning_rate": 4.972584250876522e-05, "loss": 8.9543, "step": 391 }, { "epoch": 0.184492293210966, "grad_norm": 0.45620018243789673, "learning_rate": 4.972296956217265e-05, "loss": 8.5477, "step": 392 }, { "epoch": 0.18496293681609602, "grad_norm": 0.3768126368522644, "learning_rate": 4.972008172473483e-05, "loss": 9.2837, "step": 393 }, { "epoch": 0.185433580421226, "grad_norm": 0.37716034054756165, "learning_rate": 4.971717899819113e-05, "loss": 9.0821, "step": 394 }, { "epoch": 0.18590422402635604, "grad_norm": 0.40171629190444946, "learning_rate": 4.9714261384289896e-05, "loss": 9.0963, "step": 395 }, { "epoch": 0.18637486763148606, "grad_norm": 0.41346555948257446, "learning_rate": 4.9711328884788434e-05, "loss": 8.6835, "step": 396 }, { "epoch": 0.18684551123661608, "grad_norm": 0.3882580101490021, "learning_rate": 4.970838150145299e-05, "loss": 8.998, "step": 397 }, { "epoch": 0.18731615484174607, "grad_norm": 0.40618547797203064, "learning_rate": 4.9705419236058825e-05, "loss": 8.8586, "step": 398 }, { "epoch": 0.1877867984468761, "grad_norm": 0.4610426127910614, "learning_rate": 4.970244209039012e-05, "loss": 8.5731, "step": 399 }, { "epoch": 0.18825744205200612, "grad_norm": 0.3799988329410553, "learning_rate": 4.969945006624003e-05, "loss": 8.9463, "step": 400 }, { "epoch": 0.18872808565713614, "grad_norm": 0.37528830766677856, "learning_rate": 4.969644316541068e-05, "loss": 8.9402, "step": 401 }, { "epoch": 0.18919872926226614, "grad_norm": 0.3422936201095581, "learning_rate": 4.9693421389713156e-05, "loss": 9.3497, "step": 402 }, { "epoch": 0.18966937286739616, "grad_norm": 0.35784366726875305, "learning_rate": 4.969038474096749e-05, "loss": 9.1984, "step": 403 }, { "epoch": 0.19014001647252618, "grad_norm": 0.36203494668006897, "learning_rate": 4.96873332210027e-05, "loss": 9.5096, "step": 404 }, { "epoch": 0.1906106600776562, "grad_norm": 0.3657507598400116, "learning_rate": 4.9684266831656706e-05, "loss": 9.4901, "step": 405 }, { "epoch": 0.1910813036827862, "grad_norm": 0.3886093199253082, "learning_rate": 4.9681185574776446e-05, "loss": 9.2492, "step": 406 }, { "epoch": 0.19155194728791622, "grad_norm": 0.4091348350048065, "learning_rate": 4.967808945221778e-05, "loss": 8.9341, "step": 407 }, { "epoch": 0.19202259089304624, "grad_norm": 0.45772606134414673, "learning_rate": 4.967497846584552e-05, "loss": 9.1159, "step": 408 }, { "epoch": 0.19249323449817626, "grad_norm": 0.4274662733078003, "learning_rate": 4.967185261753345e-05, "loss": 9.0557, "step": 409 }, { "epoch": 0.19296387810330626, "grad_norm": 0.3963877558708191, "learning_rate": 4.96687119091643e-05, "loss": 9.2221, "step": 410 }, { "epoch": 0.19343452170843628, "grad_norm": 0.3958019018173218, "learning_rate": 4.966555634262972e-05, "loss": 8.7826, "step": 411 }, { "epoch": 0.1939051653135663, "grad_norm": 0.3447028398513794, "learning_rate": 4.9662385919830347e-05, "loss": 9.5672, "step": 412 }, { "epoch": 0.19437580891869632, "grad_norm": 0.41687721014022827, "learning_rate": 4.965920064267575e-05, "loss": 8.7692, "step": 413 }, { "epoch": 0.19484645252382632, "grad_norm": 0.40204861760139465, "learning_rate": 4.9656000513084455e-05, "loss": 8.9861, "step": 414 }, { "epoch": 0.19531709612895634, "grad_norm": 0.3969802260398865, "learning_rate": 4.965278553298392e-05, "loss": 8.7663, "step": 415 }, { "epoch": 0.19578773973408636, "grad_norm": 0.3831544518470764, "learning_rate": 4.964955570431055e-05, "loss": 9.1338, "step": 416 }, { "epoch": 0.19625838333921639, "grad_norm": 0.40865185856819153, "learning_rate": 4.96463110290097e-05, "loss": 8.7582, "step": 417 }, { "epoch": 0.19672902694434638, "grad_norm": 0.36668238043785095, "learning_rate": 4.964305150903566e-05, "loss": 9.185, "step": 418 }, { "epoch": 0.1971996705494764, "grad_norm": 0.4229344129562378, "learning_rate": 4.963977714635168e-05, "loss": 9.0629, "step": 419 }, { "epoch": 0.19767031415460642, "grad_norm": 0.36557090282440186, "learning_rate": 4.963648794292992e-05, "loss": 9.2807, "step": 420 }, { "epoch": 0.19814095775973645, "grad_norm": 0.36382701992988586, "learning_rate": 4.9633183900751504e-05, "loss": 9.3589, "step": 421 }, { "epoch": 0.19861160136486644, "grad_norm": 0.34733355045318604, "learning_rate": 4.962986502180648e-05, "loss": 9.246, "step": 422 }, { "epoch": 0.19908224496999646, "grad_norm": 0.39794841408729553, "learning_rate": 4.962653130809383e-05, "loss": 8.8009, "step": 423 }, { "epoch": 0.19955288857512649, "grad_norm": 1.290969967842102, "learning_rate": 4.962318276162148e-05, "loss": 8.8199, "step": 424 }, { "epoch": 0.2000235321802565, "grad_norm": 0.41390761733055115, "learning_rate": 4.961981938440629e-05, "loss": 8.8504, "step": 425 }, { "epoch": 0.2004941757853865, "grad_norm": 0.4563705623149872, "learning_rate": 4.9616441178474044e-05, "loss": 8.4598, "step": 426 }, { "epoch": 0.20096481939051652, "grad_norm": 0.41248825192451477, "learning_rate": 4.9613048145859465e-05, "loss": 8.9862, "step": 427 }, { "epoch": 0.20143546299564655, "grad_norm": 0.3711670935153961, "learning_rate": 4.9609640288606205e-05, "loss": 9.1376, "step": 428 }, { "epoch": 0.20190610660077657, "grad_norm": 0.3998201787471771, "learning_rate": 4.960621760876686e-05, "loss": 8.8631, "step": 429 }, { "epoch": 0.20237675020590656, "grad_norm": 0.39512693881988525, "learning_rate": 4.96027801084029e-05, "loss": 8.6108, "step": 430 }, { "epoch": 0.20284739381103659, "grad_norm": 0.40403223037719727, "learning_rate": 4.95993277895848e-05, "loss": 8.9947, "step": 431 }, { "epoch": 0.2033180374161666, "grad_norm": 0.37190157175064087, "learning_rate": 4.959586065439189e-05, "loss": 9.0393, "step": 432 }, { "epoch": 0.20378868102129663, "grad_norm": 0.49797308444976807, "learning_rate": 4.959237870491247e-05, "loss": 8.4229, "step": 433 }, { "epoch": 0.20425932462642662, "grad_norm": 0.4093763828277588, "learning_rate": 4.958888194324374e-05, "loss": 9.2132, "step": 434 }, { "epoch": 0.20472996823155665, "grad_norm": 0.4164353609085083, "learning_rate": 4.958537037149183e-05, "loss": 9.3971, "step": 435 }, { "epoch": 0.20520061183668667, "grad_norm": 0.4578768312931061, "learning_rate": 4.958184399177178e-05, "loss": 8.8712, "step": 436 }, { "epoch": 0.2056712554418167, "grad_norm": 0.3586215674877167, "learning_rate": 4.957830280620758e-05, "loss": 9.3741, "step": 437 }, { "epoch": 0.20614189904694669, "grad_norm": 0.4265285134315491, "learning_rate": 4.9574746816932084e-05, "loss": 9.5791, "step": 438 }, { "epoch": 0.2066125426520767, "grad_norm": 0.4029577672481537, "learning_rate": 4.9571176026087116e-05, "loss": 8.7589, "step": 439 }, { "epoch": 0.20708318625720673, "grad_norm": 0.38180944323539734, "learning_rate": 4.9567590435823383e-05, "loss": 9.0139, "step": 440 }, { "epoch": 0.20755382986233675, "grad_norm": 0.39456745982170105, "learning_rate": 4.9563990048300524e-05, "loss": 9.1201, "step": 441 }, { "epoch": 0.20802447346746675, "grad_norm": 0.5495271682739258, "learning_rate": 4.956037486568706e-05, "loss": 8.5788, "step": 442 }, { "epoch": 0.20849511707259677, "grad_norm": 0.4691711366176605, "learning_rate": 4.9556744890160477e-05, "loss": 8.6122, "step": 443 }, { "epoch": 0.2089657606777268, "grad_norm": 0.42626431584358215, "learning_rate": 4.955310012390711e-05, "loss": 9.0031, "step": 444 }, { "epoch": 0.20943640428285681, "grad_norm": 0.3541715145111084, "learning_rate": 4.954944056912224e-05, "loss": 9.3784, "step": 445 }, { "epoch": 0.2099070478879868, "grad_norm": 0.3353878855705261, "learning_rate": 4.954576622801006e-05, "loss": 9.2536, "step": 446 }, { "epoch": 0.21037769149311683, "grad_norm": 0.45526987314224243, "learning_rate": 4.954207710278364e-05, "loss": 8.8725, "step": 447 }, { "epoch": 0.21084833509824685, "grad_norm": 0.3993997275829315, "learning_rate": 4.953837319566497e-05, "loss": 8.7531, "step": 448 }, { "epoch": 0.21131897870337688, "grad_norm": 0.4544302821159363, "learning_rate": 4.953465450888495e-05, "loss": 8.6906, "step": 449 }, { "epoch": 0.21178962230850687, "grad_norm": 0.35516420006752014, "learning_rate": 4.9530921044683374e-05, "loss": 9.0749, "step": 450 }, { "epoch": 0.2122602659136369, "grad_norm": 1.6792665719985962, "learning_rate": 4.9527172805308944e-05, "loss": 9.2437, "step": 451 }, { "epoch": 0.21273090951876691, "grad_norm": 0.46345287561416626, "learning_rate": 4.952340979301924e-05, "loss": 9.0281, "step": 452 }, { "epoch": 0.21320155312389694, "grad_norm": 0.447298139333725, "learning_rate": 4.951963201008076e-05, "loss": 8.9642, "step": 453 }, { "epoch": 0.21367219672902693, "grad_norm": 0.4767840504646301, "learning_rate": 4.9515839458768905e-05, "loss": 8.359, "step": 454 }, { "epoch": 0.21414284033415695, "grad_norm": 0.4263994097709656, "learning_rate": 4.9512032141367946e-05, "loss": 9.1196, "step": 455 }, { "epoch": 0.21461348393928698, "grad_norm": 0.4342626929283142, "learning_rate": 4.950821006017107e-05, "loss": 8.6583, "step": 456 }, { "epoch": 0.215084127544417, "grad_norm": 0.3934561610221863, "learning_rate": 4.950437321748034e-05, "loss": 9.0519, "step": 457 }, { "epoch": 0.215554771149547, "grad_norm": 0.4860813319683075, "learning_rate": 4.9500521615606716e-05, "loss": 8.5634, "step": 458 }, { "epoch": 0.21602541475467701, "grad_norm": 0.35411691665649414, "learning_rate": 4.949665525687005e-05, "loss": 9.1898, "step": 459 }, { "epoch": 0.21649605835980704, "grad_norm": 0.4290132224559784, "learning_rate": 4.94927741435991e-05, "loss": 8.9995, "step": 460 }, { "epoch": 0.21696670196493706, "grad_norm": 0.3373097777366638, "learning_rate": 4.948887827813147e-05, "loss": 9.3386, "step": 461 }, { "epoch": 0.21743734557006705, "grad_norm": 0.42341887950897217, "learning_rate": 4.948496766281368e-05, "loss": 9.3743, "step": 462 }, { "epoch": 0.21790798917519708, "grad_norm": 0.3915397524833679, "learning_rate": 4.9481042300001124e-05, "loss": 9.1503, "step": 463 }, { "epoch": 0.2183786327803271, "grad_norm": 0.4155285954475403, "learning_rate": 4.947710219205808e-05, "loss": 9.0803, "step": 464 }, { "epoch": 0.21884927638545712, "grad_norm": 0.4009873867034912, "learning_rate": 4.94731473413577e-05, "loss": 8.8088, "step": 465 }, { "epoch": 0.21931991999058711, "grad_norm": 0.3694516122341156, "learning_rate": 4.946917775028204e-05, "loss": 9.6886, "step": 466 }, { "epoch": 0.21979056359571714, "grad_norm": 0.4301382899284363, "learning_rate": 4.946519342122199e-05, "loss": 8.8388, "step": 467 }, { "epoch": 0.22026120720084716, "grad_norm": 0.3725178837776184, "learning_rate": 4.946119435657738e-05, "loss": 9.3083, "step": 468 }, { "epoch": 0.22073185080597718, "grad_norm": 0.34573477506637573, "learning_rate": 4.945718055875684e-05, "loss": 9.3972, "step": 469 }, { "epoch": 0.22120249441110718, "grad_norm": 0.4900851845741272, "learning_rate": 4.945315203017795e-05, "loss": 8.8847, "step": 470 }, { "epoch": 0.2216731380162372, "grad_norm": 0.3375721871852875, "learning_rate": 4.944910877326709e-05, "loss": 9.3369, "step": 471 }, { "epoch": 0.22214378162136722, "grad_norm": 0.38274478912353516, "learning_rate": 4.944505079045958e-05, "loss": 9.2587, "step": 472 }, { "epoch": 0.22261442522649724, "grad_norm": 0.45915624499320984, "learning_rate": 4.944097808419955e-05, "loss": 8.6162, "step": 473 }, { "epoch": 0.22308506883162724, "grad_norm": 0.4436270296573639, "learning_rate": 4.9436890656940045e-05, "loss": 8.9692, "step": 474 }, { "epoch": 0.22355571243675726, "grad_norm": 0.44073861837387085, "learning_rate": 4.943278851114293e-05, "loss": 8.6524, "step": 475 }, { "epoch": 0.22402635604188728, "grad_norm": 0.37401431798934937, "learning_rate": 4.942867164927899e-05, "loss": 9.3269, "step": 476 }, { "epoch": 0.2244969996470173, "grad_norm": 0.36092767119407654, "learning_rate": 4.942454007382782e-05, "loss": 9.0893, "step": 477 }, { "epoch": 0.2249676432521473, "grad_norm": 0.46312302350997925, "learning_rate": 4.9420393787277917e-05, "loss": 9.3986, "step": 478 }, { "epoch": 0.22543828685727732, "grad_norm": 0.339429646730423, "learning_rate": 4.9416232792126615e-05, "loss": 9.3501, "step": 479 }, { "epoch": 0.22590893046240734, "grad_norm": 0.4019092917442322, "learning_rate": 4.941205709088011e-05, "loss": 8.8818, "step": 480 }, { "epoch": 0.22637957406753736, "grad_norm": 0.4025574028491974, "learning_rate": 4.940786668605348e-05, "loss": 9.0087, "step": 481 }, { "epoch": 0.22685021767266736, "grad_norm": 0.41925379633903503, "learning_rate": 4.9403661580170626e-05, "loss": 9.0019, "step": 482 }, { "epoch": 0.22732086127779738, "grad_norm": 0.38912633061408997, "learning_rate": 4.939944177576432e-05, "loss": 9.4554, "step": 483 }, { "epoch": 0.2277915048829274, "grad_norm": 0.3775523602962494, "learning_rate": 4.9395207275376175e-05, "loss": 8.911, "step": 484 }, { "epoch": 0.22826214848805743, "grad_norm": 0.37626808881759644, "learning_rate": 4.939095808155668e-05, "loss": 8.9951, "step": 485 }, { "epoch": 0.22873279209318742, "grad_norm": 0.4059127867221832, "learning_rate": 4.938669419686516e-05, "loss": 9.0841, "step": 486 }, { "epoch": 0.22920343569831744, "grad_norm": 0.35881519317626953, "learning_rate": 4.938241562386977e-05, "loss": 9.2341, "step": 487 }, { "epoch": 0.22967407930344746, "grad_norm": 0.42100849747657776, "learning_rate": 4.9378122365147536e-05, "loss": 9.0711, "step": 488 }, { "epoch": 0.2301447229085775, "grad_norm": 0.4081602394580841, "learning_rate": 4.9373814423284336e-05, "loss": 9.0102, "step": 489 }, { "epoch": 0.23061536651370748, "grad_norm": 0.3893739581108093, "learning_rate": 4.936949180087486e-05, "loss": 9.1481, "step": 490 }, { "epoch": 0.2310860101188375, "grad_norm": 0.38784539699554443, "learning_rate": 4.936515450052267e-05, "loss": 9.2699, "step": 491 }, { "epoch": 0.23155665372396753, "grad_norm": 0.39232099056243896, "learning_rate": 4.9360802524840156e-05, "loss": 9.1015, "step": 492 }, { "epoch": 0.23202729732909755, "grad_norm": 0.4174420237541199, "learning_rate": 4.935643587644855e-05, "loss": 8.8689, "step": 493 }, { "epoch": 0.23249794093422754, "grad_norm": 0.3970744013786316, "learning_rate": 4.9352054557977905e-05, "loss": 9.134, "step": 494 }, { "epoch": 0.23296858453935756, "grad_norm": 0.34588709473609924, "learning_rate": 4.934765857206715e-05, "loss": 9.1163, "step": 495 }, { "epoch": 0.2334392281444876, "grad_norm": 0.38045328855514526, "learning_rate": 4.934324792136399e-05, "loss": 9.2736, "step": 496 }, { "epoch": 0.2339098717496176, "grad_norm": 0.3795531094074249, "learning_rate": 4.9338822608525027e-05, "loss": 9.2326, "step": 497 }, { "epoch": 0.2343805153547476, "grad_norm": 0.3959232270717621, "learning_rate": 4.9334382636215646e-05, "loss": 9.2973, "step": 498 }, { "epoch": 0.23485115895987763, "grad_norm": 0.40320464968681335, "learning_rate": 4.932992800711009e-05, "loss": 8.8766, "step": 499 }, { "epoch": 0.23532180256500765, "grad_norm": 0.35472753643989563, "learning_rate": 4.9325458723891405e-05, "loss": 9.2191, "step": 500 }, { "epoch": 0.23579244617013767, "grad_norm": 0.40472298860549927, "learning_rate": 4.932097478925148e-05, "loss": 8.8783, "step": 501 }, { "epoch": 0.23626308977526767, "grad_norm": 0.4293891489505768, "learning_rate": 4.931647620589104e-05, "loss": 8.4516, "step": 502 }, { "epoch": 0.2367337333803977, "grad_norm": 0.3897256851196289, "learning_rate": 4.9311962976519586e-05, "loss": 9.2541, "step": 503 }, { "epoch": 0.2372043769855277, "grad_norm": 0.36981016397476196, "learning_rate": 4.9307435103855507e-05, "loss": 9.0664, "step": 504 }, { "epoch": 0.23767502059065773, "grad_norm": 0.4339733421802521, "learning_rate": 4.930289259062596e-05, "loss": 9.2965, "step": 505 }, { "epoch": 0.23814566419578773, "grad_norm": 0.4204358756542206, "learning_rate": 4.9298335439566946e-05, "loss": 9.0738, "step": 506 }, { "epoch": 0.23861630780091775, "grad_norm": 0.3759208023548126, "learning_rate": 4.929376365342326e-05, "loss": 9.5119, "step": 507 }, { "epoch": 0.23908695140604777, "grad_norm": 0.3684697151184082, "learning_rate": 4.9289177234948535e-05, "loss": 9.338, "step": 508 }, { "epoch": 0.2395575950111778, "grad_norm": 0.40956175327301025, "learning_rate": 4.928457618690522e-05, "loss": 9.0164, "step": 509 }, { "epoch": 0.2400282386163078, "grad_norm": 0.4373653829097748, "learning_rate": 4.927996051206454e-05, "loss": 8.4385, "step": 510 }, { "epoch": 0.2404988822214378, "grad_norm": 0.3845258951187134, "learning_rate": 4.927533021320657e-05, "loss": 9.3247, "step": 511 }, { "epoch": 0.24096952582656783, "grad_norm": 0.3763442039489746, "learning_rate": 4.9270685293120164e-05, "loss": 9.357, "step": 512 }, { "epoch": 0.24144016943169785, "grad_norm": 0.4450169503688812, "learning_rate": 4.9266025754603005e-05, "loss": 8.5107, "step": 513 }, { "epoch": 0.24191081303682785, "grad_norm": 0.41103556752204895, "learning_rate": 4.926135160046157e-05, "loss": 9.3063, "step": 514 }, { "epoch": 0.24238145664195787, "grad_norm": 0.4856661856174469, "learning_rate": 4.925666283351114e-05, "loss": 8.7831, "step": 515 }, { "epoch": 0.2428521002470879, "grad_norm": 0.3764643371105194, "learning_rate": 4.92519594565758e-05, "loss": 9.0384, "step": 516 }, { "epoch": 0.24332274385221792, "grad_norm": 0.3988141417503357, "learning_rate": 4.924724147248841e-05, "loss": 9.1045, "step": 517 }, { "epoch": 0.2437933874573479, "grad_norm": 0.3450901210308075, "learning_rate": 4.924250888409069e-05, "loss": 9.3091, "step": 518 }, { "epoch": 0.24426403106247793, "grad_norm": 0.4347275495529175, "learning_rate": 4.923776169423309e-05, "loss": 9.115, "step": 519 }, { "epoch": 0.24473467466760795, "grad_norm": 0.36428380012512207, "learning_rate": 4.923299990577488e-05, "loss": 9.0645, "step": 520 }, { "epoch": 0.24520531827273798, "grad_norm": 0.4311101734638214, "learning_rate": 4.922822352158412e-05, "loss": 8.7247, "step": 521 }, { "epoch": 0.24567596187786797, "grad_norm": 0.4824456572532654, "learning_rate": 4.922343254453768e-05, "loss": 8.7448, "step": 522 }, { "epoch": 0.246146605482998, "grad_norm": 0.5465502738952637, "learning_rate": 4.9218626977521206e-05, "loss": 8.471, "step": 523 }, { "epoch": 0.24661724908812802, "grad_norm": 0.4191696047782898, "learning_rate": 4.921380682342912e-05, "loss": 8.4572, "step": 524 }, { "epoch": 0.24708789269325804, "grad_norm": 0.40454065799713135, "learning_rate": 4.920897208516464e-05, "loss": 9.3254, "step": 525 }, { "epoch": 0.24755853629838803, "grad_norm": 0.36362919211387634, "learning_rate": 4.920412276563977e-05, "loss": 9.4725, "step": 526 }, { "epoch": 0.24802917990351805, "grad_norm": 0.38239118456840515, "learning_rate": 4.91992588677753e-05, "loss": 8.5503, "step": 527 }, { "epoch": 0.24849982350864808, "grad_norm": 0.3423115015029907, "learning_rate": 4.919438039450078e-05, "loss": 9.294, "step": 528 }, { "epoch": 0.2489704671137781, "grad_norm": 0.3812299966812134, "learning_rate": 4.918948734875457e-05, "loss": 9.374, "step": 529 }, { "epoch": 0.2494411107189081, "grad_norm": 0.5085097551345825, "learning_rate": 4.9184579733483796e-05, "loss": 8.5979, "step": 530 }, { "epoch": 0.24991175432403812, "grad_norm": 0.34993723034858704, "learning_rate": 4.917965755164433e-05, "loss": 9.4077, "step": 531 } ], "logging_steps": 1, "max_steps": 4248, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 531, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.093280422836306e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }