{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3820621806198959, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012735406020663196, "grad_norm": 7.271933690036313, "learning_rate": 4.237288135593221e-08, "loss": 1.8539, "step": 1 }, { "epoch": 0.0002547081204132639, "grad_norm": 9.658086805732816, "learning_rate": 8.474576271186442e-08, "loss": 2.0669, "step": 2 }, { "epoch": 0.0003820621806198959, "grad_norm": 11.484017585849884, "learning_rate": 1.2711864406779662e-07, "loss": 2.2215, "step": 3 }, { "epoch": 0.0005094162408265278, "grad_norm": 5.540318356122022, "learning_rate": 1.6949152542372883e-07, "loss": 1.8601, "step": 4 }, { "epoch": 0.0006367703010331598, "grad_norm": 8.72124267822763, "learning_rate": 2.1186440677966102e-07, "loss": 2.036, "step": 5 }, { "epoch": 0.0007641243612397918, "grad_norm": 5.653927826902831, "learning_rate": 2.5423728813559323e-07, "loss": 1.6908, "step": 6 }, { "epoch": 0.0008914784214464238, "grad_norm": 6.524121350290057, "learning_rate": 2.966101694915255e-07, "loss": 1.8968, "step": 7 }, { "epoch": 0.0010188324816530557, "grad_norm": 6.459540108765332, "learning_rate": 3.3898305084745766e-07, "loss": 1.7855, "step": 8 }, { "epoch": 0.0011461865418596878, "grad_norm": 9.04352851601738, "learning_rate": 3.813559322033899e-07, "loss": 2.0275, "step": 9 }, { "epoch": 0.0012735406020663196, "grad_norm": 5.189147687544124, "learning_rate": 4.2372881355932204e-07, "loss": 1.7905, "step": 10 }, { "epoch": 0.0014008946622729515, "grad_norm": 7.8253156604412375, "learning_rate": 4.661016949152543e-07, "loss": 1.9746, "step": 11 }, { "epoch": 0.0015282487224795836, "grad_norm": 8.244971390044737, "learning_rate": 5.084745762711865e-07, "loss": 1.9675, "step": 12 }, { "epoch": 0.0016556027826862155, "grad_norm": 9.407911793872701, "learning_rate": 5.508474576271187e-07, "loss": 1.8633, "step": 13 }, { "epoch": 0.0017829568428928476, "grad_norm": 7.269078375368235, "learning_rate": 5.93220338983051e-07, "loss": 1.9822, "step": 14 }, { "epoch": 0.0019103109030994794, "grad_norm": 6.836825277174009, "learning_rate": 6.355932203389831e-07, "loss": 1.8575, "step": 15 }, { "epoch": 0.0020376649633061113, "grad_norm": 7.472620005302849, "learning_rate": 6.779661016949153e-07, "loss": 1.7522, "step": 16 }, { "epoch": 0.002165019023512743, "grad_norm": 5.337881334605893, "learning_rate": 7.203389830508476e-07, "loss": 1.623, "step": 17 }, { "epoch": 0.0022923730837193755, "grad_norm": 9.373863863448666, "learning_rate": 7.627118644067798e-07, "loss": 1.9805, "step": 18 }, { "epoch": 0.0024197271439260074, "grad_norm": 7.718845629389268, "learning_rate": 8.050847457627118e-07, "loss": 1.807, "step": 19 }, { "epoch": 0.0025470812041326393, "grad_norm": 7.130785818146366, "learning_rate": 8.474576271186441e-07, "loss": 1.824, "step": 20 }, { "epoch": 0.002674435264339271, "grad_norm": 6.681287500260171, "learning_rate": 8.898305084745763e-07, "loss": 1.8496, "step": 21 }, { "epoch": 0.002801789324545903, "grad_norm": 6.872143516134369, "learning_rate": 9.322033898305086e-07, "loss": 1.8806, "step": 22 }, { "epoch": 0.0029291433847525353, "grad_norm": 4.705998046939443, "learning_rate": 9.745762711864408e-07, "loss": 1.634, "step": 23 }, { "epoch": 0.003056497444959167, "grad_norm": 11.506522277892683, "learning_rate": 1.016949152542373e-06, "loss": 2.1845, "step": 24 }, { "epoch": 0.003183851505165799, "grad_norm": 11.312083874209405, "learning_rate": 1.059322033898305e-06, "loss": 1.8857, "step": 25 }, { "epoch": 0.003311205565372431, "grad_norm": 6.4627660818311226, "learning_rate": 1.1016949152542374e-06, "loss": 1.8913, "step": 26 }, { "epoch": 0.003438559625579063, "grad_norm": 7.32229167356663, "learning_rate": 1.1440677966101696e-06, "loss": 1.9071, "step": 27 }, { "epoch": 0.003565913685785695, "grad_norm": 5.958980854124657, "learning_rate": 1.186440677966102e-06, "loss": 1.7951, "step": 28 }, { "epoch": 0.003693267745992327, "grad_norm": 7.968036694484309, "learning_rate": 1.228813559322034e-06, "loss": 1.9683, "step": 29 }, { "epoch": 0.003820621806198959, "grad_norm": 6.087713606766322, "learning_rate": 1.2711864406779662e-06, "loss": 1.7463, "step": 30 }, { "epoch": 0.003947975866405591, "grad_norm": 7.15380657743577, "learning_rate": 1.3135593220338985e-06, "loss": 1.8375, "step": 31 }, { "epoch": 0.004075329926612223, "grad_norm": 6.99281487234341, "learning_rate": 1.3559322033898307e-06, "loss": 2.0248, "step": 32 }, { "epoch": 0.0042026839868188545, "grad_norm": 6.702985570920233, "learning_rate": 1.3983050847457628e-06, "loss": 1.9517, "step": 33 }, { "epoch": 0.004330038047025486, "grad_norm": 7.265232789313097, "learning_rate": 1.4406779661016951e-06, "loss": 1.9653, "step": 34 }, { "epoch": 0.004457392107232118, "grad_norm": 11.115946654222988, "learning_rate": 1.4830508474576273e-06, "loss": 2.2556, "step": 35 }, { "epoch": 0.004584746167438751, "grad_norm": 7.299990917879491, "learning_rate": 1.5254237288135596e-06, "loss": 1.8916, "step": 36 }, { "epoch": 0.004712100227645383, "grad_norm": 8.757682697242801, "learning_rate": 1.5677966101694915e-06, "loss": 1.9587, "step": 37 }, { "epoch": 0.004839454287852015, "grad_norm": 6.31538702786833, "learning_rate": 1.6101694915254237e-06, "loss": 1.6702, "step": 38 }, { "epoch": 0.004966808348058647, "grad_norm": 5.678276211998329, "learning_rate": 1.652542372881356e-06, "loss": 1.6698, "step": 39 }, { "epoch": 0.0050941624082652785, "grad_norm": 6.5502751882271255, "learning_rate": 1.6949152542372882e-06, "loss": 1.6604, "step": 40 }, { "epoch": 0.00522151646847191, "grad_norm": 7.378033030484872, "learning_rate": 1.7372881355932205e-06, "loss": 1.8324, "step": 41 }, { "epoch": 0.005348870528678542, "grad_norm": 15.420303713013, "learning_rate": 1.7796610169491526e-06, "loss": 2.0715, "step": 42 }, { "epoch": 0.005476224588885174, "grad_norm": 5.720408048146975, "learning_rate": 1.8220338983050848e-06, "loss": 1.723, "step": 43 }, { "epoch": 0.005603578649091806, "grad_norm": 6.303637032995317, "learning_rate": 1.8644067796610171e-06, "loss": 1.7073, "step": 44 }, { "epoch": 0.005730932709298438, "grad_norm": 6.278397704866944, "learning_rate": 1.9067796610169493e-06, "loss": 1.6899, "step": 45 }, { "epoch": 0.005858286769505071, "grad_norm": 9.400638363853234, "learning_rate": 1.9491525423728816e-06, "loss": 2.0536, "step": 46 }, { "epoch": 0.0059856408297117025, "grad_norm": 11.772339058952785, "learning_rate": 1.9915254237288137e-06, "loss": 1.9004, "step": 47 }, { "epoch": 0.006112994889918334, "grad_norm": 6.097740079901016, "learning_rate": 2.033898305084746e-06, "loss": 1.7205, "step": 48 }, { "epoch": 0.006240348950124966, "grad_norm": 7.00755338498105, "learning_rate": 2.076271186440678e-06, "loss": 1.8041, "step": 49 }, { "epoch": 0.006367703010331598, "grad_norm": 6.9732164515856, "learning_rate": 2.11864406779661e-06, "loss": 1.7358, "step": 50 }, { "epoch": 0.00649505707053823, "grad_norm": 6.583437987191887, "learning_rate": 2.1610169491525427e-06, "loss": 1.7373, "step": 51 }, { "epoch": 0.006622411130744862, "grad_norm": 8.700856431922421, "learning_rate": 2.203389830508475e-06, "loss": 1.8176, "step": 52 }, { "epoch": 0.006749765190951494, "grad_norm": 6.11722055087655, "learning_rate": 2.245762711864407e-06, "loss": 1.6658, "step": 53 }, { "epoch": 0.006877119251158126, "grad_norm": 6.397543745767114, "learning_rate": 2.288135593220339e-06, "loss": 1.761, "step": 54 }, { "epoch": 0.0070044733113647575, "grad_norm": 9.611483508510796, "learning_rate": 2.3305084745762712e-06, "loss": 1.9658, "step": 55 }, { "epoch": 0.00713182737157139, "grad_norm": 9.349587138643392, "learning_rate": 2.372881355932204e-06, "loss": 1.7335, "step": 56 }, { "epoch": 0.007259181431778022, "grad_norm": 6.522174278890342, "learning_rate": 2.415254237288136e-06, "loss": 1.6166, "step": 57 }, { "epoch": 0.007386535491984654, "grad_norm": 7.750941557755308, "learning_rate": 2.457627118644068e-06, "loss": 1.7544, "step": 58 }, { "epoch": 0.007513889552191286, "grad_norm": 6.912297289671559, "learning_rate": 2.5e-06, "loss": 1.7011, "step": 59 }, { "epoch": 0.007641243612397918, "grad_norm": 7.984840812457877, "learning_rate": 2.5423728813559323e-06, "loss": 1.6993, "step": 60 }, { "epoch": 0.00776859767260455, "grad_norm": 7.121099642585329, "learning_rate": 2.5847457627118645e-06, "loss": 1.6092, "step": 61 }, { "epoch": 0.007895951732811182, "grad_norm": 13.071363736303791, "learning_rate": 2.627118644067797e-06, "loss": 1.9803, "step": 62 }, { "epoch": 0.008023305793017814, "grad_norm": 7.054327363108295, "learning_rate": 2.669491525423729e-06, "loss": 1.5912, "step": 63 }, { "epoch": 0.008150659853224445, "grad_norm": 8.19110074835651, "learning_rate": 2.7118644067796613e-06, "loss": 1.6463, "step": 64 }, { "epoch": 0.008278013913431078, "grad_norm": 7.584386759380691, "learning_rate": 2.7542372881355934e-06, "loss": 1.6676, "step": 65 }, { "epoch": 0.008405367973637709, "grad_norm": 13.138200788018468, "learning_rate": 2.7966101694915256e-06, "loss": 1.8309, "step": 66 }, { "epoch": 0.008532722033844342, "grad_norm": 9.75600502737869, "learning_rate": 2.838983050847458e-06, "loss": 1.8151, "step": 67 }, { "epoch": 0.008660076094050973, "grad_norm": 9.03322022802435, "learning_rate": 2.8813559322033903e-06, "loss": 1.7743, "step": 68 }, { "epoch": 0.008787430154257606, "grad_norm": 7.31942395137626, "learning_rate": 2.9237288135593224e-06, "loss": 1.4732, "step": 69 }, { "epoch": 0.008914784214464237, "grad_norm": 7.29859110156687, "learning_rate": 2.9661016949152545e-06, "loss": 1.5354, "step": 70 }, { "epoch": 0.00904213827467087, "grad_norm": 7.57838373430734, "learning_rate": 3.0084745762711862e-06, "loss": 1.6728, "step": 71 }, { "epoch": 0.009169492334877502, "grad_norm": 8.627342016718561, "learning_rate": 3.0508474576271192e-06, "loss": 1.5038, "step": 72 }, { "epoch": 0.009296846395084133, "grad_norm": 11.103801289476134, "learning_rate": 3.0932203389830514e-06, "loss": 1.6997, "step": 73 }, { "epoch": 0.009424200455290766, "grad_norm": 7.695175185164474, "learning_rate": 3.135593220338983e-06, "loss": 1.6034, "step": 74 }, { "epoch": 0.009551554515497397, "grad_norm": 7.476937502840767, "learning_rate": 3.1779661016949152e-06, "loss": 1.7081, "step": 75 }, { "epoch": 0.00967890857570403, "grad_norm": 8.36338889468324, "learning_rate": 3.2203389830508473e-06, "loss": 1.7367, "step": 76 }, { "epoch": 0.00980626263591066, "grad_norm": 7.377402480026078, "learning_rate": 3.26271186440678e-06, "loss": 1.5538, "step": 77 }, { "epoch": 0.009933616696117293, "grad_norm": 7.682228824336897, "learning_rate": 3.305084745762712e-06, "loss": 1.6121, "step": 78 }, { "epoch": 0.010060970756323924, "grad_norm": 9.349692811866577, "learning_rate": 3.347457627118644e-06, "loss": 1.6372, "step": 79 }, { "epoch": 0.010188324816530557, "grad_norm": 7.284498284211861, "learning_rate": 3.3898305084745763e-06, "loss": 1.6672, "step": 80 }, { "epoch": 0.01031567887673719, "grad_norm": 7.870893588490932, "learning_rate": 3.4322033898305084e-06, "loss": 1.581, "step": 81 }, { "epoch": 0.01044303293694382, "grad_norm": 7.532864852561931, "learning_rate": 3.474576271186441e-06, "loss": 1.5485, "step": 82 }, { "epoch": 0.010570386997150454, "grad_norm": 7.315126387572679, "learning_rate": 3.516949152542373e-06, "loss": 1.462, "step": 83 }, { "epoch": 0.010697741057357085, "grad_norm": 9.982699879040739, "learning_rate": 3.5593220338983053e-06, "loss": 1.6831, "step": 84 }, { "epoch": 0.010825095117563717, "grad_norm": 6.745164905565429, "learning_rate": 3.6016949152542374e-06, "loss": 1.4625, "step": 85 }, { "epoch": 0.010952449177770348, "grad_norm": 7.396636518107798, "learning_rate": 3.6440677966101695e-06, "loss": 1.4898, "step": 86 }, { "epoch": 0.011079803237976981, "grad_norm": 6.530327106595857, "learning_rate": 3.686440677966102e-06, "loss": 1.4851, "step": 87 }, { "epoch": 0.011207157298183612, "grad_norm": 7.280031290511072, "learning_rate": 3.7288135593220342e-06, "loss": 1.5776, "step": 88 }, { "epoch": 0.011334511358390245, "grad_norm": 7.258807653030983, "learning_rate": 3.7711864406779664e-06, "loss": 1.493, "step": 89 }, { "epoch": 0.011461865418596876, "grad_norm": 6.780060975917414, "learning_rate": 3.8135593220338985e-06, "loss": 1.497, "step": 90 }, { "epoch": 0.011589219478803509, "grad_norm": 5.299820792569274, "learning_rate": 3.8559322033898315e-06, "loss": 1.3391, "step": 91 }, { "epoch": 0.011716573539010141, "grad_norm": 11.906512079833732, "learning_rate": 3.898305084745763e-06, "loss": 1.7044, "step": 92 }, { "epoch": 0.011843927599216772, "grad_norm": 13.750145928735572, "learning_rate": 3.940677966101695e-06, "loss": 1.3373, "step": 93 }, { "epoch": 0.011971281659423405, "grad_norm": 6.2094515960377015, "learning_rate": 3.9830508474576275e-06, "loss": 1.4218, "step": 94 }, { "epoch": 0.012098635719630036, "grad_norm": 8.702043175020558, "learning_rate": 4.025423728813559e-06, "loss": 1.5958, "step": 95 }, { "epoch": 0.012225989779836669, "grad_norm": 10.530891389712178, "learning_rate": 4.067796610169492e-06, "loss": 1.3869, "step": 96 }, { "epoch": 0.0123533438400433, "grad_norm": 8.382484394062372, "learning_rate": 4.110169491525424e-06, "loss": 1.4319, "step": 97 }, { "epoch": 0.012480697900249933, "grad_norm": 8.65764730585386, "learning_rate": 4.152542372881356e-06, "loss": 1.4986, "step": 98 }, { "epoch": 0.012608051960456564, "grad_norm": 6.733364941110484, "learning_rate": 4.1949152542372886e-06, "loss": 1.3344, "step": 99 }, { "epoch": 0.012735406020663196, "grad_norm": 7.635913980134383, "learning_rate": 4.23728813559322e-06, "loss": 1.4161, "step": 100 }, { "epoch": 0.012862760080869829, "grad_norm": 10.236577619610213, "learning_rate": 4.279661016949153e-06, "loss": 1.5962, "step": 101 }, { "epoch": 0.01299011414107646, "grad_norm": 5.526567621635844, "learning_rate": 4.322033898305085e-06, "loss": 1.2936, "step": 102 }, { "epoch": 0.013117468201283093, "grad_norm": 7.496201634189791, "learning_rate": 4.364406779661017e-06, "loss": 1.4379, "step": 103 }, { "epoch": 0.013244822261489724, "grad_norm": 6.365117809706692, "learning_rate": 4.40677966101695e-06, "loss": 1.3969, "step": 104 }, { "epoch": 0.013372176321696357, "grad_norm": 4.970383669437514, "learning_rate": 4.449152542372881e-06, "loss": 1.4318, "step": 105 }, { "epoch": 0.013499530381902988, "grad_norm": 6.540428319781601, "learning_rate": 4.491525423728814e-06, "loss": 1.4254, "step": 106 }, { "epoch": 0.01362688444210962, "grad_norm": 5.759607332938477, "learning_rate": 4.5338983050847465e-06, "loss": 1.4332, "step": 107 }, { "epoch": 0.013754238502316251, "grad_norm": 8.31020834173882, "learning_rate": 4.576271186440678e-06, "loss": 1.4719, "step": 108 }, { "epoch": 0.013881592562522884, "grad_norm": 7.340003043857536, "learning_rate": 4.618644067796611e-06, "loss": 1.3971, "step": 109 }, { "epoch": 0.014008946622729515, "grad_norm": 6.511784361547076, "learning_rate": 4.6610169491525425e-06, "loss": 1.4415, "step": 110 }, { "epoch": 0.014136300682936148, "grad_norm": 6.731671633748867, "learning_rate": 4.703389830508475e-06, "loss": 1.4395, "step": 111 }, { "epoch": 0.01426365474314278, "grad_norm": 6.905671660774626, "learning_rate": 4.745762711864408e-06, "loss": 1.365, "step": 112 }, { "epoch": 0.014391008803349412, "grad_norm": 6.430425515815662, "learning_rate": 4.788135593220339e-06, "loss": 1.3832, "step": 113 }, { "epoch": 0.014518362863556044, "grad_norm": 7.435499419092716, "learning_rate": 4.830508474576272e-06, "loss": 1.4609, "step": 114 }, { "epoch": 0.014645716923762675, "grad_norm": 5.4419059003708625, "learning_rate": 4.872881355932204e-06, "loss": 1.3687, "step": 115 }, { "epoch": 0.014773070983969308, "grad_norm": 5.480813981464255, "learning_rate": 4.915254237288136e-06, "loss": 1.3259, "step": 116 }, { "epoch": 0.014900425044175939, "grad_norm": 19.487504134260305, "learning_rate": 4.957627118644069e-06, "loss": 1.2768, "step": 117 }, { "epoch": 0.015027779104382572, "grad_norm": 6.2148980714352895, "learning_rate": 5e-06, "loss": 1.3555, "step": 118 }, { "epoch": 0.015155133164589203, "grad_norm": 5.838187604837424, "learning_rate": 5.042372881355932e-06, "loss": 1.3501, "step": 119 }, { "epoch": 0.015282487224795836, "grad_norm": 5.3091089533384, "learning_rate": 5.084745762711865e-06, "loss": 1.3454, "step": 120 }, { "epoch": 0.015409841285002468, "grad_norm": 7.12967595373887, "learning_rate": 5.127118644067796e-06, "loss": 1.2718, "step": 121 }, { "epoch": 0.0155371953452091, "grad_norm": 5.081241156275937, "learning_rate": 5.169491525423729e-06, "loss": 1.306, "step": 122 }, { "epoch": 0.015664549405415732, "grad_norm": 5.1678133686722605, "learning_rate": 5.211864406779662e-06, "loss": 1.2662, "step": 123 }, { "epoch": 0.015791903465622363, "grad_norm": 6.609829212931382, "learning_rate": 5.254237288135594e-06, "loss": 1.4107, "step": 124 }, { "epoch": 0.015919257525828994, "grad_norm": 5.390686388633695, "learning_rate": 5.296610169491526e-06, "loss": 1.3657, "step": 125 }, { "epoch": 0.01604661158603563, "grad_norm": 4.716304726524517, "learning_rate": 5.338983050847458e-06, "loss": 1.285, "step": 126 }, { "epoch": 0.01617396564624226, "grad_norm": 7.442946547443793, "learning_rate": 5.38135593220339e-06, "loss": 1.5355, "step": 127 }, { "epoch": 0.01630131970644889, "grad_norm": 6.08763747167752, "learning_rate": 5.423728813559323e-06, "loss": 1.225, "step": 128 }, { "epoch": 0.01642867376665552, "grad_norm": 7.440177884180067, "learning_rate": 5.466101694915254e-06, "loss": 1.4228, "step": 129 }, { "epoch": 0.016556027826862156, "grad_norm": 7.716718027865346, "learning_rate": 5.508474576271187e-06, "loss": 1.4903, "step": 130 }, { "epoch": 0.016683381887068787, "grad_norm": 4.940675318037667, "learning_rate": 5.550847457627119e-06, "loss": 1.3298, "step": 131 }, { "epoch": 0.016810735947275418, "grad_norm": 6.172436497895839, "learning_rate": 5.593220338983051e-06, "loss": 1.334, "step": 132 }, { "epoch": 0.016938090007482053, "grad_norm": 4.577141665939358, "learning_rate": 5.635593220338984e-06, "loss": 1.3172, "step": 133 }, { "epoch": 0.017065444067688684, "grad_norm": 4.928191476396523, "learning_rate": 5.677966101694916e-06, "loss": 1.2845, "step": 134 }, { "epoch": 0.017192798127895315, "grad_norm": 5.175516465809592, "learning_rate": 5.720338983050848e-06, "loss": 1.3261, "step": 135 }, { "epoch": 0.017320152188101946, "grad_norm": 5.868259215222263, "learning_rate": 5.7627118644067805e-06, "loss": 1.3385, "step": 136 }, { "epoch": 0.01744750624830858, "grad_norm": 6.6858498507159405, "learning_rate": 5.805084745762712e-06, "loss": 1.3098, "step": 137 }, { "epoch": 0.01757486030851521, "grad_norm": 5.492215282646657, "learning_rate": 5.847457627118645e-06, "loss": 1.2635, "step": 138 }, { "epoch": 0.017702214368721842, "grad_norm": 10.420122310923675, "learning_rate": 5.8898305084745765e-06, "loss": 1.3717, "step": 139 }, { "epoch": 0.017829568428928473, "grad_norm": 5.144524576447836, "learning_rate": 5.932203389830509e-06, "loss": 1.2734, "step": 140 }, { "epoch": 0.017956922489135108, "grad_norm": 6.264504044638567, "learning_rate": 5.974576271186441e-06, "loss": 1.318, "step": 141 }, { "epoch": 0.01808427654934174, "grad_norm": 5.740016397060673, "learning_rate": 6.0169491525423725e-06, "loss": 1.3783, "step": 142 }, { "epoch": 0.01821163060954837, "grad_norm": 6.251052157530912, "learning_rate": 6.059322033898306e-06, "loss": 1.3595, "step": 143 }, { "epoch": 0.018338984669755004, "grad_norm": 5.523655392073338, "learning_rate": 6.1016949152542385e-06, "loss": 1.1841, "step": 144 }, { "epoch": 0.018466338729961635, "grad_norm": 5.937324601644495, "learning_rate": 6.14406779661017e-06, "loss": 1.328, "step": 145 }, { "epoch": 0.018593692790168266, "grad_norm": 5.449422459229983, "learning_rate": 6.186440677966103e-06, "loss": 1.3431, "step": 146 }, { "epoch": 0.018721046850374897, "grad_norm": 7.690879017131419, "learning_rate": 6.2288135593220344e-06, "loss": 1.3203, "step": 147 }, { "epoch": 0.01884840091058153, "grad_norm": 5.27418216675118, "learning_rate": 6.271186440677966e-06, "loss": 1.2765, "step": 148 }, { "epoch": 0.018975754970788163, "grad_norm": 6.17835986202785, "learning_rate": 6.313559322033899e-06, "loss": 1.2689, "step": 149 }, { "epoch": 0.019103109030994794, "grad_norm": 5.636045335076978, "learning_rate": 6.3559322033898304e-06, "loss": 1.3782, "step": 150 }, { "epoch": 0.019230463091201428, "grad_norm": 5.108084699461727, "learning_rate": 6.398305084745763e-06, "loss": 1.3202, "step": 151 }, { "epoch": 0.01935781715140806, "grad_norm": 6.525588214420939, "learning_rate": 6.440677966101695e-06, "loss": 1.3457, "step": 152 }, { "epoch": 0.01948517121161469, "grad_norm": 5.606250966219782, "learning_rate": 6.483050847457628e-06, "loss": 1.3161, "step": 153 }, { "epoch": 0.01961252527182132, "grad_norm": 5.588206693263131, "learning_rate": 6.52542372881356e-06, "loss": 1.3518, "step": 154 }, { "epoch": 0.019739879332027956, "grad_norm": 5.148154081610033, "learning_rate": 6.567796610169492e-06, "loss": 1.2552, "step": 155 }, { "epoch": 0.019867233392234587, "grad_norm": 7.284755141437708, "learning_rate": 6.610169491525424e-06, "loss": 1.2592, "step": 156 }, { "epoch": 0.019994587452441218, "grad_norm": 4.314617134869305, "learning_rate": 6.652542372881357e-06, "loss": 1.1922, "step": 157 }, { "epoch": 0.02012194151264785, "grad_norm": 5.301650153890583, "learning_rate": 6.694915254237288e-06, "loss": 1.2398, "step": 158 }, { "epoch": 0.020249295572854483, "grad_norm": 4.878119228831268, "learning_rate": 6.737288135593221e-06, "loss": 1.3827, "step": 159 }, { "epoch": 0.020376649633061114, "grad_norm": 6.819622271416848, "learning_rate": 6.779661016949153e-06, "loss": 1.3324, "step": 160 }, { "epoch": 0.020504003693267745, "grad_norm": 4.742557919660855, "learning_rate": 6.822033898305085e-06, "loss": 1.3472, "step": 161 }, { "epoch": 0.02063135775347438, "grad_norm": 5.689883895788964, "learning_rate": 6.864406779661017e-06, "loss": 1.1704, "step": 162 }, { "epoch": 0.02075871181368101, "grad_norm": 6.417111302943501, "learning_rate": 6.90677966101695e-06, "loss": 1.3367, "step": 163 }, { "epoch": 0.02088606587388764, "grad_norm": 6.942220645458798, "learning_rate": 6.949152542372882e-06, "loss": 1.2618, "step": 164 }, { "epoch": 0.021013419934094273, "grad_norm": 5.252453185448875, "learning_rate": 6.9915254237288146e-06, "loss": 1.2426, "step": 165 }, { "epoch": 0.021140773994300907, "grad_norm": 5.9624264758553185, "learning_rate": 7.033898305084746e-06, "loss": 1.514, "step": 166 }, { "epoch": 0.021268128054507538, "grad_norm": 7.006920527037309, "learning_rate": 7.076271186440679e-06, "loss": 1.2284, "step": 167 }, { "epoch": 0.02139548211471417, "grad_norm": 6.09704000229216, "learning_rate": 7.1186440677966106e-06, "loss": 1.3126, "step": 168 }, { "epoch": 0.0215228361749208, "grad_norm": 8.247528168259292, "learning_rate": 7.161016949152543e-06, "loss": 1.4338, "step": 169 }, { "epoch": 0.021650190235127435, "grad_norm": 6.171994905876386, "learning_rate": 7.203389830508475e-06, "loss": 1.2461, "step": 170 }, { "epoch": 0.021777544295334066, "grad_norm": 5.131775076203093, "learning_rate": 7.2457627118644065e-06, "loss": 1.2865, "step": 171 }, { "epoch": 0.021904898355540697, "grad_norm": 6.94815098159131, "learning_rate": 7.288135593220339e-06, "loss": 1.1561, "step": 172 }, { "epoch": 0.02203225241574733, "grad_norm": 6.040933218097003, "learning_rate": 7.3305084745762725e-06, "loss": 1.2328, "step": 173 }, { "epoch": 0.022159606475953962, "grad_norm": 5.685342232662009, "learning_rate": 7.372881355932204e-06, "loss": 1.2557, "step": 174 }, { "epoch": 0.022286960536160593, "grad_norm": 6.001107431572396, "learning_rate": 7.415254237288137e-06, "loss": 1.2557, "step": 175 }, { "epoch": 0.022414314596367224, "grad_norm": 6.392164571865962, "learning_rate": 7.4576271186440685e-06, "loss": 1.2892, "step": 176 }, { "epoch": 0.02254166865657386, "grad_norm": 6.654915116110552, "learning_rate": 7.500000000000001e-06, "loss": 1.2959, "step": 177 }, { "epoch": 0.02266902271678049, "grad_norm": 7.458014397932309, "learning_rate": 7.542372881355933e-06, "loss": 1.3309, "step": 178 }, { "epoch": 0.02279637677698712, "grad_norm": 7.500898180267626, "learning_rate": 7.5847457627118645e-06, "loss": 1.2727, "step": 179 }, { "epoch": 0.02292373083719375, "grad_norm": 5.733225361598391, "learning_rate": 7.627118644067797e-06, "loss": 1.3032, "step": 180 }, { "epoch": 0.023051084897400386, "grad_norm": 8.154695676999397, "learning_rate": 7.66949152542373e-06, "loss": 1.2128, "step": 181 }, { "epoch": 0.023178438957607017, "grad_norm": 6.985545902969932, "learning_rate": 7.711864406779663e-06, "loss": 1.3754, "step": 182 }, { "epoch": 0.023305793017813648, "grad_norm": 5.1080014310280655, "learning_rate": 7.754237288135595e-06, "loss": 1.2624, "step": 183 }, { "epoch": 0.023433147078020283, "grad_norm": 5.456029954386173, "learning_rate": 7.796610169491526e-06, "loss": 1.3119, "step": 184 }, { "epoch": 0.023560501138226914, "grad_norm": 5.870259096838373, "learning_rate": 7.838983050847458e-06, "loss": 1.3431, "step": 185 }, { "epoch": 0.023687855198433545, "grad_norm": 4.844651464438866, "learning_rate": 7.88135593220339e-06, "loss": 1.1957, "step": 186 }, { "epoch": 0.023815209258640176, "grad_norm": 4.881515257364843, "learning_rate": 7.923728813559323e-06, "loss": 1.2141, "step": 187 }, { "epoch": 0.02394256331884681, "grad_norm": 5.3909046929818265, "learning_rate": 7.966101694915255e-06, "loss": 1.2117, "step": 188 }, { "epoch": 0.02406991737905344, "grad_norm": 5.084112680862707, "learning_rate": 8.008474576271187e-06, "loss": 1.2686, "step": 189 }, { "epoch": 0.024197271439260072, "grad_norm": 6.606274321627709, "learning_rate": 8.050847457627118e-06, "loss": 1.269, "step": 190 }, { "epoch": 0.024324625499466707, "grad_norm": 4.587065315960808, "learning_rate": 8.093220338983052e-06, "loss": 1.1426, "step": 191 }, { "epoch": 0.024451979559673338, "grad_norm": 5.427227269739341, "learning_rate": 8.135593220338983e-06, "loss": 1.2607, "step": 192 }, { "epoch": 0.02457933361987997, "grad_norm": 7.6170325004261885, "learning_rate": 8.177966101694917e-06, "loss": 1.3308, "step": 193 }, { "epoch": 0.0247066876800866, "grad_norm": 6.3427616345396345, "learning_rate": 8.220338983050849e-06, "loss": 1.213, "step": 194 }, { "epoch": 0.024834041740293234, "grad_norm": 7.609799602341245, "learning_rate": 8.26271186440678e-06, "loss": 1.2924, "step": 195 }, { "epoch": 0.024961395800499865, "grad_norm": 4.2632262922802315, "learning_rate": 8.305084745762712e-06, "loss": 1.145, "step": 196 }, { "epoch": 0.025088749860706496, "grad_norm": 5.907637175889623, "learning_rate": 8.347457627118645e-06, "loss": 1.1883, "step": 197 }, { "epoch": 0.025216103920913127, "grad_norm": 5.60309063911008, "learning_rate": 8.389830508474577e-06, "loss": 1.2593, "step": 198 }, { "epoch": 0.02534345798111976, "grad_norm": 5.196493969572722, "learning_rate": 8.432203389830509e-06, "loss": 1.0949, "step": 199 }, { "epoch": 0.025470812041326393, "grad_norm": 4.282351386970627, "learning_rate": 8.47457627118644e-06, "loss": 1.2098, "step": 200 }, { "epoch": 0.025598166101533024, "grad_norm": 7.771190590780939, "learning_rate": 8.516949152542372e-06, "loss": 1.3432, "step": 201 }, { "epoch": 0.025725520161739658, "grad_norm": 6.684773397321345, "learning_rate": 8.559322033898306e-06, "loss": 1.3336, "step": 202 }, { "epoch": 0.02585287422194629, "grad_norm": 7.465165686739227, "learning_rate": 8.601694915254239e-06, "loss": 1.1817, "step": 203 }, { "epoch": 0.02598022828215292, "grad_norm": 7.086114075804347, "learning_rate": 8.64406779661017e-06, "loss": 1.239, "step": 204 }, { "epoch": 0.02610758234235955, "grad_norm": 5.267852583691561, "learning_rate": 8.686440677966103e-06, "loss": 1.2166, "step": 205 }, { "epoch": 0.026234936402566186, "grad_norm": 6.204765199065724, "learning_rate": 8.728813559322034e-06, "loss": 1.1705, "step": 206 }, { "epoch": 0.026362290462772817, "grad_norm": 5.8650519374099, "learning_rate": 8.771186440677966e-06, "loss": 1.1771, "step": 207 }, { "epoch": 0.026489644522979448, "grad_norm": 4.473193540326503, "learning_rate": 8.8135593220339e-06, "loss": 1.2766, "step": 208 }, { "epoch": 0.02661699858318608, "grad_norm": 6.466444701222969, "learning_rate": 8.855932203389831e-06, "loss": 1.1482, "step": 209 }, { "epoch": 0.026744352643392713, "grad_norm": 5.215909996040192, "learning_rate": 8.898305084745763e-06, "loss": 1.3104, "step": 210 }, { "epoch": 0.026871706703599344, "grad_norm": 4.8024398880969015, "learning_rate": 8.940677966101694e-06, "loss": 1.0803, "step": 211 }, { "epoch": 0.026999060763805975, "grad_norm": 6.601632130272579, "learning_rate": 8.983050847457628e-06, "loss": 1.3942, "step": 212 }, { "epoch": 0.02712641482401261, "grad_norm": 5.329161065013925, "learning_rate": 9.02542372881356e-06, "loss": 1.2741, "step": 213 }, { "epoch": 0.02725376888421924, "grad_norm": 5.287158798138966, "learning_rate": 9.067796610169493e-06, "loss": 1.2516, "step": 214 }, { "epoch": 0.02738112294442587, "grad_norm": 4.441068262343084, "learning_rate": 9.110169491525425e-06, "loss": 1.1225, "step": 215 }, { "epoch": 0.027508477004632503, "grad_norm": 5.781878133106383, "learning_rate": 9.152542372881356e-06, "loss": 1.3163, "step": 216 }, { "epoch": 0.027635831064839137, "grad_norm": 6.6835361034553715, "learning_rate": 9.194915254237288e-06, "loss": 1.2013, "step": 217 }, { "epoch": 0.027763185125045768, "grad_norm": 6.766142298984512, "learning_rate": 9.237288135593222e-06, "loss": 1.2307, "step": 218 }, { "epoch": 0.0278905391852524, "grad_norm": 5.126568996026534, "learning_rate": 9.279661016949153e-06, "loss": 1.204, "step": 219 }, { "epoch": 0.02801789324545903, "grad_norm": 5.481113953245, "learning_rate": 9.322033898305085e-06, "loss": 1.2129, "step": 220 }, { "epoch": 0.028145247305665665, "grad_norm": 9.019736795243459, "learning_rate": 9.364406779661017e-06, "loss": 1.3009, "step": 221 }, { "epoch": 0.028272601365872296, "grad_norm": 6.11214577364218, "learning_rate": 9.40677966101695e-06, "loss": 1.1636, "step": 222 }, { "epoch": 0.028399955426078927, "grad_norm": 6.208039688573386, "learning_rate": 9.449152542372882e-06, "loss": 1.2205, "step": 223 }, { "epoch": 0.02852730948628556, "grad_norm": 6.883913954414822, "learning_rate": 9.491525423728815e-06, "loss": 1.2852, "step": 224 }, { "epoch": 0.028654663546492192, "grad_norm": 7.770250504823821, "learning_rate": 9.533898305084747e-06, "loss": 1.2271, "step": 225 }, { "epoch": 0.028782017606698823, "grad_norm": 6.430792119192894, "learning_rate": 9.576271186440679e-06, "loss": 1.2906, "step": 226 }, { "epoch": 0.028909371666905454, "grad_norm": 7.010031460461512, "learning_rate": 9.61864406779661e-06, "loss": 1.2835, "step": 227 }, { "epoch": 0.02903672572711209, "grad_norm": 7.645972753876864, "learning_rate": 9.661016949152544e-06, "loss": 1.2884, "step": 228 }, { "epoch": 0.02916407978731872, "grad_norm": 6.254690622657227, "learning_rate": 9.703389830508475e-06, "loss": 1.2195, "step": 229 }, { "epoch": 0.02929143384752535, "grad_norm": 5.283445725714637, "learning_rate": 9.745762711864407e-06, "loss": 1.2283, "step": 230 }, { "epoch": 0.02941878790773198, "grad_norm": 5.300161776682745, "learning_rate": 9.788135593220339e-06, "loss": 1.2168, "step": 231 }, { "epoch": 0.029546141967938616, "grad_norm": 4.599103442185643, "learning_rate": 9.830508474576272e-06, "loss": 1.1986, "step": 232 }, { "epoch": 0.029673496028145247, "grad_norm": 4.231142055650315, "learning_rate": 9.872881355932204e-06, "loss": 1.1642, "step": 233 }, { "epoch": 0.029800850088351878, "grad_norm": 5.711332252730025, "learning_rate": 9.915254237288137e-06, "loss": 1.1048, "step": 234 }, { "epoch": 0.029928204148558513, "grad_norm": 4.869391712213411, "learning_rate": 9.957627118644069e-06, "loss": 1.0453, "step": 235 }, { "epoch": 0.030055558208765144, "grad_norm": 5.5126782723662675, "learning_rate": 1e-05, "loss": 1.2603, "step": 236 }, { "epoch": 0.030182912268971775, "grad_norm": 5.210563223006909, "learning_rate": 1.0042372881355933e-05, "loss": 1.2728, "step": 237 }, { "epoch": 0.030310266329178406, "grad_norm": 4.462683768455516, "learning_rate": 1.0084745762711864e-05, "loss": 1.173, "step": 238 }, { "epoch": 0.03043762038938504, "grad_norm": 6.872645859376026, "learning_rate": 1.0127118644067798e-05, "loss": 1.2844, "step": 239 }, { "epoch": 0.03056497444959167, "grad_norm": 6.347134293444063, "learning_rate": 1.016949152542373e-05, "loss": 1.1263, "step": 240 }, { "epoch": 0.030692328509798302, "grad_norm": 4.948369483974166, "learning_rate": 1.0211864406779661e-05, "loss": 1.1743, "step": 241 }, { "epoch": 0.030819682570004937, "grad_norm": 5.091288527678461, "learning_rate": 1.0254237288135593e-05, "loss": 1.1484, "step": 242 }, { "epoch": 0.030947036630211568, "grad_norm": 6.830186475942115, "learning_rate": 1.0296610169491526e-05, "loss": 1.2874, "step": 243 }, { "epoch": 0.0310743906904182, "grad_norm": 4.416479046753212, "learning_rate": 1.0338983050847458e-05, "loss": 1.1908, "step": 244 }, { "epoch": 0.03120174475062483, "grad_norm": 6.347450116845334, "learning_rate": 1.038135593220339e-05, "loss": 1.3404, "step": 245 }, { "epoch": 0.031329098810831464, "grad_norm": 5.174297399324874, "learning_rate": 1.0423728813559325e-05, "loss": 1.3433, "step": 246 }, { "epoch": 0.03145645287103809, "grad_norm": 6.830511797695472, "learning_rate": 1.0466101694915256e-05, "loss": 1.2365, "step": 247 }, { "epoch": 0.031583806931244726, "grad_norm": 4.788501700444122, "learning_rate": 1.0508474576271188e-05, "loss": 1.186, "step": 248 }, { "epoch": 0.03171116099145136, "grad_norm": 4.786448725323033, "learning_rate": 1.055084745762712e-05, "loss": 1.2575, "step": 249 }, { "epoch": 0.03183851505165799, "grad_norm": 5.365572446233603, "learning_rate": 1.0593220338983052e-05, "loss": 1.1592, "step": 250 }, { "epoch": 0.03196586911186462, "grad_norm": 5.338549308461248, "learning_rate": 1.0635593220338985e-05, "loss": 1.1911, "step": 251 }, { "epoch": 0.03209322317207126, "grad_norm": 6.401895373542065, "learning_rate": 1.0677966101694917e-05, "loss": 1.2061, "step": 252 }, { "epoch": 0.032220577232277885, "grad_norm": 7.118990857049467, "learning_rate": 1.0720338983050848e-05, "loss": 1.1411, "step": 253 }, { "epoch": 0.03234793129248452, "grad_norm": 5.334239505238133, "learning_rate": 1.076271186440678e-05, "loss": 1.1481, "step": 254 }, { "epoch": 0.032475285352691154, "grad_norm": 6.033395336247018, "learning_rate": 1.0805084745762714e-05, "loss": 1.304, "step": 255 }, { "epoch": 0.03260263941289778, "grad_norm": 5.935809085397795, "learning_rate": 1.0847457627118645e-05, "loss": 1.2317, "step": 256 }, { "epoch": 0.032729993473104416, "grad_norm": 6.56257120729597, "learning_rate": 1.0889830508474577e-05, "loss": 1.0834, "step": 257 }, { "epoch": 0.03285734753331104, "grad_norm": 5.52426654901273, "learning_rate": 1.0932203389830509e-05, "loss": 1.2457, "step": 258 }, { "epoch": 0.03298470159351768, "grad_norm": 5.3082240230657876, "learning_rate": 1.0974576271186442e-05, "loss": 1.109, "step": 259 }, { "epoch": 0.03311205565372431, "grad_norm": 7.27848219758666, "learning_rate": 1.1016949152542374e-05, "loss": 1.1494, "step": 260 }, { "epoch": 0.03323940971393094, "grad_norm": 6.186081232346015, "learning_rate": 1.1059322033898305e-05, "loss": 1.1951, "step": 261 }, { "epoch": 0.033366763774137574, "grad_norm": 6.09704876157918, "learning_rate": 1.1101694915254237e-05, "loss": 1.2232, "step": 262 }, { "epoch": 0.03349411783434421, "grad_norm": 5.6139682675869, "learning_rate": 1.1144067796610169e-05, "loss": 1.1192, "step": 263 }, { "epoch": 0.033621471894550836, "grad_norm": 5.013522744422119, "learning_rate": 1.1186440677966102e-05, "loss": 1.1021, "step": 264 }, { "epoch": 0.03374882595475747, "grad_norm": 5.189883867849798, "learning_rate": 1.1228813559322034e-05, "loss": 1.0947, "step": 265 }, { "epoch": 0.033876180014964105, "grad_norm": 9.693514624172337, "learning_rate": 1.1271186440677967e-05, "loss": 1.2365, "step": 266 }, { "epoch": 0.03400353407517073, "grad_norm": 4.465752509974851, "learning_rate": 1.13135593220339e-05, "loss": 1.2413, "step": 267 }, { "epoch": 0.03413088813537737, "grad_norm": 5.235194811033555, "learning_rate": 1.1355932203389833e-05, "loss": 1.161, "step": 268 }, { "epoch": 0.034258242195583995, "grad_norm": 6.032168619829575, "learning_rate": 1.1398305084745764e-05, "loss": 1.1826, "step": 269 }, { "epoch": 0.03438559625579063, "grad_norm": 6.792682345529315, "learning_rate": 1.1440677966101696e-05, "loss": 1.1542, "step": 270 }, { "epoch": 0.034512950315997264, "grad_norm": 6.934400261192159, "learning_rate": 1.148305084745763e-05, "loss": 1.2888, "step": 271 }, { "epoch": 0.03464030437620389, "grad_norm": 5.887182388323694, "learning_rate": 1.1525423728813561e-05, "loss": 1.2312, "step": 272 }, { "epoch": 0.034767658436410526, "grad_norm": 6.01624388050281, "learning_rate": 1.1567796610169493e-05, "loss": 1.2219, "step": 273 }, { "epoch": 0.03489501249661716, "grad_norm": 7.370642457700537, "learning_rate": 1.1610169491525424e-05, "loss": 1.2457, "step": 274 }, { "epoch": 0.03502236655682379, "grad_norm": 6.054900449884194, "learning_rate": 1.1652542372881356e-05, "loss": 1.1599, "step": 275 }, { "epoch": 0.03514972061703042, "grad_norm": 7.087314997551028, "learning_rate": 1.169491525423729e-05, "loss": 1.2703, "step": 276 }, { "epoch": 0.03527707467723706, "grad_norm": 6.126673567100073, "learning_rate": 1.1737288135593221e-05, "loss": 1.1172, "step": 277 }, { "epoch": 0.035404428737443684, "grad_norm": 6.316984189121061, "learning_rate": 1.1779661016949153e-05, "loss": 1.2458, "step": 278 }, { "epoch": 0.03553178279765032, "grad_norm": 7.160140799731012, "learning_rate": 1.1822033898305085e-05, "loss": 1.1956, "step": 279 }, { "epoch": 0.035659136857856946, "grad_norm": 6.894648823802736, "learning_rate": 1.1864406779661018e-05, "loss": 1.1181, "step": 280 }, { "epoch": 0.03578649091806358, "grad_norm": 6.664617446028264, "learning_rate": 1.190677966101695e-05, "loss": 1.1402, "step": 281 }, { "epoch": 0.035913844978270215, "grad_norm": 6.941991778114701, "learning_rate": 1.1949152542372882e-05, "loss": 1.1371, "step": 282 }, { "epoch": 0.03604119903847684, "grad_norm": 4.18847827017481, "learning_rate": 1.1991525423728813e-05, "loss": 1.1745, "step": 283 }, { "epoch": 0.03616855309868348, "grad_norm": 5.438227637703072, "learning_rate": 1.2033898305084745e-05, "loss": 1.2066, "step": 284 }, { "epoch": 0.03629590715889011, "grad_norm": 5.776477720439709, "learning_rate": 1.2076271186440678e-05, "loss": 1.0364, "step": 285 }, { "epoch": 0.03642326121909674, "grad_norm": 4.473684720727284, "learning_rate": 1.2118644067796612e-05, "loss": 1.1551, "step": 286 }, { "epoch": 0.036550615279303374, "grad_norm": 5.67647515168569, "learning_rate": 1.2161016949152544e-05, "loss": 1.2159, "step": 287 }, { "epoch": 0.03667796933951001, "grad_norm": 5.7037797943882, "learning_rate": 1.2203389830508477e-05, "loss": 1.2122, "step": 288 }, { "epoch": 0.036805323399716636, "grad_norm": 3.9661231771896124, "learning_rate": 1.2245762711864409e-05, "loss": 1.0775, "step": 289 }, { "epoch": 0.03693267745992327, "grad_norm": 6.025963240112854, "learning_rate": 1.228813559322034e-05, "loss": 1.2435, "step": 290 }, { "epoch": 0.0370600315201299, "grad_norm": 6.998972953655834, "learning_rate": 1.2330508474576272e-05, "loss": 1.2353, "step": 291 }, { "epoch": 0.03718738558033653, "grad_norm": 5.012810128147621, "learning_rate": 1.2372881355932205e-05, "loss": 1.1848, "step": 292 }, { "epoch": 0.03731473964054317, "grad_norm": 5.8944234970891864, "learning_rate": 1.2415254237288137e-05, "loss": 1.1496, "step": 293 }, { "epoch": 0.037442093700749794, "grad_norm": 6.154631581068166, "learning_rate": 1.2457627118644069e-05, "loss": 1.2476, "step": 294 }, { "epoch": 0.03756944776095643, "grad_norm": 8.244605872992144, "learning_rate": 1.25e-05, "loss": 1.2131, "step": 295 }, { "epoch": 0.03769680182116306, "grad_norm": 5.418418288060157, "learning_rate": 1.2542372881355932e-05, "loss": 1.1227, "step": 296 }, { "epoch": 0.03782415588136969, "grad_norm": 7.34352247819092, "learning_rate": 1.2584745762711866e-05, "loss": 1.0944, "step": 297 }, { "epoch": 0.037951509941576325, "grad_norm": 5.990661665555994, "learning_rate": 1.2627118644067797e-05, "loss": 1.1886, "step": 298 }, { "epoch": 0.03807886400178296, "grad_norm": 5.1388355607481895, "learning_rate": 1.2669491525423729e-05, "loss": 1.1942, "step": 299 }, { "epoch": 0.03820621806198959, "grad_norm": 5.576874237453807, "learning_rate": 1.2711864406779661e-05, "loss": 1.2666, "step": 300 }, { "epoch": 0.03833357212219622, "grad_norm": 6.33907171920546, "learning_rate": 1.2754237288135594e-05, "loss": 1.1965, "step": 301 }, { "epoch": 0.038460926182402856, "grad_norm": 6.752063859556231, "learning_rate": 1.2796610169491526e-05, "loss": 1.1443, "step": 302 }, { "epoch": 0.038588280242609484, "grad_norm": 5.783912499236838, "learning_rate": 1.2838983050847458e-05, "loss": 1.2141, "step": 303 }, { "epoch": 0.03871563430281612, "grad_norm": 9.393457985080918, "learning_rate": 1.288135593220339e-05, "loss": 1.227, "step": 304 }, { "epoch": 0.038842988363022746, "grad_norm": 3.728983414621008, "learning_rate": 1.2923728813559324e-05, "loss": 1.1179, "step": 305 }, { "epoch": 0.03897034242322938, "grad_norm": 6.896312623976467, "learning_rate": 1.2966101694915256e-05, "loss": 1.1986, "step": 306 }, { "epoch": 0.039097696483436015, "grad_norm": 5.28379985943732, "learning_rate": 1.3008474576271188e-05, "loss": 1.0703, "step": 307 }, { "epoch": 0.03922505054364264, "grad_norm": 7.615355599028728, "learning_rate": 1.305084745762712e-05, "loss": 1.2172, "step": 308 }, { "epoch": 0.03935240460384928, "grad_norm": 5.221403988975462, "learning_rate": 1.3093220338983053e-05, "loss": 1.0893, "step": 309 }, { "epoch": 0.03947975866405591, "grad_norm": 4.33216729737341, "learning_rate": 1.3135593220338985e-05, "loss": 1.0594, "step": 310 }, { "epoch": 0.03960711272426254, "grad_norm": 5.546533020653022, "learning_rate": 1.3177966101694916e-05, "loss": 1.0849, "step": 311 }, { "epoch": 0.03973446678446917, "grad_norm": 6.177659040601369, "learning_rate": 1.3220338983050848e-05, "loss": 1.1686, "step": 312 }, { "epoch": 0.03986182084467581, "grad_norm": 6.419279516083969, "learning_rate": 1.3262711864406782e-05, "loss": 1.1274, "step": 313 }, { "epoch": 0.039989174904882435, "grad_norm": 5.585204517851212, "learning_rate": 1.3305084745762713e-05, "loss": 1.1679, "step": 314 }, { "epoch": 0.04011652896508907, "grad_norm": 4.364550727716378, "learning_rate": 1.3347457627118645e-05, "loss": 1.1287, "step": 315 }, { "epoch": 0.0402438830252957, "grad_norm": 7.1135740518403, "learning_rate": 1.3389830508474577e-05, "loss": 1.1679, "step": 316 }, { "epoch": 0.04037123708550233, "grad_norm": 8.475439603763972, "learning_rate": 1.343220338983051e-05, "loss": 1.1221, "step": 317 }, { "epoch": 0.040498591145708966, "grad_norm": 6.599847803383956, "learning_rate": 1.3474576271186442e-05, "loss": 1.2375, "step": 318 }, { "epoch": 0.040625945205915594, "grad_norm": 5.760783160268134, "learning_rate": 1.3516949152542374e-05, "loss": 1.1498, "step": 319 }, { "epoch": 0.04075329926612223, "grad_norm": 5.623168816789539, "learning_rate": 1.3559322033898305e-05, "loss": 1.0415, "step": 320 }, { "epoch": 0.04088065332632886, "grad_norm": 5.145493633882771, "learning_rate": 1.3601694915254237e-05, "loss": 1.1116, "step": 321 }, { "epoch": 0.04100800738653549, "grad_norm": 6.643310865385602, "learning_rate": 1.364406779661017e-05, "loss": 1.3068, "step": 322 }, { "epoch": 0.041135361446742125, "grad_norm": 5.551218314953344, "learning_rate": 1.3686440677966102e-05, "loss": 1.2183, "step": 323 }, { "epoch": 0.04126271550694876, "grad_norm": 6.227722749099259, "learning_rate": 1.3728813559322034e-05, "loss": 1.281, "step": 324 }, { "epoch": 0.04139006956715539, "grad_norm": 7.083253598231905, "learning_rate": 1.3771186440677969e-05, "loss": 1.0694, "step": 325 }, { "epoch": 0.04151742362736202, "grad_norm": 5.469741818543489, "learning_rate": 1.38135593220339e-05, "loss": 1.2052, "step": 326 }, { "epoch": 0.04164477768756865, "grad_norm": 5.973156326146339, "learning_rate": 1.3855932203389832e-05, "loss": 1.2121, "step": 327 }, { "epoch": 0.04177213174777528, "grad_norm": 5.59625972090016, "learning_rate": 1.3898305084745764e-05, "loss": 1.1219, "step": 328 }, { "epoch": 0.04189948580798192, "grad_norm": 7.162602299730285, "learning_rate": 1.3940677966101697e-05, "loss": 1.1341, "step": 329 }, { "epoch": 0.042026839868188545, "grad_norm": 5.91873415790762, "learning_rate": 1.3983050847457629e-05, "loss": 1.1412, "step": 330 }, { "epoch": 0.04215419392839518, "grad_norm": 4.9801198560393685, "learning_rate": 1.4025423728813561e-05, "loss": 1.0811, "step": 331 }, { "epoch": 0.042281547988601814, "grad_norm": 7.04494488794065, "learning_rate": 1.4067796610169493e-05, "loss": 1.1825, "step": 332 }, { "epoch": 0.04240890204880844, "grad_norm": 4.976812191722208, "learning_rate": 1.4110169491525424e-05, "loss": 1.0741, "step": 333 }, { "epoch": 0.042536256109015076, "grad_norm": 5.810975233685298, "learning_rate": 1.4152542372881358e-05, "loss": 1.222, "step": 334 }, { "epoch": 0.04266361016922171, "grad_norm": 6.413104005442228, "learning_rate": 1.419491525423729e-05, "loss": 1.1863, "step": 335 }, { "epoch": 0.04279096422942834, "grad_norm": 5.016552230357741, "learning_rate": 1.4237288135593221e-05, "loss": 1.1668, "step": 336 }, { "epoch": 0.04291831828963497, "grad_norm": 3.332210844506039, "learning_rate": 1.4279661016949153e-05, "loss": 1.1309, "step": 337 }, { "epoch": 0.0430456723498416, "grad_norm": 5.386934736650668, "learning_rate": 1.4322033898305086e-05, "loss": 1.1814, "step": 338 }, { "epoch": 0.043173026410048235, "grad_norm": 6.338924282523835, "learning_rate": 1.4364406779661018e-05, "loss": 1.2045, "step": 339 }, { "epoch": 0.04330038047025487, "grad_norm": 7.035777079943078, "learning_rate": 1.440677966101695e-05, "loss": 1.1769, "step": 340 }, { "epoch": 0.0434277345304615, "grad_norm": 7.6157803688276555, "learning_rate": 1.4449152542372881e-05, "loss": 1.1015, "step": 341 }, { "epoch": 0.04355508859066813, "grad_norm": 5.101772800568265, "learning_rate": 1.4491525423728813e-05, "loss": 1.153, "step": 342 }, { "epoch": 0.043682442650874766, "grad_norm": 6.4044347656892215, "learning_rate": 1.4533898305084746e-05, "loss": 1.0963, "step": 343 }, { "epoch": 0.04380979671108139, "grad_norm": 7.158946233171024, "learning_rate": 1.4576271186440678e-05, "loss": 1.1785, "step": 344 }, { "epoch": 0.04393715077128803, "grad_norm": 5.614692399116299, "learning_rate": 1.4618644067796612e-05, "loss": 1.1204, "step": 345 }, { "epoch": 0.04406450483149466, "grad_norm": 7.491907204007989, "learning_rate": 1.4661016949152545e-05, "loss": 1.0839, "step": 346 }, { "epoch": 0.04419185889170129, "grad_norm": 6.626849797952939, "learning_rate": 1.4703389830508477e-05, "loss": 1.2041, "step": 347 }, { "epoch": 0.044319212951907924, "grad_norm": 6.108738078212482, "learning_rate": 1.4745762711864408e-05, "loss": 1.1178, "step": 348 }, { "epoch": 0.04444656701211455, "grad_norm": 5.6485756082916945, "learning_rate": 1.478813559322034e-05, "loss": 1.0452, "step": 349 }, { "epoch": 0.044573921072321186, "grad_norm": 6.370962322851192, "learning_rate": 1.4830508474576274e-05, "loss": 1.17, "step": 350 }, { "epoch": 0.04470127513252782, "grad_norm": 5.2103942862478325, "learning_rate": 1.4872881355932205e-05, "loss": 1.1943, "step": 351 }, { "epoch": 0.04482862919273445, "grad_norm": 5.756424673207367, "learning_rate": 1.4915254237288137e-05, "loss": 1.0939, "step": 352 }, { "epoch": 0.04495598325294108, "grad_norm": 5.8667551178477675, "learning_rate": 1.4957627118644069e-05, "loss": 1.1213, "step": 353 }, { "epoch": 0.04508333731314772, "grad_norm": 5.172829346899861, "learning_rate": 1.5000000000000002e-05, "loss": 1.079, "step": 354 }, { "epoch": 0.045210691373354345, "grad_norm": 4.573603040529301, "learning_rate": 1.5042372881355934e-05, "loss": 1.0208, "step": 355 }, { "epoch": 0.04533804543356098, "grad_norm": 4.8197230839623035, "learning_rate": 1.5084745762711865e-05, "loss": 1.1836, "step": 356 }, { "epoch": 0.045465399493767614, "grad_norm": 7.777097030397322, "learning_rate": 1.5127118644067797e-05, "loss": 1.1588, "step": 357 }, { "epoch": 0.04559275355397424, "grad_norm": 5.278754553281395, "learning_rate": 1.5169491525423729e-05, "loss": 1.0581, "step": 358 }, { "epoch": 0.045720107614180876, "grad_norm": 4.858938117795735, "learning_rate": 1.5211864406779662e-05, "loss": 1.0556, "step": 359 }, { "epoch": 0.0458474616743875, "grad_norm": 5.661545103286113, "learning_rate": 1.5254237288135594e-05, "loss": 1.1903, "step": 360 }, { "epoch": 0.04597481573459414, "grad_norm": 7.069885982905191, "learning_rate": 1.5296610169491526e-05, "loss": 1.1951, "step": 361 }, { "epoch": 0.04610216979480077, "grad_norm": 4.610622017492172, "learning_rate": 1.533898305084746e-05, "loss": 1.0968, "step": 362 }, { "epoch": 0.0462295238550074, "grad_norm": 5.0935084747906005, "learning_rate": 1.538135593220339e-05, "loss": 1.1064, "step": 363 }, { "epoch": 0.046356877915214034, "grad_norm": 4.494028155508151, "learning_rate": 1.5423728813559326e-05, "loss": 1.0162, "step": 364 }, { "epoch": 0.04648423197542067, "grad_norm": 5.347282268412717, "learning_rate": 1.5466101694915256e-05, "loss": 1.0498, "step": 365 }, { "epoch": 0.046611586035627296, "grad_norm": 6.447015340870528, "learning_rate": 1.550847457627119e-05, "loss": 1.0885, "step": 366 }, { "epoch": 0.04673894009583393, "grad_norm": 5.665803544376279, "learning_rate": 1.555084745762712e-05, "loss": 1.0579, "step": 367 }, { "epoch": 0.046866294156040565, "grad_norm": 6.497470363477434, "learning_rate": 1.5593220338983053e-05, "loss": 1.147, "step": 368 }, { "epoch": 0.04699364821624719, "grad_norm": 6.44022082264147, "learning_rate": 1.5635593220338986e-05, "loss": 1.1009, "step": 369 }, { "epoch": 0.04712100227645383, "grad_norm": 5.137514048462012, "learning_rate": 1.5677966101694916e-05, "loss": 1.0439, "step": 370 }, { "epoch": 0.047248356336660455, "grad_norm": 5.702533680817622, "learning_rate": 1.572033898305085e-05, "loss": 1.0592, "step": 371 }, { "epoch": 0.04737571039686709, "grad_norm": 5.280215190201436, "learning_rate": 1.576271186440678e-05, "loss": 1.1372, "step": 372 }, { "epoch": 0.047503064457073724, "grad_norm": 5.3070710210883725, "learning_rate": 1.5805084745762713e-05, "loss": 1.1866, "step": 373 }, { "epoch": 0.04763041851728035, "grad_norm": 5.790683946551362, "learning_rate": 1.5847457627118646e-05, "loss": 1.122, "step": 374 }, { "epoch": 0.047757772577486986, "grad_norm": 4.4342977225389655, "learning_rate": 1.5889830508474576e-05, "loss": 1.0376, "step": 375 }, { "epoch": 0.04788512663769362, "grad_norm": 7.361533850601036, "learning_rate": 1.593220338983051e-05, "loss": 1.1915, "step": 376 }, { "epoch": 0.04801248069790025, "grad_norm": 6.792856435960524, "learning_rate": 1.5974576271186443e-05, "loss": 1.13, "step": 377 }, { "epoch": 0.04813983475810688, "grad_norm": 5.3009787897284015, "learning_rate": 1.6016949152542373e-05, "loss": 1.1738, "step": 378 }, { "epoch": 0.04826718881831352, "grad_norm": 7.999690049842849, "learning_rate": 1.6059322033898307e-05, "loss": 1.1297, "step": 379 }, { "epoch": 0.048394542878520144, "grad_norm": 5.551331527032272, "learning_rate": 1.6101694915254237e-05, "loss": 1.2117, "step": 380 }, { "epoch": 0.04852189693872678, "grad_norm": 4.79399733321872, "learning_rate": 1.614406779661017e-05, "loss": 1.1036, "step": 381 }, { "epoch": 0.04864925099893341, "grad_norm": 5.685210049924944, "learning_rate": 1.6186440677966104e-05, "loss": 1.2222, "step": 382 }, { "epoch": 0.04877660505914004, "grad_norm": 8.528954948278576, "learning_rate": 1.6228813559322034e-05, "loss": 1.1457, "step": 383 }, { "epoch": 0.048903959119346675, "grad_norm": 5.27660350563353, "learning_rate": 1.6271186440677967e-05, "loss": 0.9738, "step": 384 }, { "epoch": 0.0490313131795533, "grad_norm": 8.813854789754753, "learning_rate": 1.63135593220339e-05, "loss": 1.1817, "step": 385 }, { "epoch": 0.04915866723975994, "grad_norm": 4.481348378738923, "learning_rate": 1.6355932203389834e-05, "loss": 1.0746, "step": 386 }, { "epoch": 0.04928602129996657, "grad_norm": 5.467818000940252, "learning_rate": 1.6398305084745764e-05, "loss": 1.087, "step": 387 }, { "epoch": 0.0494133753601732, "grad_norm": 6.6069025415732945, "learning_rate": 1.6440677966101697e-05, "loss": 1.1628, "step": 388 }, { "epoch": 0.049540729420379834, "grad_norm": 5.1796771893751075, "learning_rate": 1.648305084745763e-05, "loss": 1.1182, "step": 389 }, { "epoch": 0.04966808348058647, "grad_norm": 4.147969192537622, "learning_rate": 1.652542372881356e-05, "loss": 1.1918, "step": 390 }, { "epoch": 0.049795437540793096, "grad_norm": 7.153712193297355, "learning_rate": 1.6567796610169494e-05, "loss": 1.1153, "step": 391 }, { "epoch": 0.04992279160099973, "grad_norm": 6.5428372771306, "learning_rate": 1.6610169491525424e-05, "loss": 1.2467, "step": 392 }, { "epoch": 0.050050145661206365, "grad_norm": 6.720477498664704, "learning_rate": 1.6652542372881357e-05, "loss": 1.1426, "step": 393 }, { "epoch": 0.05017749972141299, "grad_norm": 5.490675564806118, "learning_rate": 1.669491525423729e-05, "loss": 1.1069, "step": 394 }, { "epoch": 0.05030485378161963, "grad_norm": 4.996180028819127, "learning_rate": 1.673728813559322e-05, "loss": 1.1453, "step": 395 }, { "epoch": 0.050432207841826254, "grad_norm": 6.543552037899987, "learning_rate": 1.6779661016949154e-05, "loss": 1.1048, "step": 396 }, { "epoch": 0.05055956190203289, "grad_norm": 6.797227170743476, "learning_rate": 1.6822033898305084e-05, "loss": 1.2534, "step": 397 }, { "epoch": 0.05068691596223952, "grad_norm": 6.369776960986534, "learning_rate": 1.6864406779661018e-05, "loss": 1.175, "step": 398 }, { "epoch": 0.05081427002244615, "grad_norm": 4.493815941253846, "learning_rate": 1.690677966101695e-05, "loss": 1.1982, "step": 399 }, { "epoch": 0.050941624082652785, "grad_norm": 4.799971882419884, "learning_rate": 1.694915254237288e-05, "loss": 1.1094, "step": 400 }, { "epoch": 0.05106897814285942, "grad_norm": 5.5972935199009175, "learning_rate": 1.6991525423728815e-05, "loss": 1.1708, "step": 401 }, { "epoch": 0.05119633220306605, "grad_norm": 5.317337885054691, "learning_rate": 1.7033898305084745e-05, "loss": 1.0627, "step": 402 }, { "epoch": 0.05132368626327268, "grad_norm": 5.190269190692385, "learning_rate": 1.7076271186440678e-05, "loss": 1.086, "step": 403 }, { "epoch": 0.051451040323479316, "grad_norm": 7.00350346766054, "learning_rate": 1.711864406779661e-05, "loss": 1.1158, "step": 404 }, { "epoch": 0.051578394383685944, "grad_norm": 5.506921747563612, "learning_rate": 1.7161016949152545e-05, "loss": 1.1417, "step": 405 }, { "epoch": 0.05170574844389258, "grad_norm": 5.838009549432716, "learning_rate": 1.7203389830508478e-05, "loss": 1.2006, "step": 406 }, { "epoch": 0.051833102504099206, "grad_norm": 6.087071911683631, "learning_rate": 1.7245762711864408e-05, "loss": 1.2103, "step": 407 }, { "epoch": 0.05196045656430584, "grad_norm": 6.958343810418135, "learning_rate": 1.728813559322034e-05, "loss": 1.1675, "step": 408 }, { "epoch": 0.052087810624512475, "grad_norm": 6.131831193883537, "learning_rate": 1.733050847457627e-05, "loss": 1.1751, "step": 409 }, { "epoch": 0.0522151646847191, "grad_norm": 6.429994225862797, "learning_rate": 1.7372881355932205e-05, "loss": 1.0883, "step": 410 }, { "epoch": 0.05234251874492574, "grad_norm": 5.681284096324905, "learning_rate": 1.741525423728814e-05, "loss": 1.072, "step": 411 }, { "epoch": 0.05246987280513237, "grad_norm": 5.152898635657918, "learning_rate": 1.745762711864407e-05, "loss": 1.1444, "step": 412 }, { "epoch": 0.052597226865339, "grad_norm": 8.122164657024648, "learning_rate": 1.7500000000000002e-05, "loss": 1.117, "step": 413 }, { "epoch": 0.05272458092554563, "grad_norm": 7.388012285107703, "learning_rate": 1.7542372881355932e-05, "loss": 1.1604, "step": 414 }, { "epoch": 0.05285193498575227, "grad_norm": 4.142600455576895, "learning_rate": 1.7584745762711865e-05, "loss": 1.0567, "step": 415 }, { "epoch": 0.052979289045958895, "grad_norm": 5.341916132123986, "learning_rate": 1.76271186440678e-05, "loss": 1.1664, "step": 416 }, { "epoch": 0.05310664310616553, "grad_norm": 4.098930284467975, "learning_rate": 1.766949152542373e-05, "loss": 1.0655, "step": 417 }, { "epoch": 0.05323399716637216, "grad_norm": 5.283165939679673, "learning_rate": 1.7711864406779662e-05, "loss": 1.0878, "step": 418 }, { "epoch": 0.05336135122657879, "grad_norm": 5.72671571034816, "learning_rate": 1.7754237288135596e-05, "loss": 1.1076, "step": 419 }, { "epoch": 0.053488705286785426, "grad_norm": 4.181638800501704, "learning_rate": 1.7796610169491526e-05, "loss": 1.0477, "step": 420 }, { "epoch": 0.053616059346992054, "grad_norm": 6.037518500068706, "learning_rate": 1.783898305084746e-05, "loss": 1.0744, "step": 421 }, { "epoch": 0.05374341340719869, "grad_norm": 5.908386812743721, "learning_rate": 1.788135593220339e-05, "loss": 1.2553, "step": 422 }, { "epoch": 0.05387076746740532, "grad_norm": 5.477098120428443, "learning_rate": 1.7923728813559326e-05, "loss": 1.1578, "step": 423 }, { "epoch": 0.05399812152761195, "grad_norm": 5.269725074907827, "learning_rate": 1.7966101694915256e-05, "loss": 1.0921, "step": 424 }, { "epoch": 0.054125475587818585, "grad_norm": 5.99473658806239, "learning_rate": 1.800847457627119e-05, "loss": 1.1249, "step": 425 }, { "epoch": 0.05425282964802522, "grad_norm": 5.212141226810042, "learning_rate": 1.805084745762712e-05, "loss": 1.0942, "step": 426 }, { "epoch": 0.05438018370823185, "grad_norm": 4.914321965340521, "learning_rate": 1.8093220338983053e-05, "loss": 1.0524, "step": 427 }, { "epoch": 0.05450753776843848, "grad_norm": 4.934122789640228, "learning_rate": 1.8135593220338986e-05, "loss": 1.0692, "step": 428 }, { "epoch": 0.05463489182864511, "grad_norm": 5.974731962823762, "learning_rate": 1.8177966101694916e-05, "loss": 1.0516, "step": 429 }, { "epoch": 0.05476224588885174, "grad_norm": 5.18332734102514, "learning_rate": 1.822033898305085e-05, "loss": 1.1025, "step": 430 }, { "epoch": 0.05488959994905838, "grad_norm": 4.61394209816082, "learning_rate": 1.8262711864406783e-05, "loss": 1.091, "step": 431 }, { "epoch": 0.055016954009265005, "grad_norm": 5.894652591010023, "learning_rate": 1.8305084745762713e-05, "loss": 1.1953, "step": 432 }, { "epoch": 0.05514430806947164, "grad_norm": 5.86790849951594, "learning_rate": 1.8347457627118646e-05, "loss": 1.0643, "step": 433 }, { "epoch": 0.055271662129678274, "grad_norm": 5.628648380088236, "learning_rate": 1.8389830508474576e-05, "loss": 1.119, "step": 434 }, { "epoch": 0.0553990161898849, "grad_norm": 4.991344207554816, "learning_rate": 1.843220338983051e-05, "loss": 1.0047, "step": 435 }, { "epoch": 0.055526370250091536, "grad_norm": 6.211890650133721, "learning_rate": 1.8474576271186443e-05, "loss": 1.0869, "step": 436 }, { "epoch": 0.05565372431029817, "grad_norm": 6.316762561456026, "learning_rate": 1.8516949152542373e-05, "loss": 1.0198, "step": 437 }, { "epoch": 0.0557810783705048, "grad_norm": 4.2256016229559235, "learning_rate": 1.8559322033898307e-05, "loss": 1.0982, "step": 438 }, { "epoch": 0.05590843243071143, "grad_norm": 7.262435245717664, "learning_rate": 1.8601694915254237e-05, "loss": 1.0517, "step": 439 }, { "epoch": 0.05603578649091806, "grad_norm": 5.585281184126876, "learning_rate": 1.864406779661017e-05, "loss": 1.2128, "step": 440 }, { "epoch": 0.056163140551124695, "grad_norm": 4.767034803637561, "learning_rate": 1.8686440677966103e-05, "loss": 1.1603, "step": 441 }, { "epoch": 0.05629049461133133, "grad_norm": 6.468924552294705, "learning_rate": 1.8728813559322033e-05, "loss": 1.2014, "step": 442 }, { "epoch": 0.05641784867153796, "grad_norm": 9.31974357153982, "learning_rate": 1.877118644067797e-05, "loss": 1.0906, "step": 443 }, { "epoch": 0.05654520273174459, "grad_norm": 6.012844007647524, "learning_rate": 1.88135593220339e-05, "loss": 1.1387, "step": 444 }, { "epoch": 0.056672556791951226, "grad_norm": 4.083626146262829, "learning_rate": 1.8855932203389834e-05, "loss": 1.1401, "step": 445 }, { "epoch": 0.05679991085215785, "grad_norm": 3.9173329104567634, "learning_rate": 1.8898305084745764e-05, "loss": 1.0576, "step": 446 }, { "epoch": 0.05692726491236449, "grad_norm": 5.714794354280716, "learning_rate": 1.8940677966101697e-05, "loss": 0.975, "step": 447 }, { "epoch": 0.05705461897257112, "grad_norm": 7.531880079322617, "learning_rate": 1.898305084745763e-05, "loss": 1.1463, "step": 448 }, { "epoch": 0.05718197303277775, "grad_norm": 4.742736684054098, "learning_rate": 1.902542372881356e-05, "loss": 0.9861, "step": 449 }, { "epoch": 0.057309327092984384, "grad_norm": 4.8112771598542885, "learning_rate": 1.9067796610169494e-05, "loss": 1.1611, "step": 450 }, { "epoch": 0.05743668115319101, "grad_norm": 6.124219883654539, "learning_rate": 1.9110169491525424e-05, "loss": 1.1772, "step": 451 }, { "epoch": 0.057564035213397646, "grad_norm": 6.490103891067207, "learning_rate": 1.9152542372881357e-05, "loss": 1.1467, "step": 452 }, { "epoch": 0.05769138927360428, "grad_norm": 4.884413799929866, "learning_rate": 1.919491525423729e-05, "loss": 1.0313, "step": 453 }, { "epoch": 0.05781874333381091, "grad_norm": 17.31884120667972, "learning_rate": 1.923728813559322e-05, "loss": 1.0885, "step": 454 }, { "epoch": 0.05794609739401754, "grad_norm": 6.044814909695786, "learning_rate": 1.9279661016949154e-05, "loss": 1.0058, "step": 455 }, { "epoch": 0.05807345145422418, "grad_norm": 6.115362876923583, "learning_rate": 1.9322033898305087e-05, "loss": 1.1625, "step": 456 }, { "epoch": 0.058200805514430805, "grad_norm": 7.167009685498636, "learning_rate": 1.9364406779661017e-05, "loss": 1.2125, "step": 457 }, { "epoch": 0.05832815957463744, "grad_norm": 8.321018070875693, "learning_rate": 1.940677966101695e-05, "loss": 1.2028, "step": 458 }, { "epoch": 0.058455513634844074, "grad_norm": 5.883357171000457, "learning_rate": 1.944915254237288e-05, "loss": 1.0797, "step": 459 }, { "epoch": 0.0585828676950507, "grad_norm": 6.548593372992014, "learning_rate": 1.9491525423728814e-05, "loss": 1.2657, "step": 460 }, { "epoch": 0.058710221755257336, "grad_norm": 6.503332017630883, "learning_rate": 1.9533898305084748e-05, "loss": 1.1852, "step": 461 }, { "epoch": 0.05883757581546396, "grad_norm": 5.406696984660371, "learning_rate": 1.9576271186440678e-05, "loss": 1.0883, "step": 462 }, { "epoch": 0.0589649298756706, "grad_norm": 6.705726669953014, "learning_rate": 1.961864406779661e-05, "loss": 1.0506, "step": 463 }, { "epoch": 0.05909228393587723, "grad_norm": 4.629241235900978, "learning_rate": 1.9661016949152545e-05, "loss": 1.0315, "step": 464 }, { "epoch": 0.05921963799608386, "grad_norm": 5.078040676884035, "learning_rate": 1.9703389830508478e-05, "loss": 1.1109, "step": 465 }, { "epoch": 0.059346992056290494, "grad_norm": 7.199572698841122, "learning_rate": 1.9745762711864408e-05, "loss": 1.0686, "step": 466 }, { "epoch": 0.05947434611649713, "grad_norm": 6.04655483295243, "learning_rate": 1.978813559322034e-05, "loss": 1.2287, "step": 467 }, { "epoch": 0.059601700176703756, "grad_norm": 4.20413933182076, "learning_rate": 1.9830508474576275e-05, "loss": 1.1357, "step": 468 }, { "epoch": 0.05972905423691039, "grad_norm": 5.884048636001412, "learning_rate": 1.9872881355932205e-05, "loss": 1.0665, "step": 469 }, { "epoch": 0.059856408297117025, "grad_norm": 4.659764544387501, "learning_rate": 1.9915254237288138e-05, "loss": 1.0815, "step": 470 }, { "epoch": 0.05998376235732365, "grad_norm": 5.519786788305731, "learning_rate": 1.9957627118644068e-05, "loss": 1.0704, "step": 471 }, { "epoch": 0.06011111641753029, "grad_norm": 8.08682390597387, "learning_rate": 2e-05, "loss": 1.0464, "step": 472 }, { "epoch": 0.06023847047773692, "grad_norm": 5.6181193336206166, "learning_rate": 1.9999999787305685e-05, "loss": 1.0311, "step": 473 }, { "epoch": 0.06036582453794355, "grad_norm": 4.519360327229284, "learning_rate": 1.9999999149222745e-05, "loss": 1.0972, "step": 474 }, { "epoch": 0.060493178598150184, "grad_norm": 6.846898856876654, "learning_rate": 1.9999998085751203e-05, "loss": 1.1147, "step": 475 }, { "epoch": 0.06062053265835681, "grad_norm": 5.986633352740627, "learning_rate": 1.9999996596891108e-05, "loss": 1.1468, "step": 476 }, { "epoch": 0.060747886718563446, "grad_norm": 4.984324679225318, "learning_rate": 1.999999468264253e-05, "loss": 1.1052, "step": 477 }, { "epoch": 0.06087524077877008, "grad_norm": 7.075737893813151, "learning_rate": 1.999999234300554e-05, "loss": 1.0461, "step": 478 }, { "epoch": 0.06100259483897671, "grad_norm": 5.73308430212171, "learning_rate": 1.9999989577980245e-05, "loss": 1.2228, "step": 479 }, { "epoch": 0.06112994889918334, "grad_norm": 6.053575376693396, "learning_rate": 1.9999986387566755e-05, "loss": 1.2758, "step": 480 }, { "epoch": 0.06125730295938998, "grad_norm": 6.078084832772162, "learning_rate": 1.9999982771765212e-05, "loss": 1.1119, "step": 481 }, { "epoch": 0.061384657019596604, "grad_norm": 6.797968774241789, "learning_rate": 1.9999978730575768e-05, "loss": 1.0882, "step": 482 }, { "epoch": 0.06151201107980324, "grad_norm": 6.273452093694109, "learning_rate": 1.999997426399859e-05, "loss": 1.1447, "step": 483 }, { "epoch": 0.06163936514000987, "grad_norm": 7.10284083310267, "learning_rate": 1.999996937203388e-05, "loss": 0.9969, "step": 484 }, { "epoch": 0.0617667192002165, "grad_norm": 5.530254727061982, "learning_rate": 1.9999964054681835e-05, "loss": 1.1789, "step": 485 }, { "epoch": 0.061894073260423135, "grad_norm": 4.669864489858105, "learning_rate": 1.9999958311942685e-05, "loss": 1.1693, "step": 486 }, { "epoch": 0.06202142732062976, "grad_norm": 5.757957922497859, "learning_rate": 1.9999952143816677e-05, "loss": 1.1089, "step": 487 }, { "epoch": 0.0621487813808364, "grad_norm": 7.478914502779971, "learning_rate": 1.999994555030407e-05, "loss": 1.1695, "step": 488 }, { "epoch": 0.06227613544104303, "grad_norm": 7.5854044450973825, "learning_rate": 1.9999938531405142e-05, "loss": 1.1188, "step": 489 }, { "epoch": 0.06240348950124966, "grad_norm": 7.252243747122389, "learning_rate": 1.9999931087120198e-05, "loss": 1.1036, "step": 490 }, { "epoch": 0.06253084356145629, "grad_norm": 5.098055652157508, "learning_rate": 1.999992321744955e-05, "loss": 1.1222, "step": 491 }, { "epoch": 0.06265819762166293, "grad_norm": 5.617857743175649, "learning_rate": 1.9999914922393536e-05, "loss": 1.0715, "step": 492 }, { "epoch": 0.06278555168186956, "grad_norm": 4.719171568373844, "learning_rate": 1.9999906201952507e-05, "loss": 1.0764, "step": 493 }, { "epoch": 0.06291290574207618, "grad_norm": 6.104534602441233, "learning_rate": 1.9999897056126832e-05, "loss": 1.2059, "step": 494 }, { "epoch": 0.06304025980228282, "grad_norm": 6.722891165636998, "learning_rate": 1.9999887484916902e-05, "loss": 1.1001, "step": 495 }, { "epoch": 0.06316761386248945, "grad_norm": 5.75820105806048, "learning_rate": 1.999987748832313e-05, "loss": 1.0382, "step": 496 }, { "epoch": 0.06329496792269608, "grad_norm": 5.701985786262386, "learning_rate": 1.9999867066345927e-05, "loss": 1.0952, "step": 497 }, { "epoch": 0.06342232198290272, "grad_norm": 4.50835236964317, "learning_rate": 1.9999856218985753e-05, "loss": 1.0516, "step": 498 }, { "epoch": 0.06354967604310935, "grad_norm": 6.243899305236716, "learning_rate": 1.9999844946243055e-05, "loss": 0.9709, "step": 499 }, { "epoch": 0.06367703010331598, "grad_norm": 5.025162325082904, "learning_rate": 1.999983324811832e-05, "loss": 1.0088, "step": 500 }, { "epoch": 0.06380438416352262, "grad_norm": 4.394774081832498, "learning_rate": 1.9999821124612047e-05, "loss": 1.0979, "step": 501 }, { "epoch": 0.06393173822372925, "grad_norm": 5.236599122276942, "learning_rate": 1.9999808575724747e-05, "loss": 1.1553, "step": 502 }, { "epoch": 0.06405909228393587, "grad_norm": 5.490313235937539, "learning_rate": 1.9999795601456955e-05, "loss": 1.0991, "step": 503 }, { "epoch": 0.06418644634414251, "grad_norm": 7.443317350150323, "learning_rate": 1.9999782201809227e-05, "loss": 1.029, "step": 504 }, { "epoch": 0.06431380040434914, "grad_norm": 6.019639298288074, "learning_rate": 1.9999768376782126e-05, "loss": 1.0034, "step": 505 }, { "epoch": 0.06444115446455577, "grad_norm": 5.986425302535064, "learning_rate": 1.9999754126376247e-05, "loss": 1.2177, "step": 506 }, { "epoch": 0.06456850852476241, "grad_norm": 6.693771701353711, "learning_rate": 1.999973945059219e-05, "loss": 1.1051, "step": 507 }, { "epoch": 0.06469586258496904, "grad_norm": 5.945756812918701, "learning_rate": 1.9999724349430588e-05, "loss": 1.1274, "step": 508 }, { "epoch": 0.06482321664517567, "grad_norm": 5.769953067264716, "learning_rate": 1.9999708822892074e-05, "loss": 1.1091, "step": 509 }, { "epoch": 0.06495057070538231, "grad_norm": 5.927645566674118, "learning_rate": 1.999969287097731e-05, "loss": 1.229, "step": 510 }, { "epoch": 0.06507792476558893, "grad_norm": 5.822752084151968, "learning_rate": 1.9999676493686982e-05, "loss": 1.068, "step": 511 }, { "epoch": 0.06520527882579556, "grad_norm": 8.400925321930224, "learning_rate": 1.9999659691021783e-05, "loss": 1.1987, "step": 512 }, { "epoch": 0.0653326328860022, "grad_norm": 3.869302190712713, "learning_rate": 1.999964246298242e-05, "loss": 1.0111, "step": 513 }, { "epoch": 0.06545998694620883, "grad_norm": 8.700792421229472, "learning_rate": 1.9999624809569635e-05, "loss": 1.0577, "step": 514 }, { "epoch": 0.06558734100641546, "grad_norm": 5.010001669634086, "learning_rate": 1.9999606730784178e-05, "loss": 1.1465, "step": 515 }, { "epoch": 0.06571469506662209, "grad_norm": 8.145420177808317, "learning_rate": 1.9999588226626814e-05, "loss": 1.1327, "step": 516 }, { "epoch": 0.06584204912682873, "grad_norm": 6.193503193792947, "learning_rate": 1.9999569297098334e-05, "loss": 1.144, "step": 517 }, { "epoch": 0.06596940318703536, "grad_norm": 7.465247190743016, "learning_rate": 1.999954994219954e-05, "loss": 1.1173, "step": 518 }, { "epoch": 0.06609675724724198, "grad_norm": 6.750553814809614, "learning_rate": 1.999953016193126e-05, "loss": 1.1759, "step": 519 }, { "epoch": 0.06622411130744862, "grad_norm": 4.564339045028179, "learning_rate": 1.9999509956294324e-05, "loss": 1.0987, "step": 520 }, { "epoch": 0.06635146536765525, "grad_norm": 5.428131802762721, "learning_rate": 1.9999489325289607e-05, "loss": 1.124, "step": 521 }, { "epoch": 0.06647881942786188, "grad_norm": 7.130879168351056, "learning_rate": 1.9999468268917978e-05, "loss": 1.008, "step": 522 }, { "epoch": 0.06660617348806852, "grad_norm": 6.866731977923751, "learning_rate": 1.9999446787180338e-05, "loss": 1.1246, "step": 523 }, { "epoch": 0.06673352754827515, "grad_norm": 5.66268234801421, "learning_rate": 1.9999424880077592e-05, "loss": 1.1667, "step": 524 }, { "epoch": 0.06686088160848178, "grad_norm": 6.135632375693495, "learning_rate": 1.999940254761068e-05, "loss": 1.1069, "step": 525 }, { "epoch": 0.06698823566868842, "grad_norm": 6.9654472975314015, "learning_rate": 1.9999379789780543e-05, "loss": 1.2112, "step": 526 }, { "epoch": 0.06711558972889504, "grad_norm": 4.0917355771810735, "learning_rate": 1.999935660658816e-05, "loss": 1.0041, "step": 527 }, { "epoch": 0.06724294378910167, "grad_norm": 5.299854262165641, "learning_rate": 1.9999332998034515e-05, "loss": 1.1039, "step": 528 }, { "epoch": 0.06737029784930831, "grad_norm": 5.03714288624225, "learning_rate": 1.9999308964120604e-05, "loss": 1.0356, "step": 529 }, { "epoch": 0.06749765190951494, "grad_norm": 5.018361328580709, "learning_rate": 1.999928450484746e-05, "loss": 1.0958, "step": 530 }, { "epoch": 0.06762500596972157, "grad_norm": 4.259050045925872, "learning_rate": 1.9999259620216113e-05, "loss": 1.0695, "step": 531 }, { "epoch": 0.06775236002992821, "grad_norm": 4.36079760893954, "learning_rate": 1.999923431022763e-05, "loss": 1.1662, "step": 532 }, { "epoch": 0.06787971409013484, "grad_norm": 5.849711758285983, "learning_rate": 1.9999208574883086e-05, "loss": 1.1017, "step": 533 }, { "epoch": 0.06800706815034147, "grad_norm": 4.771071826465309, "learning_rate": 1.999918241418357e-05, "loss": 1.0827, "step": 534 }, { "epoch": 0.0681344222105481, "grad_norm": 4.811774062854366, "learning_rate": 1.99991558281302e-05, "loss": 1.0242, "step": 535 }, { "epoch": 0.06826177627075473, "grad_norm": 6.02176026421573, "learning_rate": 1.999912881672411e-05, "loss": 1.0808, "step": 536 }, { "epoch": 0.06838913033096136, "grad_norm": 5.9725923802994405, "learning_rate": 1.999910137996644e-05, "loss": 1.0689, "step": 537 }, { "epoch": 0.06851648439116799, "grad_norm": 4.913192016936453, "learning_rate": 1.9999073517858365e-05, "loss": 1.015, "step": 538 }, { "epoch": 0.06864383845137463, "grad_norm": 5.977263286120293, "learning_rate": 1.999904523040107e-05, "loss": 1.1264, "step": 539 }, { "epoch": 0.06877119251158126, "grad_norm": 6.274222774929446, "learning_rate": 1.9999016517595752e-05, "loss": 1.0752, "step": 540 }, { "epoch": 0.06889854657178789, "grad_norm": 5.376708180660854, "learning_rate": 1.999898737944364e-05, "loss": 1.0579, "step": 541 }, { "epoch": 0.06902590063199453, "grad_norm": 5.978380991215184, "learning_rate": 1.9998957815945962e-05, "loss": 0.9844, "step": 542 }, { "epoch": 0.06915325469220115, "grad_norm": 5.110581375165403, "learning_rate": 1.999892782710399e-05, "loss": 1.1507, "step": 543 }, { "epoch": 0.06928060875240778, "grad_norm": 5.5490173500932345, "learning_rate": 1.999889741291899e-05, "loss": 1.1175, "step": 544 }, { "epoch": 0.06940796281261442, "grad_norm": 6.375743448626605, "learning_rate": 1.9998866573392265e-05, "loss": 1.1339, "step": 545 }, { "epoch": 0.06953531687282105, "grad_norm": 5.936784560617764, "learning_rate": 1.9998835308525115e-05, "loss": 0.9726, "step": 546 }, { "epoch": 0.06966267093302768, "grad_norm": 6.062258371473456, "learning_rate": 1.9998803618318873e-05, "loss": 1.0518, "step": 547 }, { "epoch": 0.06979002499323432, "grad_norm": 5.482671141521196, "learning_rate": 1.9998771502774895e-05, "loss": 1.1076, "step": 548 }, { "epoch": 0.06991737905344095, "grad_norm": 6.91431518782602, "learning_rate": 1.9998738961894538e-05, "loss": 1.2298, "step": 549 }, { "epoch": 0.07004473311364758, "grad_norm": 8.527637205784295, "learning_rate": 1.9998705995679195e-05, "loss": 1.1865, "step": 550 }, { "epoch": 0.07017208717385422, "grad_norm": 5.06632813192715, "learning_rate": 1.999867260413026e-05, "loss": 1.1329, "step": 551 }, { "epoch": 0.07029944123406084, "grad_norm": 6.611392459528193, "learning_rate": 1.9998638787249158e-05, "loss": 1.023, "step": 552 }, { "epoch": 0.07042679529426747, "grad_norm": 4.434471493854659, "learning_rate": 1.9998604545037325e-05, "loss": 1.0204, "step": 553 }, { "epoch": 0.07055414935447411, "grad_norm": 5.885039821026563, "learning_rate": 1.999856987749622e-05, "loss": 1.1628, "step": 554 }, { "epoch": 0.07068150341468074, "grad_norm": 7.757189707425133, "learning_rate": 1.999853478462732e-05, "loss": 1.0407, "step": 555 }, { "epoch": 0.07080885747488737, "grad_norm": 5.097586842499232, "learning_rate": 1.999849926643211e-05, "loss": 1.0845, "step": 556 }, { "epoch": 0.07093621153509401, "grad_norm": 6.9215922760756765, "learning_rate": 1.999846332291211e-05, "loss": 1.0299, "step": 557 }, { "epoch": 0.07106356559530064, "grad_norm": 4.351081643511859, "learning_rate": 1.9998426954068837e-05, "loss": 1.035, "step": 558 }, { "epoch": 0.07119091965550726, "grad_norm": 5.114464514362844, "learning_rate": 1.9998390159903853e-05, "loss": 1.0353, "step": 559 }, { "epoch": 0.07131827371571389, "grad_norm": 7.082369241365838, "learning_rate": 1.9998352940418713e-05, "loss": 0.9955, "step": 560 }, { "epoch": 0.07144562777592053, "grad_norm": 4.362420880931001, "learning_rate": 1.9998315295615002e-05, "loss": 1.1176, "step": 561 }, { "epoch": 0.07157298183612716, "grad_norm": 6.0660644749385355, "learning_rate": 1.999827722549432e-05, "loss": 1.1078, "step": 562 }, { "epoch": 0.07170033589633379, "grad_norm": 4.456082830731038, "learning_rate": 1.99982387300583e-05, "loss": 1.0612, "step": 563 }, { "epoch": 0.07182768995654043, "grad_norm": 5.829329542602628, "learning_rate": 1.9998199809308558e-05, "loss": 1.0402, "step": 564 }, { "epoch": 0.07195504401674706, "grad_norm": 5.921042673202974, "learning_rate": 1.9998160463246762e-05, "loss": 1.15, "step": 565 }, { "epoch": 0.07208239807695369, "grad_norm": 5.450573868242138, "learning_rate": 1.9998120691874586e-05, "loss": 1.1177, "step": 566 }, { "epoch": 0.07220975213716033, "grad_norm": 5.703991131790351, "learning_rate": 1.9998080495193716e-05, "loss": 1.1882, "step": 567 }, { "epoch": 0.07233710619736695, "grad_norm": 5.68231568533917, "learning_rate": 1.9998039873205866e-05, "loss": 1.0749, "step": 568 }, { "epoch": 0.07246446025757358, "grad_norm": 4.717088015083142, "learning_rate": 1.999799882591277e-05, "loss": 1.1654, "step": 569 }, { "epoch": 0.07259181431778022, "grad_norm": 5.238015073517867, "learning_rate": 1.999795735331616e-05, "loss": 1.0809, "step": 570 }, { "epoch": 0.07271916837798685, "grad_norm": 6.21417528126376, "learning_rate": 1.9997915455417813e-05, "loss": 1.1254, "step": 571 }, { "epoch": 0.07284652243819348, "grad_norm": 4.179097870209438, "learning_rate": 1.9997873132219502e-05, "loss": 1.0508, "step": 572 }, { "epoch": 0.07297387649840012, "grad_norm": 6.321086520222815, "learning_rate": 1.9997830383723034e-05, "loss": 1.1056, "step": 573 }, { "epoch": 0.07310123055860675, "grad_norm": 5.323689961360213, "learning_rate": 1.9997787209930222e-05, "loss": 1.1261, "step": 574 }, { "epoch": 0.07322858461881337, "grad_norm": 5.293979089457133, "learning_rate": 1.999774361084291e-05, "loss": 1.075, "step": 575 }, { "epoch": 0.07335593867902002, "grad_norm": 6.901915419823858, "learning_rate": 1.9997699586462947e-05, "loss": 1.0958, "step": 576 }, { "epoch": 0.07348329273922664, "grad_norm": 4.789539008525201, "learning_rate": 1.9997655136792206e-05, "loss": 1.0798, "step": 577 }, { "epoch": 0.07361064679943327, "grad_norm": 6.839564328066072, "learning_rate": 1.999761026183258e-05, "loss": 1.0093, "step": 578 }, { "epoch": 0.07373800085963991, "grad_norm": 6.244943022517674, "learning_rate": 1.9997564961585976e-05, "loss": 1.06, "step": 579 }, { "epoch": 0.07386535491984654, "grad_norm": 3.9667931485780747, "learning_rate": 1.999751923605432e-05, "loss": 1.0282, "step": 580 }, { "epoch": 0.07399270898005317, "grad_norm": 6.350330566452636, "learning_rate": 1.999747308523956e-05, "loss": 1.1528, "step": 581 }, { "epoch": 0.0741200630402598, "grad_norm": 5.164516042394334, "learning_rate": 1.999742650914366e-05, "loss": 1.031, "step": 582 }, { "epoch": 0.07424741710046644, "grad_norm": 4.923610417638436, "learning_rate": 1.9997379507768596e-05, "loss": 1.0159, "step": 583 }, { "epoch": 0.07437477116067306, "grad_norm": 5.076996381492745, "learning_rate": 1.9997332081116374e-05, "loss": 1.1246, "step": 584 }, { "epoch": 0.07450212522087969, "grad_norm": 3.4257266867804166, "learning_rate": 1.9997284229189008e-05, "loss": 1.1539, "step": 585 }, { "epoch": 0.07462947928108633, "grad_norm": 5.9256759966138635, "learning_rate": 1.9997235951988533e-05, "loss": 1.0759, "step": 586 }, { "epoch": 0.07475683334129296, "grad_norm": 5.6626644961145844, "learning_rate": 1.9997187249517004e-05, "loss": 1.1168, "step": 587 }, { "epoch": 0.07488418740149959, "grad_norm": 5.7046274374791075, "learning_rate": 1.999713812177649e-05, "loss": 0.9921, "step": 588 }, { "epoch": 0.07501154146170623, "grad_norm": 5.994923669426167, "learning_rate": 1.9997088568769084e-05, "loss": 1.2166, "step": 589 }, { "epoch": 0.07513889552191286, "grad_norm": 5.193736544338015, "learning_rate": 1.9997038590496892e-05, "loss": 0.9917, "step": 590 }, { "epoch": 0.07526624958211948, "grad_norm": 4.396294351179956, "learning_rate": 1.9996988186962044e-05, "loss": 1.0891, "step": 591 }, { "epoch": 0.07539360364232613, "grad_norm": 5.49995734458335, "learning_rate": 1.999693735816668e-05, "loss": 0.9993, "step": 592 }, { "epoch": 0.07552095770253275, "grad_norm": 5.253398884946806, "learning_rate": 1.9996886104112963e-05, "loss": 1.0713, "step": 593 }, { "epoch": 0.07564831176273938, "grad_norm": 4.642680011765876, "learning_rate": 1.9996834424803074e-05, "loss": 1.0839, "step": 594 }, { "epoch": 0.07577566582294602, "grad_norm": 6.37376182350069, "learning_rate": 1.999678232023921e-05, "loss": 1.1269, "step": 595 }, { "epoch": 0.07590301988315265, "grad_norm": 5.763360134317636, "learning_rate": 1.9996729790423588e-05, "loss": 1.1208, "step": 596 }, { "epoch": 0.07603037394335928, "grad_norm": 5.206828680769963, "learning_rate": 1.9996676835358443e-05, "loss": 1.0223, "step": 597 }, { "epoch": 0.07615772800356592, "grad_norm": 4.44025776020388, "learning_rate": 1.999662345504603e-05, "loss": 1.07, "step": 598 }, { "epoch": 0.07628508206377255, "grad_norm": 4.782135127297473, "learning_rate": 1.9996569649488614e-05, "loss": 1.0387, "step": 599 }, { "epoch": 0.07641243612397917, "grad_norm": 5.107139915137725, "learning_rate": 1.9996515418688493e-05, "loss": 1.1048, "step": 600 }, { "epoch": 0.07653979018418582, "grad_norm": 5.9873211732412495, "learning_rate": 1.9996460762647962e-05, "loss": 1.0631, "step": 601 }, { "epoch": 0.07666714424439244, "grad_norm": 5.043658764095534, "learning_rate": 1.9996405681369353e-05, "loss": 1.0388, "step": 602 }, { "epoch": 0.07679449830459907, "grad_norm": 4.477337036234279, "learning_rate": 1.999635017485501e-05, "loss": 1.0678, "step": 603 }, { "epoch": 0.07692185236480571, "grad_norm": 3.4802276151035993, "learning_rate": 1.999629424310729e-05, "loss": 1.0386, "step": 604 }, { "epoch": 0.07704920642501234, "grad_norm": 6.354251381564723, "learning_rate": 1.999623788612858e-05, "loss": 1.0509, "step": 605 }, { "epoch": 0.07717656048521897, "grad_norm": 7.521001783969517, "learning_rate": 1.9996181103921268e-05, "loss": 1.0353, "step": 606 }, { "epoch": 0.0773039145454256, "grad_norm": 5.629739248359252, "learning_rate": 1.9996123896487775e-05, "loss": 1.0046, "step": 607 }, { "epoch": 0.07743126860563224, "grad_norm": 5.746840230765029, "learning_rate": 1.9996066263830533e-05, "loss": 1.0223, "step": 608 }, { "epoch": 0.07755862266583886, "grad_norm": 6.298985584272212, "learning_rate": 1.999600820595199e-05, "loss": 1.0892, "step": 609 }, { "epoch": 0.07768597672604549, "grad_norm": 6.803573780037953, "learning_rate": 1.999594972285462e-05, "loss": 1.0924, "step": 610 }, { "epoch": 0.07781333078625213, "grad_norm": 6.37358376722561, "learning_rate": 1.9995890814540915e-05, "loss": 1.0744, "step": 611 }, { "epoch": 0.07794068484645876, "grad_norm": 9.007917100839986, "learning_rate": 1.9995831481013376e-05, "loss": 1.2104, "step": 612 }, { "epoch": 0.07806803890666539, "grad_norm": 8.287857126219594, "learning_rate": 1.999577172227452e-05, "loss": 1.0484, "step": 613 }, { "epoch": 0.07819539296687203, "grad_norm": 4.04299092309741, "learning_rate": 1.9995711538326903e-05, "loss": 0.9676, "step": 614 }, { "epoch": 0.07832274702707866, "grad_norm": 6.571260544897439, "learning_rate": 1.9995650929173075e-05, "loss": 1.0512, "step": 615 }, { "epoch": 0.07845010108728528, "grad_norm": 6.1484744156888915, "learning_rate": 1.999558989481562e-05, "loss": 1.1788, "step": 616 }, { "epoch": 0.07857745514749193, "grad_norm": 5.316568230621563, "learning_rate": 1.999552843525713e-05, "loss": 1.0587, "step": 617 }, { "epoch": 0.07870480920769855, "grad_norm": 4.570047484971308, "learning_rate": 1.999546655050022e-05, "loss": 1.0535, "step": 618 }, { "epoch": 0.07883216326790518, "grad_norm": 6.1608163386830785, "learning_rate": 1.9995404240547527e-05, "loss": 1.0144, "step": 619 }, { "epoch": 0.07895951732811182, "grad_norm": 7.7568651371924595, "learning_rate": 1.9995341505401697e-05, "loss": 1.0017, "step": 620 }, { "epoch": 0.07908687138831845, "grad_norm": 5.0343224284140184, "learning_rate": 1.99952783450654e-05, "loss": 0.9866, "step": 621 }, { "epoch": 0.07921422544852508, "grad_norm": 4.733892334358776, "learning_rate": 1.999521475954132e-05, "loss": 1.0346, "step": 622 }, { "epoch": 0.07934157950873172, "grad_norm": 8.245032462200156, "learning_rate": 1.9995150748832167e-05, "loss": 1.0559, "step": 623 }, { "epoch": 0.07946893356893835, "grad_norm": 5.288275385428034, "learning_rate": 1.9995086312940665e-05, "loss": 1.1287, "step": 624 }, { "epoch": 0.07959628762914497, "grad_norm": 4.692547128203939, "learning_rate": 1.9995021451869548e-05, "loss": 1.0788, "step": 625 }, { "epoch": 0.07972364168935162, "grad_norm": 5.274171136757848, "learning_rate": 1.999495616562158e-05, "loss": 1.0668, "step": 626 }, { "epoch": 0.07985099574955824, "grad_norm": 4.503948598565, "learning_rate": 1.9994890454199537e-05, "loss": 0.9856, "step": 627 }, { "epoch": 0.07997834980976487, "grad_norm": 4.458866099436066, "learning_rate": 1.999482431760621e-05, "loss": 1.1141, "step": 628 }, { "epoch": 0.0801057038699715, "grad_norm": 5.021075273227246, "learning_rate": 1.9994757755844422e-05, "loss": 1.0495, "step": 629 }, { "epoch": 0.08023305793017814, "grad_norm": 6.947131370381861, "learning_rate": 1.9994690768916997e-05, "loss": 0.98, "step": 630 }, { "epoch": 0.08036041199038477, "grad_norm": 7.41009293017886, "learning_rate": 1.999462335682679e-05, "loss": 1.1349, "step": 631 }, { "epoch": 0.0804877660505914, "grad_norm": 5.172513156024174, "learning_rate": 1.999455551957666e-05, "loss": 1.0966, "step": 632 }, { "epoch": 0.08061512011079804, "grad_norm": 5.951912183195801, "learning_rate": 1.99944872571695e-05, "loss": 1.0624, "step": 633 }, { "epoch": 0.08074247417100466, "grad_norm": 7.68449951170285, "learning_rate": 1.9994418569608212e-05, "loss": 0.9459, "step": 634 }, { "epoch": 0.08086982823121129, "grad_norm": 5.28572353524242, "learning_rate": 1.999434945689572e-05, "loss": 1.1071, "step": 635 }, { "epoch": 0.08099718229141793, "grad_norm": 5.390054600709794, "learning_rate": 1.9994279919034958e-05, "loss": 1.1041, "step": 636 }, { "epoch": 0.08112453635162456, "grad_norm": 6.091162812164593, "learning_rate": 1.999420995602889e-05, "loss": 1.0167, "step": 637 }, { "epoch": 0.08125189041183119, "grad_norm": 9.582331419829577, "learning_rate": 1.9994139567880492e-05, "loss": 1.1483, "step": 638 }, { "epoch": 0.08137924447203783, "grad_norm": 6.486061704103913, "learning_rate": 1.999406875459275e-05, "loss": 1.057, "step": 639 }, { "epoch": 0.08150659853224446, "grad_norm": 5.762382774236402, "learning_rate": 1.999399751616869e-05, "loss": 1.1079, "step": 640 }, { "epoch": 0.08163395259245108, "grad_norm": 5.748421825606348, "learning_rate": 1.999392585261133e-05, "loss": 1.0681, "step": 641 }, { "epoch": 0.08176130665265773, "grad_norm": 6.038734654142748, "learning_rate": 1.9993853763923724e-05, "loss": 0.9983, "step": 642 }, { "epoch": 0.08188866071286435, "grad_norm": 6.696020954623414, "learning_rate": 1.9993781250108934e-05, "loss": 0.9574, "step": 643 }, { "epoch": 0.08201601477307098, "grad_norm": 4.880728461504636, "learning_rate": 1.9993708311170055e-05, "loss": 1.0897, "step": 644 }, { "epoch": 0.08214336883327762, "grad_norm": 6.765789355953292, "learning_rate": 1.999363494711018e-05, "loss": 0.9652, "step": 645 }, { "epoch": 0.08227072289348425, "grad_norm": 4.847223623737666, "learning_rate": 1.999356115793243e-05, "loss": 1.079, "step": 646 }, { "epoch": 0.08239807695369088, "grad_norm": 5.234269143927291, "learning_rate": 1.999348694363995e-05, "loss": 1.0237, "step": 647 }, { "epoch": 0.08252543101389752, "grad_norm": 4.89183541331242, "learning_rate": 1.9993412304235898e-05, "loss": 1.0104, "step": 648 }, { "epoch": 0.08265278507410415, "grad_norm": 5.865712718277241, "learning_rate": 1.999333723972344e-05, "loss": 1.0922, "step": 649 }, { "epoch": 0.08278013913431077, "grad_norm": 5.789907869913807, "learning_rate": 1.999326175010578e-05, "loss": 1.0889, "step": 650 }, { "epoch": 0.0829074931945174, "grad_norm": 6.9597588942410065, "learning_rate": 1.9993185835386118e-05, "loss": 1.0146, "step": 651 }, { "epoch": 0.08303484725472404, "grad_norm": 5.810287218463056, "learning_rate": 1.999310949556769e-05, "loss": 1.0555, "step": 652 }, { "epoch": 0.08316220131493067, "grad_norm": 5.914241168247319, "learning_rate": 1.999303273065374e-05, "loss": 1.1138, "step": 653 }, { "epoch": 0.0832895553751373, "grad_norm": 7.40780816406947, "learning_rate": 1.9992955540647544e-05, "loss": 1.0209, "step": 654 }, { "epoch": 0.08341690943534394, "grad_norm": 5.139163287743998, "learning_rate": 1.999287792555237e-05, "loss": 1.1916, "step": 655 }, { "epoch": 0.08354426349555057, "grad_norm": 7.338151071702125, "learning_rate": 1.999279988537153e-05, "loss": 1.0876, "step": 656 }, { "epoch": 0.0836716175557572, "grad_norm": 8.346356045647864, "learning_rate": 1.9992721420108338e-05, "loss": 1.0237, "step": 657 }, { "epoch": 0.08379897161596384, "grad_norm": 5.194369994982586, "learning_rate": 1.9992642529766136e-05, "loss": 1.019, "step": 658 }, { "epoch": 0.08392632567617046, "grad_norm": 6.352857844734931, "learning_rate": 1.999256321434828e-05, "loss": 1.0938, "step": 659 }, { "epoch": 0.08405367973637709, "grad_norm": 4.991216000871772, "learning_rate": 1.9992483473858138e-05, "loss": 1.1457, "step": 660 }, { "epoch": 0.08418103379658373, "grad_norm": 5.119276246534278, "learning_rate": 1.9992403308299112e-05, "loss": 1.0455, "step": 661 }, { "epoch": 0.08430838785679036, "grad_norm": 6.314325956131299, "learning_rate": 1.9992322717674603e-05, "loss": 1.1113, "step": 662 }, { "epoch": 0.08443574191699699, "grad_norm": 7.77910023602844, "learning_rate": 1.9992241701988042e-05, "loss": 1.0262, "step": 663 }, { "epoch": 0.08456309597720363, "grad_norm": 7.1596010849287195, "learning_rate": 1.999216026124288e-05, "loss": 1.0519, "step": 664 }, { "epoch": 0.08469045003741026, "grad_norm": 4.897077210322237, "learning_rate": 1.9992078395442574e-05, "loss": 1.0238, "step": 665 }, { "epoch": 0.08481780409761688, "grad_norm": 6.509385595436648, "learning_rate": 1.9991996104590612e-05, "loss": 1.0995, "step": 666 }, { "epoch": 0.08494515815782352, "grad_norm": 6.02964231578448, "learning_rate": 1.999191338869049e-05, "loss": 1.0622, "step": 667 }, { "epoch": 0.08507251221803015, "grad_norm": 6.841055561733911, "learning_rate": 1.9991830247745732e-05, "loss": 1.089, "step": 668 }, { "epoch": 0.08519986627823678, "grad_norm": 6.576305581782257, "learning_rate": 1.999174668175987e-05, "loss": 1.0683, "step": 669 }, { "epoch": 0.08532722033844342, "grad_norm": 5.960560877989974, "learning_rate": 1.9991662690736462e-05, "loss": 1.0535, "step": 670 }, { "epoch": 0.08545457439865005, "grad_norm": 6.509239086210101, "learning_rate": 1.999157827467908e-05, "loss": 1.0301, "step": 671 }, { "epoch": 0.08558192845885668, "grad_norm": 7.124308770095535, "learning_rate": 1.9991493433591315e-05, "loss": 1.1226, "step": 672 }, { "epoch": 0.0857092825190633, "grad_norm": 5.026968704141392, "learning_rate": 1.9991408167476772e-05, "loss": 1.1574, "step": 673 }, { "epoch": 0.08583663657926995, "grad_norm": 4.301359298900442, "learning_rate": 1.9991322476339088e-05, "loss": 1.0282, "step": 674 }, { "epoch": 0.08596399063947657, "grad_norm": 11.74719529462412, "learning_rate": 1.9991236360181897e-05, "loss": 1.0841, "step": 675 }, { "epoch": 0.0860913446996832, "grad_norm": 5.71392752342204, "learning_rate": 1.999114981900887e-05, "loss": 1.128, "step": 676 }, { "epoch": 0.08621869875988984, "grad_norm": 5.9033570647377465, "learning_rate": 1.9991062852823683e-05, "loss": 1.0681, "step": 677 }, { "epoch": 0.08634605282009647, "grad_norm": 5.322196996403861, "learning_rate": 1.9990975461630043e-05, "loss": 1.0203, "step": 678 }, { "epoch": 0.0864734068803031, "grad_norm": 4.951072869448979, "learning_rate": 1.999088764543166e-05, "loss": 1.0799, "step": 679 }, { "epoch": 0.08660076094050974, "grad_norm": 5.500998839799335, "learning_rate": 1.999079940423227e-05, "loss": 1.0914, "step": 680 }, { "epoch": 0.08672811500071637, "grad_norm": 6.443149791036705, "learning_rate": 1.9990710738035632e-05, "loss": 1.212, "step": 681 }, { "epoch": 0.086855469060923, "grad_norm": 5.5992443869211295, "learning_rate": 1.999062164684551e-05, "loss": 1.1295, "step": 682 }, { "epoch": 0.08698282312112963, "grad_norm": 4.401937119017067, "learning_rate": 1.9990532130665703e-05, "loss": 1.0117, "step": 683 }, { "epoch": 0.08711017718133626, "grad_norm": 6.193840092408911, "learning_rate": 1.9990442189500016e-05, "loss": 1.0194, "step": 684 }, { "epoch": 0.08723753124154289, "grad_norm": 7.51254012466861, "learning_rate": 1.9990351823352268e-05, "loss": 0.9975, "step": 685 }, { "epoch": 0.08736488530174953, "grad_norm": 5.877886124116896, "learning_rate": 1.999026103222631e-05, "loss": 1.1254, "step": 686 }, { "epoch": 0.08749223936195616, "grad_norm": 5.472095272874463, "learning_rate": 1.9990169816126005e-05, "loss": 1.0221, "step": 687 }, { "epoch": 0.08761959342216279, "grad_norm": 4.893392833864217, "learning_rate": 1.999007817505523e-05, "loss": 1.0455, "step": 688 }, { "epoch": 0.08774694748236943, "grad_norm": 5.637380242608783, "learning_rate": 1.9989986109017882e-05, "loss": 1.02, "step": 689 }, { "epoch": 0.08787430154257606, "grad_norm": 4.8513515752131635, "learning_rate": 1.9989893618017882e-05, "loss": 1.0035, "step": 690 }, { "epoch": 0.08800165560278268, "grad_norm": 6.058231699570725, "learning_rate": 1.9989800702059158e-05, "loss": 1.0731, "step": 691 }, { "epoch": 0.08812900966298932, "grad_norm": 5.045541876584629, "learning_rate": 1.9989707361145672e-05, "loss": 0.9995, "step": 692 }, { "epoch": 0.08825636372319595, "grad_norm": 5.568628770940815, "learning_rate": 1.9989613595281384e-05, "loss": 1.0222, "step": 693 }, { "epoch": 0.08838371778340258, "grad_norm": 5.02853680543003, "learning_rate": 1.998951940447029e-05, "loss": 1.0428, "step": 694 }, { "epoch": 0.08851107184360922, "grad_norm": 4.792565214522304, "learning_rate": 1.9989424788716397e-05, "loss": 1.0065, "step": 695 }, { "epoch": 0.08863842590381585, "grad_norm": 5.291371775342626, "learning_rate": 1.9989329748023728e-05, "loss": 1.0209, "step": 696 }, { "epoch": 0.08876577996402248, "grad_norm": 6.034196045176283, "learning_rate": 1.998923428239632e-05, "loss": 1.1247, "step": 697 }, { "epoch": 0.0888931340242291, "grad_norm": 5.512739512804424, "learning_rate": 1.9989138391838243e-05, "loss": 1.0025, "step": 698 }, { "epoch": 0.08902048808443574, "grad_norm": 5.877097871265012, "learning_rate": 1.9989042076353572e-05, "loss": 1.0448, "step": 699 }, { "epoch": 0.08914784214464237, "grad_norm": 4.461166883673963, "learning_rate": 1.9988945335946407e-05, "loss": 1.0821, "step": 700 }, { "epoch": 0.089275196204849, "grad_norm": 6.024518144895393, "learning_rate": 1.9988848170620857e-05, "loss": 1.1252, "step": 701 }, { "epoch": 0.08940255026505564, "grad_norm": 5.810905319826364, "learning_rate": 1.998875058038106e-05, "loss": 1.0291, "step": 702 }, { "epoch": 0.08952990432526227, "grad_norm": 4.288768987566355, "learning_rate": 1.9988652565231167e-05, "loss": 0.951, "step": 703 }, { "epoch": 0.0896572583854689, "grad_norm": 4.429403951945096, "learning_rate": 1.9988554125175347e-05, "loss": 1.1162, "step": 704 }, { "epoch": 0.08978461244567554, "grad_norm": 5.102004214642084, "learning_rate": 1.998845526021779e-05, "loss": 1.02, "step": 705 }, { "epoch": 0.08991196650588217, "grad_norm": 4.930098381995267, "learning_rate": 1.9988355970362693e-05, "loss": 1.0211, "step": 706 }, { "epoch": 0.09003932056608879, "grad_norm": 5.773187737747096, "learning_rate": 1.9988256255614292e-05, "loss": 1.0348, "step": 707 }, { "epoch": 0.09016667462629543, "grad_norm": 5.927086944668368, "learning_rate": 1.9988156115976818e-05, "loss": 1.1057, "step": 708 }, { "epoch": 0.09029402868650206, "grad_norm": 5.025749279734705, "learning_rate": 1.998805555145454e-05, "loss": 0.9713, "step": 709 }, { "epoch": 0.09042138274670869, "grad_norm": 5.882972340229742, "learning_rate": 1.9987954562051724e-05, "loss": 1.0112, "step": 710 }, { "epoch": 0.09054873680691533, "grad_norm": 5.104455101530909, "learning_rate": 1.9987853147772677e-05, "loss": 1.1046, "step": 711 }, { "epoch": 0.09067609086712196, "grad_norm": 4.674587113206371, "learning_rate": 1.9987751308621714e-05, "loss": 1.0097, "step": 712 }, { "epoch": 0.09080344492732859, "grad_norm": 4.381587273734659, "learning_rate": 1.9987649044603158e-05, "loss": 1.0142, "step": 713 }, { "epoch": 0.09093079898753523, "grad_norm": 9.46538823947152, "learning_rate": 1.9987546355721363e-05, "loss": 1.095, "step": 714 }, { "epoch": 0.09105815304774185, "grad_norm": 4.511638425962468, "learning_rate": 1.9987443241980696e-05, "loss": 1.1166, "step": 715 }, { "epoch": 0.09118550710794848, "grad_norm": 5.012728511329524, "learning_rate": 1.9987339703385552e-05, "loss": 1.1406, "step": 716 }, { "epoch": 0.09131286116815512, "grad_norm": 6.335841769660585, "learning_rate": 1.9987235739940325e-05, "loss": 1.1545, "step": 717 }, { "epoch": 0.09144021522836175, "grad_norm": 4.924962410565526, "learning_rate": 1.9987131351649437e-05, "loss": 1.1317, "step": 718 }, { "epoch": 0.09156756928856838, "grad_norm": 6.1544385074745165, "learning_rate": 1.998702653851734e-05, "loss": 1.1361, "step": 719 }, { "epoch": 0.091694923348775, "grad_norm": 6.154575178172681, "learning_rate": 1.998692130054848e-05, "loss": 0.9903, "step": 720 }, { "epoch": 0.09182227740898165, "grad_norm": 6.154952943747507, "learning_rate": 1.998681563774734e-05, "loss": 1.0783, "step": 721 }, { "epoch": 0.09194963146918828, "grad_norm": 4.838842453109084, "learning_rate": 1.9986709550118417e-05, "loss": 1.0136, "step": 722 }, { "epoch": 0.0920769855293949, "grad_norm": 5.410325823718829, "learning_rate": 1.9986603037666218e-05, "loss": 1.0734, "step": 723 }, { "epoch": 0.09220433958960154, "grad_norm": 7.933672605467293, "learning_rate": 1.9986496100395276e-05, "loss": 1.0677, "step": 724 }, { "epoch": 0.09233169364980817, "grad_norm": 9.510319375372314, "learning_rate": 1.9986388738310142e-05, "loss": 1.0391, "step": 725 }, { "epoch": 0.0924590477100148, "grad_norm": 5.218726609229018, "learning_rate": 1.998628095141538e-05, "loss": 1.0036, "step": 726 }, { "epoch": 0.09258640177022144, "grad_norm": 6.188449979354496, "learning_rate": 1.998617273971558e-05, "loss": 1.0897, "step": 727 }, { "epoch": 0.09271375583042807, "grad_norm": 5.214680072630314, "learning_rate": 1.998606410321534e-05, "loss": 1.0975, "step": 728 }, { "epoch": 0.0928411098906347, "grad_norm": 5.1103404588248855, "learning_rate": 1.998595504191928e-05, "loss": 1.115, "step": 729 }, { "epoch": 0.09296846395084134, "grad_norm": 7.251754548435966, "learning_rate": 1.9985845555832047e-05, "loss": 0.9956, "step": 730 }, { "epoch": 0.09309581801104796, "grad_norm": 6.024552653946663, "learning_rate": 1.9985735644958292e-05, "loss": 1.0765, "step": 731 }, { "epoch": 0.09322317207125459, "grad_norm": 4.944129259919682, "learning_rate": 1.9985625309302692e-05, "loss": 1.1096, "step": 732 }, { "epoch": 0.09335052613146123, "grad_norm": 4.15991324847806, "learning_rate": 1.9985514548869942e-05, "loss": 0.9459, "step": 733 }, { "epoch": 0.09347788019166786, "grad_norm": 3.7197710446687253, "learning_rate": 1.998540336366475e-05, "loss": 0.9994, "step": 734 }, { "epoch": 0.09360523425187449, "grad_norm": 6.06962528536452, "learning_rate": 1.9985291753691853e-05, "loss": 0.9692, "step": 735 }, { "epoch": 0.09373258831208113, "grad_norm": 4.293669347546974, "learning_rate": 1.998517971895599e-05, "loss": 1.163, "step": 736 }, { "epoch": 0.09385994237228776, "grad_norm": 4.771429610435823, "learning_rate": 1.9985067259461936e-05, "loss": 1.0687, "step": 737 }, { "epoch": 0.09398729643249439, "grad_norm": 5.6708848998084065, "learning_rate": 1.9984954375214464e-05, "loss": 0.9521, "step": 738 }, { "epoch": 0.09411465049270103, "grad_norm": 7.222470864474539, "learning_rate": 1.9984841066218387e-05, "loss": 1.0992, "step": 739 }, { "epoch": 0.09424200455290765, "grad_norm": 5.365241129523414, "learning_rate": 1.9984727332478517e-05, "loss": 1.1041, "step": 740 }, { "epoch": 0.09436935861311428, "grad_norm": 6.039438648032581, "learning_rate": 1.9984613173999694e-05, "loss": 1.0716, "step": 741 }, { "epoch": 0.09449671267332091, "grad_norm": 4.452459345808136, "learning_rate": 1.9984498590786778e-05, "loss": 1.0688, "step": 742 }, { "epoch": 0.09462406673352755, "grad_norm": 4.496510318180523, "learning_rate": 1.9984383582844636e-05, "loss": 1.1061, "step": 743 }, { "epoch": 0.09475142079373418, "grad_norm": 5.133958012277289, "learning_rate": 1.998426815017817e-05, "loss": 1.0172, "step": 744 }, { "epoch": 0.0948787748539408, "grad_norm": 6.401996670637185, "learning_rate": 1.998415229279228e-05, "loss": 1.0715, "step": 745 }, { "epoch": 0.09500612891414745, "grad_norm": 5.80380879011085, "learning_rate": 1.99840360106919e-05, "loss": 0.9658, "step": 746 }, { "epoch": 0.09513348297435407, "grad_norm": 5.835257004125801, "learning_rate": 1.998391930388198e-05, "loss": 1.1158, "step": 747 }, { "epoch": 0.0952608370345607, "grad_norm": 5.1207374896936955, "learning_rate": 1.9983802172367477e-05, "loss": 1.0881, "step": 748 }, { "epoch": 0.09538819109476734, "grad_norm": 11.487577279944423, "learning_rate": 1.9983684616153378e-05, "loss": 1.1162, "step": 749 }, { "epoch": 0.09551554515497397, "grad_norm": 5.542628422317417, "learning_rate": 1.998356663524468e-05, "loss": 1.1225, "step": 750 }, { "epoch": 0.0956428992151806, "grad_norm": 6.967952118088262, "learning_rate": 1.998344822964641e-05, "loss": 1.1577, "step": 751 }, { "epoch": 0.09577025327538724, "grad_norm": 5.47302950580714, "learning_rate": 1.99833293993636e-05, "loss": 1.0219, "step": 752 }, { "epoch": 0.09589760733559387, "grad_norm": 5.362081572740628, "learning_rate": 1.99832101444013e-05, "loss": 1.0726, "step": 753 }, { "epoch": 0.0960249613958005, "grad_norm": 6.812421045808244, "learning_rate": 1.9983090464764587e-05, "loss": 1.065, "step": 754 }, { "epoch": 0.09615231545600714, "grad_norm": 8.852579514610417, "learning_rate": 1.9982970360458557e-05, "loss": 1.1051, "step": 755 }, { "epoch": 0.09627966951621376, "grad_norm": 6.265131524266463, "learning_rate": 1.998284983148831e-05, "loss": 1.0406, "step": 756 }, { "epoch": 0.09640702357642039, "grad_norm": 10.166042829874742, "learning_rate": 1.9982728877858978e-05, "loss": 1.0842, "step": 757 }, { "epoch": 0.09653437763662703, "grad_norm": 5.6845487546028695, "learning_rate": 1.998260749957571e-05, "loss": 1.0445, "step": 758 }, { "epoch": 0.09666173169683366, "grad_norm": 4.165910779417907, "learning_rate": 1.9982485696643663e-05, "loss": 0.9373, "step": 759 }, { "epoch": 0.09678908575704029, "grad_norm": 4.396198685395114, "learning_rate": 1.998236346906802e-05, "loss": 1.0627, "step": 760 }, { "epoch": 0.09691643981724693, "grad_norm": 6.08568614592465, "learning_rate": 1.9982240816853983e-05, "loss": 1.0471, "step": 761 }, { "epoch": 0.09704379387745356, "grad_norm": 5.1671257225291, "learning_rate": 1.9982117740006763e-05, "loss": 1.1163, "step": 762 }, { "epoch": 0.09717114793766018, "grad_norm": 5.916493408818419, "learning_rate": 1.9981994238531603e-05, "loss": 1.1089, "step": 763 }, { "epoch": 0.09729850199786683, "grad_norm": 4.560569041032856, "learning_rate": 1.9981870312433755e-05, "loss": 0.9946, "step": 764 }, { "epoch": 0.09742585605807345, "grad_norm": 4.936586319610627, "learning_rate": 1.9981745961718486e-05, "loss": 1.1241, "step": 765 }, { "epoch": 0.09755321011828008, "grad_norm": 5.56261932587953, "learning_rate": 1.998162118639109e-05, "loss": 1.1137, "step": 766 }, { "epoch": 0.09768056417848671, "grad_norm": 4.700296051765587, "learning_rate": 1.9981495986456876e-05, "loss": 0.9961, "step": 767 }, { "epoch": 0.09780791823869335, "grad_norm": 5.784826543609449, "learning_rate": 1.9981370361921165e-05, "loss": 1.0256, "step": 768 }, { "epoch": 0.09793527229889998, "grad_norm": 5.929088537148662, "learning_rate": 1.9981244312789306e-05, "loss": 0.9581, "step": 769 }, { "epoch": 0.0980626263591066, "grad_norm": 5.179139535494771, "learning_rate": 1.9981117839066657e-05, "loss": 0.9951, "step": 770 }, { "epoch": 0.09818998041931325, "grad_norm": 6.371153924167855, "learning_rate": 1.99809909407586e-05, "loss": 1.0445, "step": 771 }, { "epoch": 0.09831733447951987, "grad_norm": 7.152752283867728, "learning_rate": 1.998086361787053e-05, "loss": 1.1133, "step": 772 }, { "epoch": 0.0984446885397265, "grad_norm": 5.0226457841069605, "learning_rate": 1.998073587040787e-05, "loss": 1.086, "step": 773 }, { "epoch": 0.09857204259993314, "grad_norm": 5.248569293580547, "learning_rate": 1.998060769837605e-05, "loss": 1.0538, "step": 774 }, { "epoch": 0.09869939666013977, "grad_norm": 6.354143319842696, "learning_rate": 1.9980479101780525e-05, "loss": 1.0211, "step": 775 }, { "epoch": 0.0988267507203464, "grad_norm": 4.431387976904693, "learning_rate": 1.9980350080626755e-05, "loss": 0.9861, "step": 776 }, { "epoch": 0.09895410478055304, "grad_norm": 6.274642885540535, "learning_rate": 1.998022063492024e-05, "loss": 1.0126, "step": 777 }, { "epoch": 0.09908145884075967, "grad_norm": 4.3373684558105, "learning_rate": 1.9980090764666486e-05, "loss": 1.0245, "step": 778 }, { "epoch": 0.0992088129009663, "grad_norm": 5.5406400642685085, "learning_rate": 1.997996046987101e-05, "loss": 1.0031, "step": 779 }, { "epoch": 0.09933616696117294, "grad_norm": 6.194165887352312, "learning_rate": 1.997982975053936e-05, "loss": 1.0883, "step": 780 }, { "epoch": 0.09946352102137956, "grad_norm": 5.211934830651193, "learning_rate": 1.9979698606677095e-05, "loss": 1.1272, "step": 781 }, { "epoch": 0.09959087508158619, "grad_norm": 7.170192413089251, "learning_rate": 1.9979567038289796e-05, "loss": 1.0528, "step": 782 }, { "epoch": 0.09971822914179283, "grad_norm": 6.254459468621482, "learning_rate": 1.997943504538306e-05, "loss": 0.9662, "step": 783 }, { "epoch": 0.09984558320199946, "grad_norm": 6.957908510804562, "learning_rate": 1.9979302627962494e-05, "loss": 1.0736, "step": 784 }, { "epoch": 0.09997293726220609, "grad_norm": 6.320699069316108, "learning_rate": 1.997916978603374e-05, "loss": 1.0725, "step": 785 }, { "epoch": 0.10010029132241273, "grad_norm": 5.789714163937426, "learning_rate": 1.9979036519602446e-05, "loss": 1.0178, "step": 786 }, { "epoch": 0.10022764538261936, "grad_norm": 6.830007751817898, "learning_rate": 1.997890282867428e-05, "loss": 1.0137, "step": 787 }, { "epoch": 0.10035499944282598, "grad_norm": 4.903635482520552, "learning_rate": 1.9978768713254927e-05, "loss": 1.0192, "step": 788 }, { "epoch": 0.10048235350303261, "grad_norm": 18.90650180617438, "learning_rate": 1.99786341733501e-05, "loss": 0.9982, "step": 789 }, { "epoch": 0.10060970756323925, "grad_norm": 4.963165407706841, "learning_rate": 1.997849920896551e-05, "loss": 0.9878, "step": 790 }, { "epoch": 0.10073706162344588, "grad_norm": 4.880410542025724, "learning_rate": 1.9978363820106912e-05, "loss": 1.0394, "step": 791 }, { "epoch": 0.10086441568365251, "grad_norm": 5.359966990525815, "learning_rate": 1.9978228006780056e-05, "loss": 1.0565, "step": 792 }, { "epoch": 0.10099176974385915, "grad_norm": 5.27348234934632, "learning_rate": 1.9978091768990723e-05, "loss": 1.1053, "step": 793 }, { "epoch": 0.10111912380406578, "grad_norm": 6.207040468644822, "learning_rate": 1.9977955106744706e-05, "loss": 1.047, "step": 794 }, { "epoch": 0.1012464778642724, "grad_norm": 5.75935564622613, "learning_rate": 1.9977818020047816e-05, "loss": 0.9662, "step": 795 }, { "epoch": 0.10137383192447905, "grad_norm": 3.5010736725984377, "learning_rate": 1.9977680508905894e-05, "loss": 1.0921, "step": 796 }, { "epoch": 0.10150118598468567, "grad_norm": 7.0157745277425425, "learning_rate": 1.9977542573324782e-05, "loss": 0.9324, "step": 797 }, { "epoch": 0.1016285400448923, "grad_norm": 4.361273239480551, "learning_rate": 1.9977404213310347e-05, "loss": 1.0017, "step": 798 }, { "epoch": 0.10175589410509894, "grad_norm": 5.415330262261502, "learning_rate": 1.997726542886848e-05, "loss": 0.997, "step": 799 }, { "epoch": 0.10188324816530557, "grad_norm": 6.053517402057396, "learning_rate": 1.9977126220005082e-05, "loss": 1.0096, "step": 800 }, { "epoch": 0.1020106022255122, "grad_norm": 5.6546141224148165, "learning_rate": 1.9976986586726072e-05, "loss": 1.1677, "step": 801 }, { "epoch": 0.10213795628571884, "grad_norm": 4.5596153860015125, "learning_rate": 1.9976846529037396e-05, "loss": 1.039, "step": 802 }, { "epoch": 0.10226531034592547, "grad_norm": 6.434913485996966, "learning_rate": 1.9976706046945003e-05, "loss": 1.032, "step": 803 }, { "epoch": 0.1023926644061321, "grad_norm": 6.885333254525804, "learning_rate": 1.9976565140454877e-05, "loss": 1.0179, "step": 804 }, { "epoch": 0.10252001846633874, "grad_norm": 5.634192690488991, "learning_rate": 1.9976423809573014e-05, "loss": 1.007, "step": 805 }, { "epoch": 0.10264737252654536, "grad_norm": 5.028451840415393, "learning_rate": 1.9976282054305415e-05, "loss": 1.0552, "step": 806 }, { "epoch": 0.10277472658675199, "grad_norm": 5.18185209266238, "learning_rate": 1.9976139874658117e-05, "loss": 0.985, "step": 807 }, { "epoch": 0.10290208064695863, "grad_norm": 5.351761194703276, "learning_rate": 1.9975997270637172e-05, "loss": 0.977, "step": 808 }, { "epoch": 0.10302943470716526, "grad_norm": 5.0199059016310725, "learning_rate": 1.997585424224864e-05, "loss": 1.0321, "step": 809 }, { "epoch": 0.10315678876737189, "grad_norm": 7.4969247235155425, "learning_rate": 1.9975710789498603e-05, "loss": 1.1377, "step": 810 }, { "epoch": 0.10328414282757852, "grad_norm": 5.024824126093265, "learning_rate": 1.9975566912393173e-05, "loss": 1.0533, "step": 811 }, { "epoch": 0.10341149688778516, "grad_norm": 9.205641086845837, "learning_rate": 1.9975422610938463e-05, "loss": 1.0876, "step": 812 }, { "epoch": 0.10353885094799178, "grad_norm": 4.248778448141489, "learning_rate": 1.9975277885140613e-05, "loss": 0.9902, "step": 813 }, { "epoch": 0.10366620500819841, "grad_norm": 5.970955805599458, "learning_rate": 1.9975132735005778e-05, "loss": 1.0198, "step": 814 }, { "epoch": 0.10379355906840505, "grad_norm": 5.154292249370319, "learning_rate": 1.9974987160540132e-05, "loss": 1.0242, "step": 815 }, { "epoch": 0.10392091312861168, "grad_norm": 5.2176426609278055, "learning_rate": 1.9974841161749875e-05, "loss": 1.1066, "step": 816 }, { "epoch": 0.10404826718881831, "grad_norm": 5.4636206413600235, "learning_rate": 1.997469473864121e-05, "loss": 1.1187, "step": 817 }, { "epoch": 0.10417562124902495, "grad_norm": 5.683022885312907, "learning_rate": 1.997454789122037e-05, "loss": 1.1843, "step": 818 }, { "epoch": 0.10430297530923158, "grad_norm": 5.910681621146795, "learning_rate": 1.9974400619493595e-05, "loss": 1.0454, "step": 819 }, { "epoch": 0.1044303293694382, "grad_norm": 5.36205187083293, "learning_rate": 1.9974252923467162e-05, "loss": 0.9426, "step": 820 }, { "epoch": 0.10455768342964485, "grad_norm": 5.648169546775838, "learning_rate": 1.9974104803147344e-05, "loss": 1.0387, "step": 821 }, { "epoch": 0.10468503748985147, "grad_norm": 4.938694869698999, "learning_rate": 1.9973956258540438e-05, "loss": 1.0432, "step": 822 }, { "epoch": 0.1048123915500581, "grad_norm": 6.627397211487185, "learning_rate": 1.9973807289652777e-05, "loss": 1.1652, "step": 823 }, { "epoch": 0.10493974561026474, "grad_norm": 6.178454019201148, "learning_rate": 1.9973657896490687e-05, "loss": 1.0397, "step": 824 }, { "epoch": 0.10506709967047137, "grad_norm": 6.174888628368443, "learning_rate": 1.9973508079060524e-05, "loss": 0.978, "step": 825 }, { "epoch": 0.105194453730678, "grad_norm": 6.708740158855683, "learning_rate": 1.9973357837368664e-05, "loss": 0.9996, "step": 826 }, { "epoch": 0.10532180779088464, "grad_norm": 7.273286451170561, "learning_rate": 1.99732071714215e-05, "loss": 0.9919, "step": 827 }, { "epoch": 0.10544916185109127, "grad_norm": 4.836429305944261, "learning_rate": 1.997305608122544e-05, "loss": 0.9303, "step": 828 }, { "epoch": 0.1055765159112979, "grad_norm": 5.203196115551915, "learning_rate": 1.9972904566786903e-05, "loss": 0.9287, "step": 829 }, { "epoch": 0.10570386997150454, "grad_norm": 5.355710926325952, "learning_rate": 1.9972752628112344e-05, "loss": 1.0132, "step": 830 }, { "epoch": 0.10583122403171116, "grad_norm": 5.978964807733789, "learning_rate": 1.9972600265208224e-05, "loss": 1.0344, "step": 831 }, { "epoch": 0.10595857809191779, "grad_norm": 4.315744257777001, "learning_rate": 1.9972447478081026e-05, "loss": 1.1191, "step": 832 }, { "epoch": 0.10608593215212442, "grad_norm": 5.5114553971235605, "learning_rate": 1.9972294266737243e-05, "loss": 1.0505, "step": 833 }, { "epoch": 0.10621328621233106, "grad_norm": 7.059071476639452, "learning_rate": 1.9972140631183396e-05, "loss": 1.2202, "step": 834 }, { "epoch": 0.10634064027253769, "grad_norm": 4.830647719965635, "learning_rate": 1.9971986571426024e-05, "loss": 1.0377, "step": 835 }, { "epoch": 0.10646799433274431, "grad_norm": 5.099515121448014, "learning_rate": 1.9971832087471678e-05, "loss": 1.0378, "step": 836 }, { "epoch": 0.10659534839295096, "grad_norm": 5.874044137666145, "learning_rate": 1.9971677179326925e-05, "loss": 1.0322, "step": 837 }, { "epoch": 0.10672270245315758, "grad_norm": 6.157219050476186, "learning_rate": 1.9971521846998362e-05, "loss": 1.0744, "step": 838 }, { "epoch": 0.10685005651336421, "grad_norm": 4.74324840763616, "learning_rate": 1.997136609049259e-05, "loss": 0.9892, "step": 839 }, { "epoch": 0.10697741057357085, "grad_norm": 3.737316042860656, "learning_rate": 1.997120990981624e-05, "loss": 1.082, "step": 840 }, { "epoch": 0.10710476463377748, "grad_norm": 5.970295491277951, "learning_rate": 1.9971053304975955e-05, "loss": 1.0749, "step": 841 }, { "epoch": 0.10723211869398411, "grad_norm": 8.05229262408578, "learning_rate": 1.9970896275978392e-05, "loss": 1.0926, "step": 842 }, { "epoch": 0.10735947275419075, "grad_norm": 4.885471541605568, "learning_rate": 1.9970738822830237e-05, "loss": 0.9979, "step": 843 }, { "epoch": 0.10748682681439738, "grad_norm": 6.75132907922118, "learning_rate": 1.9970580945538187e-05, "loss": 1.0148, "step": 844 }, { "epoch": 0.107614180874604, "grad_norm": 5.0097069928342846, "learning_rate": 1.997042264410895e-05, "loss": 1.1069, "step": 845 }, { "epoch": 0.10774153493481065, "grad_norm": 5.598578899449339, "learning_rate": 1.9970263918549274e-05, "loss": 1.0181, "step": 846 }, { "epoch": 0.10786888899501727, "grad_norm": 5.37790388575956, "learning_rate": 1.9970104768865895e-05, "loss": 1.0247, "step": 847 }, { "epoch": 0.1079962430552239, "grad_norm": 6.595977551571855, "learning_rate": 1.9969945195065596e-05, "loss": 1.0204, "step": 848 }, { "epoch": 0.10812359711543054, "grad_norm": 7.964236907516185, "learning_rate": 1.996978519715516e-05, "loss": 0.9665, "step": 849 }, { "epoch": 0.10825095117563717, "grad_norm": 6.392637201835179, "learning_rate": 1.9969624775141393e-05, "loss": 1.0868, "step": 850 }, { "epoch": 0.1083783052358438, "grad_norm": 5.587715175504954, "learning_rate": 1.9969463929031118e-05, "loss": 0.9563, "step": 851 }, { "epoch": 0.10850565929605044, "grad_norm": 4.658229330377023, "learning_rate": 1.9969302658831182e-05, "loss": 1.0354, "step": 852 }, { "epoch": 0.10863301335625707, "grad_norm": 4.787778299474807, "learning_rate": 1.996914096454844e-05, "loss": 0.9508, "step": 853 }, { "epoch": 0.1087603674164637, "grad_norm": 4.867284685024919, "learning_rate": 1.996897884618977e-05, "loss": 1.0619, "step": 854 }, { "epoch": 0.10888772147667033, "grad_norm": 5.147589421211179, "learning_rate": 1.9968816303762076e-05, "loss": 0.9208, "step": 855 }, { "epoch": 0.10901507553687696, "grad_norm": 6.599867455278636, "learning_rate": 1.9968653337272262e-05, "loss": 0.9198, "step": 856 }, { "epoch": 0.10914242959708359, "grad_norm": 7.3878300162248465, "learning_rate": 1.9968489946727265e-05, "loss": 1.0832, "step": 857 }, { "epoch": 0.10926978365729022, "grad_norm": 5.149081153750179, "learning_rate": 1.996832613213404e-05, "loss": 1.0392, "step": 858 }, { "epoch": 0.10939713771749686, "grad_norm": 7.385644504779292, "learning_rate": 1.9968161893499548e-05, "loss": 1.0305, "step": 859 }, { "epoch": 0.10952449177770349, "grad_norm": 6.126046850311139, "learning_rate": 1.9967997230830784e-05, "loss": 1.0887, "step": 860 }, { "epoch": 0.10965184583791011, "grad_norm": 7.1861931981170715, "learning_rate": 1.996783214413474e-05, "loss": 1.109, "step": 861 }, { "epoch": 0.10977919989811676, "grad_norm": 6.273300682466664, "learning_rate": 1.9967666633418454e-05, "loss": 1.0648, "step": 862 }, { "epoch": 0.10990655395832338, "grad_norm": 6.694825340539664, "learning_rate": 1.996750069868895e-05, "loss": 1.1229, "step": 863 }, { "epoch": 0.11003390801853001, "grad_norm": 4.287595849036246, "learning_rate": 1.9967334339953303e-05, "loss": 0.9974, "step": 864 }, { "epoch": 0.11016126207873665, "grad_norm": 5.7722573112928215, "learning_rate": 1.996716755721858e-05, "loss": 1.0254, "step": 865 }, { "epoch": 0.11028861613894328, "grad_norm": 4.54564301360154, "learning_rate": 1.9967000350491873e-05, "loss": 0.947, "step": 866 }, { "epoch": 0.11041597019914991, "grad_norm": 4.937993278489573, "learning_rate": 1.9966832719780305e-05, "loss": 1.105, "step": 867 }, { "epoch": 0.11054332425935655, "grad_norm": 5.060781116430827, "learning_rate": 1.9966664665090997e-05, "loss": 1.111, "step": 868 }, { "epoch": 0.11067067831956318, "grad_norm": 6.785440723383181, "learning_rate": 1.9966496186431106e-05, "loss": 1.0737, "step": 869 }, { "epoch": 0.1107980323797698, "grad_norm": 6.19654754989308, "learning_rate": 1.9966327283807794e-05, "loss": 1.0739, "step": 870 }, { "epoch": 0.11092538643997644, "grad_norm": 6.420826475951312, "learning_rate": 1.996615795722825e-05, "loss": 1.0866, "step": 871 }, { "epoch": 0.11105274050018307, "grad_norm": 5.052261359889531, "learning_rate": 1.996598820669967e-05, "loss": 1.0155, "step": 872 }, { "epoch": 0.1111800945603897, "grad_norm": 4.816232002119726, "learning_rate": 1.996581803222928e-05, "loss": 1.0649, "step": 873 }, { "epoch": 0.11130744862059634, "grad_norm": 4.5369893524939595, "learning_rate": 1.9965647433824315e-05, "loss": 0.9625, "step": 874 }, { "epoch": 0.11143480268080297, "grad_norm": 5.307753652943214, "learning_rate": 1.9965476411492043e-05, "loss": 0.9882, "step": 875 }, { "epoch": 0.1115621567410096, "grad_norm": 6.092648763675741, "learning_rate": 1.9965304965239723e-05, "loss": 0.8954, "step": 876 }, { "epoch": 0.11168951080121624, "grad_norm": 6.984597836210311, "learning_rate": 1.996513309507466e-05, "loss": 1.0302, "step": 877 }, { "epoch": 0.11181686486142287, "grad_norm": 5.913009731264718, "learning_rate": 1.9964960801004164e-05, "loss": 1.082, "step": 878 }, { "epoch": 0.11194421892162949, "grad_norm": 4.500339071427008, "learning_rate": 1.9964788083035554e-05, "loss": 1.0465, "step": 879 }, { "epoch": 0.11207157298183612, "grad_norm": 6.428641140272211, "learning_rate": 1.9964614941176194e-05, "loss": 1.054, "step": 880 }, { "epoch": 0.11219892704204276, "grad_norm": 5.458770889720949, "learning_rate": 1.9964441375433436e-05, "loss": 0.9883, "step": 881 }, { "epoch": 0.11232628110224939, "grad_norm": 5.406102525761324, "learning_rate": 1.996426738581467e-05, "loss": 0.9914, "step": 882 }, { "epoch": 0.11245363516245602, "grad_norm": 5.6600517858286175, "learning_rate": 1.996409297232729e-05, "loss": 0.9404, "step": 883 }, { "epoch": 0.11258098922266266, "grad_norm": 6.302272858935621, "learning_rate": 1.996391813497872e-05, "loss": 0.9588, "step": 884 }, { "epoch": 0.11270834328286929, "grad_norm": 6.660119593007626, "learning_rate": 1.99637428737764e-05, "loss": 1.0316, "step": 885 }, { "epoch": 0.11283569734307591, "grad_norm": 5.668058934930348, "learning_rate": 1.9963567188727783e-05, "loss": 1.0363, "step": 886 }, { "epoch": 0.11296305140328256, "grad_norm": 4.359857566676804, "learning_rate": 1.9963391079840344e-05, "loss": 1.0509, "step": 887 }, { "epoch": 0.11309040546348918, "grad_norm": 5.11794036685001, "learning_rate": 1.9963214547121573e-05, "loss": 1.0572, "step": 888 }, { "epoch": 0.11321775952369581, "grad_norm": 8.069355260721297, "learning_rate": 1.9963037590578977e-05, "loss": 1.0512, "step": 889 }, { "epoch": 0.11334511358390245, "grad_norm": 5.085180082630144, "learning_rate": 1.9962860210220085e-05, "loss": 0.9946, "step": 890 }, { "epoch": 0.11347246764410908, "grad_norm": 6.3451816089753414, "learning_rate": 1.9962682406052445e-05, "loss": 1.0836, "step": 891 }, { "epoch": 0.1135998217043157, "grad_norm": 4.30454150054419, "learning_rate": 1.996250417808362e-05, "loss": 1.0612, "step": 892 }, { "epoch": 0.11372717576452235, "grad_norm": 6.121196889586544, "learning_rate": 1.996232552632119e-05, "loss": 1.1126, "step": 893 }, { "epoch": 0.11385452982472898, "grad_norm": 5.352515632788303, "learning_rate": 1.9962146450772756e-05, "loss": 1.0238, "step": 894 }, { "epoch": 0.1139818838849356, "grad_norm": 6.644230409121299, "learning_rate": 1.9961966951445933e-05, "loss": 1.008, "step": 895 }, { "epoch": 0.11410923794514224, "grad_norm": 5.595505082119347, "learning_rate": 1.9961787028348357e-05, "loss": 1.085, "step": 896 }, { "epoch": 0.11423659200534887, "grad_norm": 5.556965678651519, "learning_rate": 1.9961606681487685e-05, "loss": 1.0309, "step": 897 }, { "epoch": 0.1143639460655555, "grad_norm": 4.581100382319801, "learning_rate": 1.996142591087159e-05, "loss": 1.0942, "step": 898 }, { "epoch": 0.11449130012576214, "grad_norm": 5.835283153368007, "learning_rate": 1.9961244716507757e-05, "loss": 1.1027, "step": 899 }, { "epoch": 0.11461865418596877, "grad_norm": 5.063142900500023, "learning_rate": 1.9961063098403897e-05, "loss": 0.9936, "step": 900 }, { "epoch": 0.1147460082461754, "grad_norm": 4.51443898818577, "learning_rate": 1.9960881056567732e-05, "loss": 0.9346, "step": 901 }, { "epoch": 0.11487336230638202, "grad_norm": 5.329502856059749, "learning_rate": 1.9960698591007008e-05, "loss": 1.0803, "step": 902 }, { "epoch": 0.11500071636658867, "grad_norm": 4.504037740993079, "learning_rate": 1.996051570172949e-05, "loss": 0.9776, "step": 903 }, { "epoch": 0.11512807042679529, "grad_norm": 6.117784882628449, "learning_rate": 1.9960332388742952e-05, "loss": 1.0179, "step": 904 }, { "epoch": 0.11525542448700192, "grad_norm": 5.357360995256535, "learning_rate": 1.99601486520552e-05, "loss": 1.1039, "step": 905 }, { "epoch": 0.11538277854720856, "grad_norm": 10.642215559466937, "learning_rate": 1.995996449167404e-05, "loss": 0.9746, "step": 906 }, { "epoch": 0.11551013260741519, "grad_norm": 5.78019713016134, "learning_rate": 1.9959779907607317e-05, "loss": 0.9421, "step": 907 }, { "epoch": 0.11563748666762182, "grad_norm": 6.034713778278319, "learning_rate": 1.995959489986287e-05, "loss": 1.1411, "step": 908 }, { "epoch": 0.11576484072782846, "grad_norm": 5.582388650865901, "learning_rate": 1.9959409468448582e-05, "loss": 1.0087, "step": 909 }, { "epoch": 0.11589219478803509, "grad_norm": 5.796074084454103, "learning_rate": 1.995922361337233e-05, "loss": 1.1047, "step": 910 }, { "epoch": 0.11601954884824171, "grad_norm": 6.864811231180803, "learning_rate": 1.9959037334642027e-05, "loss": 1.0263, "step": 911 }, { "epoch": 0.11614690290844835, "grad_norm": 5.806496093265156, "learning_rate": 1.9958850632265595e-05, "loss": 0.9675, "step": 912 }, { "epoch": 0.11627425696865498, "grad_norm": 4.753198851564046, "learning_rate": 1.995866350625098e-05, "loss": 0.9769, "step": 913 }, { "epoch": 0.11640161102886161, "grad_norm": 5.277047375560149, "learning_rate": 1.9958475956606133e-05, "loss": 1.0622, "step": 914 }, { "epoch": 0.11652896508906825, "grad_norm": 4.805170397798593, "learning_rate": 1.995828798333904e-05, "loss": 1.0204, "step": 915 }, { "epoch": 0.11665631914927488, "grad_norm": 3.8686471361527017, "learning_rate": 1.9958099586457696e-05, "loss": 1.0807, "step": 916 }, { "epoch": 0.1167836732094815, "grad_norm": 5.415077015337016, "learning_rate": 1.9957910765970114e-05, "loss": 1.0983, "step": 917 }, { "epoch": 0.11691102726968815, "grad_norm": 6.980435687788938, "learning_rate": 1.9957721521884322e-05, "loss": 1.1048, "step": 918 }, { "epoch": 0.11703838132989478, "grad_norm": 4.371332647836583, "learning_rate": 1.9957531854208378e-05, "loss": 1.0857, "step": 919 }, { "epoch": 0.1171657353901014, "grad_norm": 5.134828586707463, "learning_rate": 1.9957341762950346e-05, "loss": 1.0729, "step": 920 }, { "epoch": 0.11729308945030804, "grad_norm": 5.138492594150226, "learning_rate": 1.9957151248118314e-05, "loss": 1.131, "step": 921 }, { "epoch": 0.11742044351051467, "grad_norm": 4.016436424357209, "learning_rate": 1.9956960309720385e-05, "loss": 1.0131, "step": 922 }, { "epoch": 0.1175477975707213, "grad_norm": 5.120844016543802, "learning_rate": 1.995676894776468e-05, "loss": 1.0525, "step": 923 }, { "epoch": 0.11767515163092793, "grad_norm": 6.446405565457038, "learning_rate": 1.995657716225934e-05, "loss": 1.0622, "step": 924 }, { "epoch": 0.11780250569113457, "grad_norm": 5.7319024599644814, "learning_rate": 1.9956384953212526e-05, "loss": 1.0326, "step": 925 }, { "epoch": 0.1179298597513412, "grad_norm": 5.503297337568938, "learning_rate": 1.995619232063241e-05, "loss": 0.9426, "step": 926 }, { "epoch": 0.11805721381154782, "grad_norm": 5.387966927033936, "learning_rate": 1.9955999264527194e-05, "loss": 1.0234, "step": 927 }, { "epoch": 0.11818456787175446, "grad_norm": 4.988396723164246, "learning_rate": 1.9955805784905083e-05, "loss": 1.0124, "step": 928 }, { "epoch": 0.11831192193196109, "grad_norm": 5.309249422003908, "learning_rate": 1.9955611881774308e-05, "loss": 1.1048, "step": 929 }, { "epoch": 0.11843927599216772, "grad_norm": 5.357774679073277, "learning_rate": 1.995541755514312e-05, "loss": 0.9624, "step": 930 }, { "epoch": 0.11856663005237436, "grad_norm": 6.42531343281874, "learning_rate": 1.9955222805019786e-05, "loss": 1.0409, "step": 931 }, { "epoch": 0.11869398411258099, "grad_norm": 5.131309347239302, "learning_rate": 1.9955027631412584e-05, "loss": 1.1056, "step": 932 }, { "epoch": 0.11882133817278762, "grad_norm": 4.4340370527963895, "learning_rate": 1.9954832034329827e-05, "loss": 0.9607, "step": 933 }, { "epoch": 0.11894869223299426, "grad_norm": 5.347056475736151, "learning_rate": 1.9954636013779826e-05, "loss": 1.0851, "step": 934 }, { "epoch": 0.11907604629320089, "grad_norm": 6.211339474511673, "learning_rate": 1.995443956977093e-05, "loss": 1.0095, "step": 935 }, { "epoch": 0.11920340035340751, "grad_norm": 5.982173504781401, "learning_rate": 1.995424270231148e-05, "loss": 1.0042, "step": 936 }, { "epoch": 0.11933075441361415, "grad_norm": 6.007968697173025, "learning_rate": 1.9954045411409864e-05, "loss": 1.0089, "step": 937 }, { "epoch": 0.11945810847382078, "grad_norm": 4.726619196780303, "learning_rate": 1.9953847697074472e-05, "loss": 0.984, "step": 938 }, { "epoch": 0.11958546253402741, "grad_norm": 4.7802597715905275, "learning_rate": 1.995364955931371e-05, "loss": 1.0665, "step": 939 }, { "epoch": 0.11971281659423405, "grad_norm": 4.51709910688604, "learning_rate": 1.995345099813601e-05, "loss": 0.9949, "step": 940 }, { "epoch": 0.11984017065444068, "grad_norm": 5.500703073126749, "learning_rate": 1.9953252013549816e-05, "loss": 1.0011, "step": 941 }, { "epoch": 0.1199675247146473, "grad_norm": 6.24212088805979, "learning_rate": 1.9953052605563596e-05, "loss": 1.0222, "step": 942 }, { "epoch": 0.12009487877485395, "grad_norm": 4.63458043245687, "learning_rate": 1.995285277418583e-05, "loss": 1.0849, "step": 943 }, { "epoch": 0.12022223283506057, "grad_norm": 3.6977378837815587, "learning_rate": 1.9952652519425016e-05, "loss": 0.9489, "step": 944 }, { "epoch": 0.1203495868952672, "grad_norm": 5.320802371961369, "learning_rate": 1.995245184128968e-05, "loss": 1.1189, "step": 945 }, { "epoch": 0.12047694095547384, "grad_norm": 5.860292896853712, "learning_rate": 1.9952250739788356e-05, "loss": 0.9716, "step": 946 }, { "epoch": 0.12060429501568047, "grad_norm": 5.153604094452106, "learning_rate": 1.995204921492959e-05, "loss": 0.9592, "step": 947 }, { "epoch": 0.1207316490758871, "grad_norm": 7.173086026737213, "learning_rate": 1.995184726672197e-05, "loss": 1.0795, "step": 948 }, { "epoch": 0.12085900313609373, "grad_norm": 6.421640656365196, "learning_rate": 1.9951644895174076e-05, "loss": 1.0193, "step": 949 }, { "epoch": 0.12098635719630037, "grad_norm": 5.4544739024931195, "learning_rate": 1.995144210029452e-05, "loss": 0.9711, "step": 950 }, { "epoch": 0.121113711256507, "grad_norm": 5.661363850647127, "learning_rate": 1.9951238882091926e-05, "loss": 1.0674, "step": 951 }, { "epoch": 0.12124106531671362, "grad_norm": 7.360293450517743, "learning_rate": 1.995103524057494e-05, "loss": 1.0153, "step": 952 }, { "epoch": 0.12136841937692026, "grad_norm": 4.63858305836024, "learning_rate": 1.995083117575223e-05, "loss": 1.1147, "step": 953 }, { "epoch": 0.12149577343712689, "grad_norm": 5.686394552597674, "learning_rate": 1.9950626687632466e-05, "loss": 0.9923, "step": 954 }, { "epoch": 0.12162312749733352, "grad_norm": 6.112128610386265, "learning_rate": 1.9950421776224353e-05, "loss": 0.9806, "step": 955 }, { "epoch": 0.12175048155754016, "grad_norm": 6.282131759471412, "learning_rate": 1.9950216441536613e-05, "loss": 1.165, "step": 956 }, { "epoch": 0.12187783561774679, "grad_norm": 5.361202895850754, "learning_rate": 1.9950010683577968e-05, "loss": 1.0751, "step": 957 }, { "epoch": 0.12200518967795342, "grad_norm": 4.535062251730972, "learning_rate": 1.9949804502357183e-05, "loss": 1.0677, "step": 958 }, { "epoch": 0.12213254373816006, "grad_norm": 6.974919756902898, "learning_rate": 1.994959789788302e-05, "loss": 1.0085, "step": 959 }, { "epoch": 0.12225989779836668, "grad_norm": 5.319029621165861, "learning_rate": 1.994939087016427e-05, "loss": 1.004, "step": 960 }, { "epoch": 0.12238725185857331, "grad_norm": 6.546630563474553, "learning_rate": 1.9949183419209747e-05, "loss": 0.9709, "step": 961 }, { "epoch": 0.12251460591877995, "grad_norm": 5.408159569220149, "learning_rate": 1.9948975545028263e-05, "loss": 1.0071, "step": 962 }, { "epoch": 0.12264195997898658, "grad_norm": 5.251176611431323, "learning_rate": 1.994876724762867e-05, "loss": 1.0617, "step": 963 }, { "epoch": 0.12276931403919321, "grad_norm": 4.902891917186358, "learning_rate": 1.9948558527019826e-05, "loss": 1.0336, "step": 964 }, { "epoch": 0.12289666809939985, "grad_norm": 6.705538939706773, "learning_rate": 1.994834938321061e-05, "loss": 1.0693, "step": 965 }, { "epoch": 0.12302402215960648, "grad_norm": 6.087297201988893, "learning_rate": 1.994813981620992e-05, "loss": 1.0077, "step": 966 }, { "epoch": 0.1231513762198131, "grad_norm": 4.56559021693041, "learning_rate": 1.9947929826026668e-05, "loss": 1.0278, "step": 967 }, { "epoch": 0.12327873028001975, "grad_norm": 4.71483525462677, "learning_rate": 1.9947719412669787e-05, "loss": 1.1168, "step": 968 }, { "epoch": 0.12340608434022637, "grad_norm": 6.379245316884144, "learning_rate": 1.994750857614823e-05, "loss": 1.0561, "step": 969 }, { "epoch": 0.123533438400433, "grad_norm": 5.052821385327841, "learning_rate": 1.9947297316470963e-05, "loss": 1.1216, "step": 970 }, { "epoch": 0.12366079246063963, "grad_norm": 4.041449839225834, "learning_rate": 1.9947085633646977e-05, "loss": 0.9505, "step": 971 }, { "epoch": 0.12378814652084627, "grad_norm": 4.702999278064229, "learning_rate": 1.994687352768527e-05, "loss": 1.1083, "step": 972 }, { "epoch": 0.1239155005810529, "grad_norm": 4.5330116793643835, "learning_rate": 1.994666099859487e-05, "loss": 1.0322, "step": 973 }, { "epoch": 0.12404285464125953, "grad_norm": 5.515008735269388, "learning_rate": 1.994644804638482e-05, "loss": 1.005, "step": 974 }, { "epoch": 0.12417020870146617, "grad_norm": 8.7381555226762, "learning_rate": 1.9946234671064172e-05, "loss": 0.9686, "step": 975 }, { "epoch": 0.1242975627616728, "grad_norm": 5.550035376908558, "learning_rate": 1.994602087264201e-05, "loss": 1.108, "step": 976 }, { "epoch": 0.12442491682187942, "grad_norm": 5.977919959197699, "learning_rate": 1.994580665112742e-05, "loss": 1.0092, "step": 977 }, { "epoch": 0.12455227088208606, "grad_norm": 4.282609932418776, "learning_rate": 1.994559200652952e-05, "loss": 1.0182, "step": 978 }, { "epoch": 0.12467962494229269, "grad_norm": 5.907460245135092, "learning_rate": 1.9945376938857443e-05, "loss": 0.9318, "step": 979 }, { "epoch": 0.12480697900249932, "grad_norm": 4.831551629010998, "learning_rate": 1.994516144812033e-05, "loss": 1.0026, "step": 980 }, { "epoch": 0.12493433306270596, "grad_norm": 4.311846255079303, "learning_rate": 1.994494553432736e-05, "loss": 0.9798, "step": 981 }, { "epoch": 0.12506168712291257, "grad_norm": 5.6537046659323265, "learning_rate": 1.9944729197487702e-05, "loss": 1.0372, "step": 982 }, { "epoch": 0.12518904118311922, "grad_norm": 8.135826527084301, "learning_rate": 1.994451243761057e-05, "loss": 1.1158, "step": 983 }, { "epoch": 0.12531639524332586, "grad_norm": 7.315360789394048, "learning_rate": 1.9944295254705187e-05, "loss": 1.1006, "step": 984 }, { "epoch": 0.12544374930353247, "grad_norm": 8.115377978949036, "learning_rate": 1.994407764878078e-05, "loss": 1.0766, "step": 985 }, { "epoch": 0.1255711033637391, "grad_norm": 5.391810013262356, "learning_rate": 1.9943859619846617e-05, "loss": 1.0636, "step": 986 }, { "epoch": 0.12569845742394575, "grad_norm": 6.15608052441614, "learning_rate": 1.9943641167911965e-05, "loss": 1.0649, "step": 987 }, { "epoch": 0.12582581148415237, "grad_norm": 8.880531843600096, "learning_rate": 1.994342229298612e-05, "loss": 1.0144, "step": 988 }, { "epoch": 0.125953165544359, "grad_norm": 6.462124439889047, "learning_rate": 1.9943202995078394e-05, "loss": 1.0238, "step": 989 }, { "epoch": 0.12608051960456565, "grad_norm": 10.131981973721643, "learning_rate": 1.994298327419811e-05, "loss": 1.0622, "step": 990 }, { "epoch": 0.12620787366477226, "grad_norm": 6.29611830033835, "learning_rate": 1.9942763130354624e-05, "loss": 1.1349, "step": 991 }, { "epoch": 0.1263352277249789, "grad_norm": 5.458378138855743, "learning_rate": 1.9942542563557294e-05, "loss": 1.0549, "step": 992 }, { "epoch": 0.12646258178518555, "grad_norm": 5.555032754246985, "learning_rate": 1.9942321573815502e-05, "loss": 0.9425, "step": 993 }, { "epoch": 0.12658993584539216, "grad_norm": 4.502659753383141, "learning_rate": 1.994210016113865e-05, "loss": 1.1005, "step": 994 }, { "epoch": 0.1267172899055988, "grad_norm": 5.353733663701035, "learning_rate": 1.994187832553616e-05, "loss": 1.0607, "step": 995 }, { "epoch": 0.12684464396580544, "grad_norm": 3.80325171228991, "learning_rate": 1.9941656067017466e-05, "loss": 1.139, "step": 996 }, { "epoch": 0.12697199802601206, "grad_norm": 4.255154345141497, "learning_rate": 1.9941433385592022e-05, "loss": 0.9297, "step": 997 }, { "epoch": 0.1270993520862187, "grad_norm": 4.50577894650802, "learning_rate": 1.99412102812693e-05, "loss": 1.0254, "step": 998 }, { "epoch": 0.12722670614642534, "grad_norm": 6.663816351330356, "learning_rate": 1.9940986754058792e-05, "loss": 1.1039, "step": 999 }, { "epoch": 0.12735406020663195, "grad_norm": 6.1321546845655694, "learning_rate": 1.9940762803970006e-05, "loss": 1.0365, "step": 1000 }, { "epoch": 0.1274814142668386, "grad_norm": 5.012189883106625, "learning_rate": 1.9940538431012472e-05, "loss": 1.0125, "step": 1001 }, { "epoch": 0.12760876832704524, "grad_norm": 4.761917222778728, "learning_rate": 1.9940313635195728e-05, "loss": 1.0547, "step": 1002 }, { "epoch": 0.12773612238725185, "grad_norm": 5.289529123387824, "learning_rate": 1.9940088416529342e-05, "loss": 1.0712, "step": 1003 }, { "epoch": 0.1278634764474585, "grad_norm": 5.437705902059703, "learning_rate": 1.9939862775022893e-05, "loss": 1.0807, "step": 1004 }, { "epoch": 0.12799083050766513, "grad_norm": 6.951708291416241, "learning_rate": 1.993963671068598e-05, "loss": 1.0059, "step": 1005 }, { "epoch": 0.12811818456787175, "grad_norm": 4.9604876456962845, "learning_rate": 1.9939410223528215e-05, "loss": 1.0181, "step": 1006 }, { "epoch": 0.1282455386280784, "grad_norm": 7.510870367376681, "learning_rate": 1.993918331355924e-05, "loss": 1.0302, "step": 1007 }, { "epoch": 0.12837289268828503, "grad_norm": 5.293053432786597, "learning_rate": 1.9938955980788703e-05, "loss": 1.0705, "step": 1008 }, { "epoch": 0.12850024674849164, "grad_norm": 6.103049982237845, "learning_rate": 1.9938728225226273e-05, "loss": 0.9717, "step": 1009 }, { "epoch": 0.12862760080869828, "grad_norm": 4.882607026926755, "learning_rate": 1.9938500046881643e-05, "loss": 0.9793, "step": 1010 }, { "epoch": 0.12875495486890493, "grad_norm": 5.646936831956459, "learning_rate": 1.9938271445764515e-05, "loss": 0.9924, "step": 1011 }, { "epoch": 0.12888230892911154, "grad_norm": 6.42042187216709, "learning_rate": 1.9938042421884617e-05, "loss": 0.9512, "step": 1012 }, { "epoch": 0.12900966298931818, "grad_norm": 4.131351782487081, "learning_rate": 1.993781297525169e-05, "loss": 1.0183, "step": 1013 }, { "epoch": 0.12913701704952482, "grad_norm": 5.587315444117175, "learning_rate": 1.9937583105875494e-05, "loss": 0.9225, "step": 1014 }, { "epoch": 0.12926437110973144, "grad_norm": 5.817599121088648, "learning_rate": 1.9937352813765808e-05, "loss": 0.9889, "step": 1015 }, { "epoch": 0.12939172516993808, "grad_norm": 4.717808307383814, "learning_rate": 1.9937122098932428e-05, "loss": 0.9959, "step": 1016 }, { "epoch": 0.12951907923014472, "grad_norm": 4.6219142208021475, "learning_rate": 1.9936890961385168e-05, "loss": 0.9909, "step": 1017 }, { "epoch": 0.12964643329035133, "grad_norm": 5.360274258899001, "learning_rate": 1.993665940113386e-05, "loss": 1.0057, "step": 1018 }, { "epoch": 0.12977378735055797, "grad_norm": 5.580861336737161, "learning_rate": 1.9936427418188357e-05, "loss": 0.9989, "step": 1019 }, { "epoch": 0.12990114141076461, "grad_norm": 5.159156858475661, "learning_rate": 1.9936195012558524e-05, "loss": 0.9662, "step": 1020 }, { "epoch": 0.13002849547097123, "grad_norm": 5.548067034656748, "learning_rate": 1.993596218425425e-05, "loss": 1.0041, "step": 1021 }, { "epoch": 0.13015584953117787, "grad_norm": 4.9843518486800695, "learning_rate": 1.9935728933285438e-05, "loss": 1.0422, "step": 1022 }, { "epoch": 0.1302832035913845, "grad_norm": 4.844871735661303, "learning_rate": 1.9935495259662008e-05, "loss": 1.0537, "step": 1023 }, { "epoch": 0.13041055765159112, "grad_norm": 5.4011935363233645, "learning_rate": 1.9935261163393904e-05, "loss": 0.9057, "step": 1024 }, { "epoch": 0.13053791171179777, "grad_norm": 5.113145096425448, "learning_rate": 1.9935026644491082e-05, "loss": 1.0, "step": 1025 }, { "epoch": 0.1306652657720044, "grad_norm": 4.776971436010882, "learning_rate": 1.9934791702963515e-05, "loss": 1.0148, "step": 1026 }, { "epoch": 0.13079261983221102, "grad_norm": 5.132128897887906, "learning_rate": 1.9934556338821206e-05, "loss": 0.9756, "step": 1027 }, { "epoch": 0.13091997389241766, "grad_norm": 4.755116768575519, "learning_rate": 1.9934320552074162e-05, "loss": 1.0135, "step": 1028 }, { "epoch": 0.13104732795262428, "grad_norm": 4.623487276696922, "learning_rate": 1.9934084342732413e-05, "loss": 0.9728, "step": 1029 }, { "epoch": 0.13117468201283092, "grad_norm": 5.275131383046155, "learning_rate": 1.9933847710806e-05, "loss": 0.9805, "step": 1030 }, { "epoch": 0.13130203607303756, "grad_norm": 5.539016357574028, "learning_rate": 1.9933610656305006e-05, "loss": 1.1283, "step": 1031 }, { "epoch": 0.13142939013324417, "grad_norm": 5.368997460236626, "learning_rate": 1.99333731792395e-05, "loss": 1.0306, "step": 1032 }, { "epoch": 0.13155674419345081, "grad_norm": 4.374498829065241, "learning_rate": 1.9933135279619592e-05, "loss": 0.9264, "step": 1033 }, { "epoch": 0.13168409825365746, "grad_norm": 4.632795825947545, "learning_rate": 1.9932896957455397e-05, "loss": 0.9511, "step": 1034 }, { "epoch": 0.13181145231386407, "grad_norm": 5.802893791461331, "learning_rate": 1.9932658212757053e-05, "loss": 1.0492, "step": 1035 }, { "epoch": 0.1319388063740707, "grad_norm": 7.5402494612894975, "learning_rate": 1.9932419045534724e-05, "loss": 1.1128, "step": 1036 }, { "epoch": 0.13206616043427735, "grad_norm": 5.214650079795567, "learning_rate": 1.9932179455798574e-05, "loss": 0.9812, "step": 1037 }, { "epoch": 0.13219351449448397, "grad_norm": 5.761631024226228, "learning_rate": 1.9931939443558803e-05, "loss": 1.0368, "step": 1038 }, { "epoch": 0.1323208685546906, "grad_norm": 5.505637660709738, "learning_rate": 1.993169900882561e-05, "loss": 1.0554, "step": 1039 }, { "epoch": 0.13244822261489725, "grad_norm": 7.051467786001133, "learning_rate": 1.9931458151609234e-05, "loss": 1.0514, "step": 1040 }, { "epoch": 0.13257557667510386, "grad_norm": 6.307248187578084, "learning_rate": 1.9931216871919914e-05, "loss": 1.0074, "step": 1041 }, { "epoch": 0.1327029307353105, "grad_norm": 5.105711016930289, "learning_rate": 1.9930975169767918e-05, "loss": 1.0188, "step": 1042 }, { "epoch": 0.13283028479551715, "grad_norm": 5.433468782261706, "learning_rate": 1.9930733045163525e-05, "loss": 0.9844, "step": 1043 }, { "epoch": 0.13295763885572376, "grad_norm": 6.700120372544943, "learning_rate": 1.9930490498117035e-05, "loss": 1.0511, "step": 1044 }, { "epoch": 0.1330849929159304, "grad_norm": 6.7738719831634775, "learning_rate": 1.9930247528638768e-05, "loss": 0.9928, "step": 1045 }, { "epoch": 0.13321234697613704, "grad_norm": 6.388895187743395, "learning_rate": 1.9930004136739058e-05, "loss": 0.9695, "step": 1046 }, { "epoch": 0.13333970103634366, "grad_norm": 5.2922802466753955, "learning_rate": 1.9929760322428256e-05, "loss": 0.9448, "step": 1047 }, { "epoch": 0.1334670550965503, "grad_norm": 5.865390141642061, "learning_rate": 1.9929516085716736e-05, "loss": 1.1244, "step": 1048 }, { "epoch": 0.13359440915675694, "grad_norm": 5.007676716360697, "learning_rate": 1.992927142661489e-05, "loss": 0.9693, "step": 1049 }, { "epoch": 0.13372176321696355, "grad_norm": 5.057958001797684, "learning_rate": 1.992902634513312e-05, "loss": 0.949, "step": 1050 }, { "epoch": 0.1338491172771702, "grad_norm": 4.937284440755442, "learning_rate": 1.9928780841281858e-05, "loss": 1.0291, "step": 1051 }, { "epoch": 0.13397647133737683, "grad_norm": 5.169117541086143, "learning_rate": 1.9928534915071543e-05, "loss": 1.0791, "step": 1052 }, { "epoch": 0.13410382539758345, "grad_norm": 4.1750885091753736, "learning_rate": 1.9928288566512638e-05, "loss": 1.0141, "step": 1053 }, { "epoch": 0.1342311794577901, "grad_norm": 7.6227363603328575, "learning_rate": 1.9928041795615616e-05, "loss": 0.9137, "step": 1054 }, { "epoch": 0.13435853351799673, "grad_norm": 5.799733570161933, "learning_rate": 1.992779460239099e-05, "loss": 1.039, "step": 1055 }, { "epoch": 0.13448588757820334, "grad_norm": 5.177060573023828, "learning_rate": 1.9927546986849258e-05, "loss": 0.977, "step": 1056 }, { "epoch": 0.13461324163841, "grad_norm": 7.3578280951204444, "learning_rate": 1.9927298949000965e-05, "loss": 0.9648, "step": 1057 }, { "epoch": 0.13474059569861663, "grad_norm": 4.946775198994267, "learning_rate": 1.9927050488856657e-05, "loss": 1.1389, "step": 1058 }, { "epoch": 0.13486794975882324, "grad_norm": 6.061502590914894, "learning_rate": 1.9926801606426906e-05, "loss": 1.0901, "step": 1059 }, { "epoch": 0.13499530381902988, "grad_norm": 5.923565706250715, "learning_rate": 1.992655230172229e-05, "loss": 1.0891, "step": 1060 }, { "epoch": 0.13512265787923652, "grad_norm": 4.96207252794465, "learning_rate": 1.992630257475343e-05, "loss": 0.9961, "step": 1061 }, { "epoch": 0.13525001193944314, "grad_norm": 7.651995788732587, "learning_rate": 1.9926052425530936e-05, "loss": 0.9719, "step": 1062 }, { "epoch": 0.13537736599964978, "grad_norm": 6.374845016689928, "learning_rate": 1.9925801854065456e-05, "loss": 0.9967, "step": 1063 }, { "epoch": 0.13550472005985642, "grad_norm": 5.183903929938708, "learning_rate": 1.9925550860367646e-05, "loss": 0.986, "step": 1064 }, { "epoch": 0.13563207412006303, "grad_norm": 5.011184200046132, "learning_rate": 1.9925299444448183e-05, "loss": 0.9907, "step": 1065 }, { "epoch": 0.13575942818026968, "grad_norm": 6.122976786381155, "learning_rate": 1.9925047606317766e-05, "loss": 1.0059, "step": 1066 }, { "epoch": 0.13588678224047632, "grad_norm": 5.423363560969625, "learning_rate": 1.9924795345987103e-05, "loss": 1.0698, "step": 1067 }, { "epoch": 0.13601413630068293, "grad_norm": 5.226175473513691, "learning_rate": 1.9924542663466925e-05, "loss": 1.0515, "step": 1068 }, { "epoch": 0.13614149036088957, "grad_norm": 6.246310861904402, "learning_rate": 1.9924289558767982e-05, "loss": 1.096, "step": 1069 }, { "epoch": 0.1362688444210962, "grad_norm": 6.652299021796196, "learning_rate": 1.9924036031901042e-05, "loss": 1.0495, "step": 1070 }, { "epoch": 0.13639619848130283, "grad_norm": 5.8620075466350805, "learning_rate": 1.992378208287689e-05, "loss": 0.9825, "step": 1071 }, { "epoch": 0.13652355254150947, "grad_norm": 6.050447734312274, "learning_rate": 1.992352771170633e-05, "loss": 0.9908, "step": 1072 }, { "epoch": 0.13665090660171608, "grad_norm": 5.240944454895212, "learning_rate": 1.9923272918400175e-05, "loss": 0.9909, "step": 1073 }, { "epoch": 0.13677826066192272, "grad_norm": 5.091152909868271, "learning_rate": 1.992301770296927e-05, "loss": 1.0386, "step": 1074 }, { "epoch": 0.13690561472212937, "grad_norm": 5.6087775377108064, "learning_rate": 1.9922762065424474e-05, "loss": 1.0048, "step": 1075 }, { "epoch": 0.13703296878233598, "grad_norm": 6.818629036310338, "learning_rate": 1.992250600577665e-05, "loss": 0.9762, "step": 1076 }, { "epoch": 0.13716032284254262, "grad_norm": 6.777493187743689, "learning_rate": 1.9922249524036704e-05, "loss": 1.1636, "step": 1077 }, { "epoch": 0.13728767690274926, "grad_norm": 4.841139354431293, "learning_rate": 1.9921992620215544e-05, "loss": 1.0353, "step": 1078 }, { "epoch": 0.13741503096295588, "grad_norm": 4.447936344624334, "learning_rate": 1.992173529432409e-05, "loss": 1.0011, "step": 1079 }, { "epoch": 0.13754238502316252, "grad_norm": 7.258787157460566, "learning_rate": 1.9921477546373296e-05, "loss": 0.9964, "step": 1080 }, { "epoch": 0.13766973908336916, "grad_norm": 6.816925011599404, "learning_rate": 1.9921219376374123e-05, "loss": 0.8766, "step": 1081 }, { "epoch": 0.13779709314357577, "grad_norm": 5.154850070375168, "learning_rate": 1.9920960784337552e-05, "loss": 1.0945, "step": 1082 }, { "epoch": 0.1379244472037824, "grad_norm": 3.4197004541756377, "learning_rate": 1.992070177027459e-05, "loss": 0.96, "step": 1083 }, { "epoch": 0.13805180126398905, "grad_norm": 4.389033453340384, "learning_rate": 1.9920442334196248e-05, "loss": 1.0093, "step": 1084 }, { "epoch": 0.13817915532419567, "grad_norm": 5.624768231173525, "learning_rate": 1.9920182476113564e-05, "loss": 0.9758, "step": 1085 }, { "epoch": 0.1383065093844023, "grad_norm": 4.090395885605275, "learning_rate": 1.9919922196037596e-05, "loss": 0.984, "step": 1086 }, { "epoch": 0.13843386344460895, "grad_norm": 4.890555384326312, "learning_rate": 1.9919661493979408e-05, "loss": 1.0094, "step": 1087 }, { "epoch": 0.13856121750481556, "grad_norm": 9.665373310076777, "learning_rate": 1.99194003699501e-05, "loss": 1.0565, "step": 1088 }, { "epoch": 0.1386885715650222, "grad_norm": 5.37681690823644, "learning_rate": 1.991913882396077e-05, "loss": 0.9501, "step": 1089 }, { "epoch": 0.13881592562522885, "grad_norm": 4.849219402929612, "learning_rate": 1.991887685602255e-05, "loss": 0.9674, "step": 1090 }, { "epoch": 0.13894327968543546, "grad_norm": 3.6688144346749265, "learning_rate": 1.9918614466146586e-05, "loss": 1.0883, "step": 1091 }, { "epoch": 0.1390706337456421, "grad_norm": 6.9702190681083955, "learning_rate": 1.9918351654344033e-05, "loss": 1.0757, "step": 1092 }, { "epoch": 0.13919798780584874, "grad_norm": 4.918950450259129, "learning_rate": 1.991808842062607e-05, "loss": 1.042, "step": 1093 }, { "epoch": 0.13932534186605536, "grad_norm": 4.6766149658767615, "learning_rate": 1.9917824765003905e-05, "loss": 1.0009, "step": 1094 }, { "epoch": 0.139452695926262, "grad_norm": 5.286134165579219, "learning_rate": 1.9917560687488743e-05, "loss": 0.9648, "step": 1095 }, { "epoch": 0.13958004998646864, "grad_norm": 4.8150572484813035, "learning_rate": 1.9917296188091823e-05, "loss": 0.9133, "step": 1096 }, { "epoch": 0.13970740404667525, "grad_norm": 5.7510746490669575, "learning_rate": 1.9917031266824395e-05, "loss": 0.8875, "step": 1097 }, { "epoch": 0.1398347581068819, "grad_norm": 5.203916380671531, "learning_rate": 1.991676592369773e-05, "loss": 0.9772, "step": 1098 }, { "epoch": 0.13996211216708854, "grad_norm": 4.728170729754482, "learning_rate": 1.991650015872311e-05, "loss": 1.087, "step": 1099 }, { "epoch": 0.14008946622729515, "grad_norm": 4.143056479483173, "learning_rate": 1.991623397191185e-05, "loss": 1.0218, "step": 1100 }, { "epoch": 0.1402168202875018, "grad_norm": 4.069189579388974, "learning_rate": 1.9915967363275264e-05, "loss": 0.9937, "step": 1101 }, { "epoch": 0.14034417434770843, "grad_norm": 9.021029169953739, "learning_rate": 1.9915700332824696e-05, "loss": 0.9962, "step": 1102 }, { "epoch": 0.14047152840791505, "grad_norm": 5.2339468254213175, "learning_rate": 1.9915432880571508e-05, "loss": 1.0367, "step": 1103 }, { "epoch": 0.1405988824681217, "grad_norm": 5.262071083693544, "learning_rate": 1.9915165006527076e-05, "loss": 1.0258, "step": 1104 }, { "epoch": 0.14072623652832833, "grad_norm": 5.158737784672319, "learning_rate": 1.991489671070279e-05, "loss": 0.9628, "step": 1105 }, { "epoch": 0.14085359058853494, "grad_norm": 4.923590079715338, "learning_rate": 1.9914627993110072e-05, "loss": 0.9043, "step": 1106 }, { "epoch": 0.14098094464874159, "grad_norm": 5.1952724828111085, "learning_rate": 1.9914358853760346e-05, "loss": 0.9633, "step": 1107 }, { "epoch": 0.14110829870894823, "grad_norm": 4.745809212149704, "learning_rate": 1.9914089292665065e-05, "loss": 0.9792, "step": 1108 }, { "epoch": 0.14123565276915484, "grad_norm": 5.416111413911312, "learning_rate": 1.991381930983569e-05, "loss": 1.0286, "step": 1109 }, { "epoch": 0.14136300682936148, "grad_norm": 5.574909726083418, "learning_rate": 1.9913548905283714e-05, "loss": 1.083, "step": 1110 }, { "epoch": 0.14149036088956812, "grad_norm": 3.9528528655095663, "learning_rate": 1.9913278079020633e-05, "loss": 0.9055, "step": 1111 }, { "epoch": 0.14161771494977474, "grad_norm": 6.877030228906015, "learning_rate": 1.9913006831057967e-05, "loss": 1.0158, "step": 1112 }, { "epoch": 0.14174506900998138, "grad_norm": 5.337862217782534, "learning_rate": 1.9912735161407264e-05, "loss": 1.0859, "step": 1113 }, { "epoch": 0.14187242307018802, "grad_norm": 5.764308872267546, "learning_rate": 1.991246307008007e-05, "loss": 0.9251, "step": 1114 }, { "epoch": 0.14199977713039463, "grad_norm": 5.64803125970419, "learning_rate": 1.9912190557087964e-05, "loss": 0.9771, "step": 1115 }, { "epoch": 0.14212713119060127, "grad_norm": 4.898251762701724, "learning_rate": 1.991191762244254e-05, "loss": 1.0447, "step": 1116 }, { "epoch": 0.14225448525080792, "grad_norm": 6.234584565811258, "learning_rate": 1.9911644266155402e-05, "loss": 1.0607, "step": 1117 }, { "epoch": 0.14238183931101453, "grad_norm": 5.266679154554281, "learning_rate": 1.9911370488238185e-05, "loss": 0.9349, "step": 1118 }, { "epoch": 0.14250919337122117, "grad_norm": 6.256153586880418, "learning_rate": 1.9911096288702532e-05, "loss": 1.2035, "step": 1119 }, { "epoch": 0.14263654743142778, "grad_norm": 6.450600044668659, "learning_rate": 1.9910821667560106e-05, "loss": 1.0081, "step": 1120 }, { "epoch": 0.14276390149163443, "grad_norm": 3.8362862919805774, "learning_rate": 1.9910546624822596e-05, "loss": 1.0396, "step": 1121 }, { "epoch": 0.14289125555184107, "grad_norm": 5.81600157570588, "learning_rate": 1.9910271160501694e-05, "loss": 0.9653, "step": 1122 }, { "epoch": 0.14301860961204768, "grad_norm": 4.879159571085699, "learning_rate": 1.990999527460912e-05, "loss": 0.9589, "step": 1123 }, { "epoch": 0.14314596367225432, "grad_norm": 6.8926005396879795, "learning_rate": 1.990971896715661e-05, "loss": 1.0908, "step": 1124 }, { "epoch": 0.14327331773246096, "grad_norm": 7.01316195540877, "learning_rate": 1.990944223815592e-05, "loss": 1.0536, "step": 1125 }, { "epoch": 0.14340067179266758, "grad_norm": 4.769891348826958, "learning_rate": 1.990916508761882e-05, "loss": 1.0685, "step": 1126 }, { "epoch": 0.14352802585287422, "grad_norm": 5.274988017136401, "learning_rate": 1.9908887515557103e-05, "loss": 0.9612, "step": 1127 }, { "epoch": 0.14365537991308086, "grad_norm": 4.753994918159333, "learning_rate": 1.990860952198257e-05, "loss": 1.0291, "step": 1128 }, { "epoch": 0.14378273397328747, "grad_norm": 4.599970958452185, "learning_rate": 1.990833110690705e-05, "loss": 1.0102, "step": 1129 }, { "epoch": 0.14391008803349412, "grad_norm": 5.262607513561777, "learning_rate": 1.990805227034239e-05, "loss": 0.9505, "step": 1130 }, { "epoch": 0.14403744209370076, "grad_norm": 4.750430438967478, "learning_rate": 1.9907773012300442e-05, "loss": 1.1432, "step": 1131 }, { "epoch": 0.14416479615390737, "grad_norm": 5.5534318022376485, "learning_rate": 1.99074933327931e-05, "loss": 1.0166, "step": 1132 }, { "epoch": 0.144292150214114, "grad_norm": 4.36148381243615, "learning_rate": 1.9907213231832244e-05, "loss": 0.9515, "step": 1133 }, { "epoch": 0.14441950427432065, "grad_norm": 5.396342465843601, "learning_rate": 1.99069327094298e-05, "loss": 0.988, "step": 1134 }, { "epoch": 0.14454685833452727, "grad_norm": 4.160370125444187, "learning_rate": 1.99066517655977e-05, "loss": 0.9402, "step": 1135 }, { "epoch": 0.1446742123947339, "grad_norm": 4.221462705684877, "learning_rate": 1.990637040034789e-05, "loss": 0.923, "step": 1136 }, { "epoch": 0.14480156645494055, "grad_norm": 6.253117203600368, "learning_rate": 1.9906088613692348e-05, "loss": 1.0339, "step": 1137 }, { "epoch": 0.14492892051514716, "grad_norm": 6.017093469305539, "learning_rate": 1.9905806405643053e-05, "loss": 1.0136, "step": 1138 }, { "epoch": 0.1450562745753538, "grad_norm": 4.399259175443589, "learning_rate": 1.990552377621201e-05, "loss": 1.0272, "step": 1139 }, { "epoch": 0.14518362863556045, "grad_norm": 4.547342282958136, "learning_rate": 1.990524072541125e-05, "loss": 0.9937, "step": 1140 }, { "epoch": 0.14531098269576706, "grad_norm": 4.855709152837835, "learning_rate": 1.9904957253252804e-05, "loss": 0.9244, "step": 1141 }, { "epoch": 0.1454383367559737, "grad_norm": 5.5757047814984615, "learning_rate": 1.9904673359748735e-05, "loss": 0.9908, "step": 1142 }, { "epoch": 0.14556569081618034, "grad_norm": 6.043856870540701, "learning_rate": 1.9904389044911122e-05, "loss": 1.0196, "step": 1143 }, { "epoch": 0.14569304487638696, "grad_norm": 4.801416490086993, "learning_rate": 1.9904104308752053e-05, "loss": 1.0553, "step": 1144 }, { "epoch": 0.1458203989365936, "grad_norm": 5.379496091195757, "learning_rate": 1.9903819151283645e-05, "loss": 1.0304, "step": 1145 }, { "epoch": 0.14594775299680024, "grad_norm": 4.545430481586915, "learning_rate": 1.9903533572518026e-05, "loss": 0.9908, "step": 1146 }, { "epoch": 0.14607510705700685, "grad_norm": 5.220752628775305, "learning_rate": 1.9903247572467344e-05, "loss": 1.1251, "step": 1147 }, { "epoch": 0.1462024611172135, "grad_norm": 6.674829190815485, "learning_rate": 1.990296115114377e-05, "loss": 1.0095, "step": 1148 }, { "epoch": 0.14632981517742014, "grad_norm": 5.507004179382678, "learning_rate": 1.9902674308559483e-05, "loss": 1.0024, "step": 1149 }, { "epoch": 0.14645716923762675, "grad_norm": 4.71990972755875, "learning_rate": 1.9902387044726686e-05, "loss": 0.9865, "step": 1150 }, { "epoch": 0.1465845232978334, "grad_norm": 4.478833113678234, "learning_rate": 1.9902099359657597e-05, "loss": 0.9298, "step": 1151 }, { "epoch": 0.14671187735804003, "grad_norm": 6.364648530400351, "learning_rate": 1.9901811253364458e-05, "loss": 1.0776, "step": 1152 }, { "epoch": 0.14683923141824665, "grad_norm": 6.283898853112022, "learning_rate": 1.9901522725859523e-05, "loss": 1.0698, "step": 1153 }, { "epoch": 0.1469665854784533, "grad_norm": 4.844800872017544, "learning_rate": 1.9901233777155062e-05, "loss": 0.8839, "step": 1154 }, { "epoch": 0.14709393953865993, "grad_norm": 6.435025526587445, "learning_rate": 1.9900944407263373e-05, "loss": 1.0899, "step": 1155 }, { "epoch": 0.14722129359886654, "grad_norm": 6.109479898396119, "learning_rate": 1.9900654616196765e-05, "loss": 1.0966, "step": 1156 }, { "epoch": 0.14734864765907318, "grad_norm": 4.981320392872287, "learning_rate": 1.9900364403967555e-05, "loss": 0.9802, "step": 1157 }, { "epoch": 0.14747600171927983, "grad_norm": 6.337664612206646, "learning_rate": 1.9900073770588104e-05, "loss": 0.9783, "step": 1158 }, { "epoch": 0.14760335577948644, "grad_norm": 7.134358467286747, "learning_rate": 1.9899782716070764e-05, "loss": 1.0323, "step": 1159 }, { "epoch": 0.14773070983969308, "grad_norm": 6.583584293781033, "learning_rate": 1.9899491240427917e-05, "loss": 1.0655, "step": 1160 }, { "epoch": 0.14785806389989972, "grad_norm": 7.120906523954685, "learning_rate": 1.9899199343671968e-05, "loss": 1.0935, "step": 1161 }, { "epoch": 0.14798541796010634, "grad_norm": 5.433366980575255, "learning_rate": 1.9898907025815327e-05, "loss": 0.9979, "step": 1162 }, { "epoch": 0.14811277202031298, "grad_norm": 6.2705095615215205, "learning_rate": 1.9898614286870433e-05, "loss": 1.1334, "step": 1163 }, { "epoch": 0.1482401260805196, "grad_norm": 4.831201455882813, "learning_rate": 1.9898321126849743e-05, "loss": 1.0073, "step": 1164 }, { "epoch": 0.14836748014072623, "grad_norm": 4.672358765322368, "learning_rate": 1.9898027545765715e-05, "loss": 1.0821, "step": 1165 }, { "epoch": 0.14849483420093287, "grad_norm": 7.180955909703082, "learning_rate": 1.989773354363085e-05, "loss": 0.9288, "step": 1166 }, { "epoch": 0.1486221882611395, "grad_norm": 4.403280569757165, "learning_rate": 1.989743912045765e-05, "loss": 0.961, "step": 1167 }, { "epoch": 0.14874954232134613, "grad_norm": 8.908893607791116, "learning_rate": 1.9897144276258637e-05, "loss": 1.0988, "step": 1168 }, { "epoch": 0.14887689638155277, "grad_norm": 4.88691858604188, "learning_rate": 1.9896849011046356e-05, "loss": 0.9468, "step": 1169 }, { "epoch": 0.14900425044175938, "grad_norm": 4.439021534715612, "learning_rate": 1.989655332483337e-05, "loss": 1.09, "step": 1170 }, { "epoch": 0.14913160450196603, "grad_norm": 4.584781117513443, "learning_rate": 1.989625721763225e-05, "loss": 1.1378, "step": 1171 }, { "epoch": 0.14925895856217267, "grad_norm": 6.829460380092409, "learning_rate": 1.9895960689455598e-05, "loss": 1.0116, "step": 1172 }, { "epoch": 0.14938631262237928, "grad_norm": 5.925553037691101, "learning_rate": 1.9895663740316027e-05, "loss": 0.9667, "step": 1173 }, { "epoch": 0.14951366668258592, "grad_norm": 5.650634169645478, "learning_rate": 1.9895366370226164e-05, "loss": 1.0056, "step": 1174 }, { "epoch": 0.14964102074279256, "grad_norm": 6.303749288830858, "learning_rate": 1.9895068579198667e-05, "loss": 0.9515, "step": 1175 }, { "epoch": 0.14976837480299918, "grad_norm": 4.734757511813959, "learning_rate": 1.9894770367246197e-05, "loss": 1.0735, "step": 1176 }, { "epoch": 0.14989572886320582, "grad_norm": 6.024659029227881, "learning_rate": 1.9894471734381443e-05, "loss": 1.086, "step": 1177 }, { "epoch": 0.15002308292341246, "grad_norm": 7.097352579325189, "learning_rate": 1.989417268061711e-05, "loss": 0.951, "step": 1178 }, { "epoch": 0.15015043698361907, "grad_norm": 5.378720969428306, "learning_rate": 1.989387320596591e-05, "loss": 1.0527, "step": 1179 }, { "epoch": 0.15027779104382571, "grad_norm": 5.978861447650805, "learning_rate": 1.9893573310440592e-05, "loss": 1.0597, "step": 1180 }, { "epoch": 0.15040514510403236, "grad_norm": 8.018983252708948, "learning_rate": 1.989327299405391e-05, "loss": 0.8077, "step": 1181 }, { "epoch": 0.15053249916423897, "grad_norm": 5.923675182970437, "learning_rate": 1.9892972256818642e-05, "loss": 1.0515, "step": 1182 }, { "epoch": 0.1506598532244456, "grad_norm": 5.0075340729200315, "learning_rate": 1.989267109874758e-05, "loss": 0.9554, "step": 1183 }, { "epoch": 0.15078720728465225, "grad_norm": 4.6758452931338805, "learning_rate": 1.989236951985353e-05, "loss": 0.9426, "step": 1184 }, { "epoch": 0.15091456134485887, "grad_norm": 4.204069010290642, "learning_rate": 1.9892067520149325e-05, "loss": 1.0049, "step": 1185 }, { "epoch": 0.1510419154050655, "grad_norm": 5.048848336214381, "learning_rate": 1.989176509964781e-05, "loss": 1.0377, "step": 1186 }, { "epoch": 0.15116926946527215, "grad_norm": 6.138944051196484, "learning_rate": 1.9891462258361854e-05, "loss": 0.985, "step": 1187 }, { "epoch": 0.15129662352547876, "grad_norm": 5.009658449450558, "learning_rate": 1.9891158996304332e-05, "loss": 1.0061, "step": 1188 }, { "epoch": 0.1514239775856854, "grad_norm": 5.76423574524903, "learning_rate": 1.989085531348815e-05, "loss": 1.1038, "step": 1189 }, { "epoch": 0.15155133164589205, "grad_norm": 22.6411250699605, "learning_rate": 1.9890551209926228e-05, "loss": 1.0009, "step": 1190 }, { "epoch": 0.15167868570609866, "grad_norm": 7.817797032377672, "learning_rate": 1.9890246685631497e-05, "loss": 1.0513, "step": 1191 }, { "epoch": 0.1518060397663053, "grad_norm": 5.7520894110552065, "learning_rate": 1.9889941740616915e-05, "loss": 0.9986, "step": 1192 }, { "epoch": 0.15193339382651194, "grad_norm": 18.699414183880535, "learning_rate": 1.988963637489545e-05, "loss": 1.0264, "step": 1193 }, { "epoch": 0.15206074788671856, "grad_norm": 4.828371109600161, "learning_rate": 1.9889330588480092e-05, "loss": 1.1089, "step": 1194 }, { "epoch": 0.1521881019469252, "grad_norm": 5.636301851353449, "learning_rate": 1.9889024381383853e-05, "loss": 1.1113, "step": 1195 }, { "epoch": 0.15231545600713184, "grad_norm": 6.401306924892736, "learning_rate": 1.9888717753619756e-05, "loss": 1.0345, "step": 1196 }, { "epoch": 0.15244281006733845, "grad_norm": 4.8383120621596785, "learning_rate": 1.988841070520085e-05, "loss": 1.0335, "step": 1197 }, { "epoch": 0.1525701641275451, "grad_norm": 4.596736430265082, "learning_rate": 1.9888103236140187e-05, "loss": 0.9814, "step": 1198 }, { "epoch": 0.15269751818775174, "grad_norm": 5.860190047479529, "learning_rate": 1.988779534645085e-05, "loss": 1.0338, "step": 1199 }, { "epoch": 0.15282487224795835, "grad_norm": 8.045167730751048, "learning_rate": 1.9887487036145942e-05, "loss": 0.9122, "step": 1200 }, { "epoch": 0.152952226308165, "grad_norm": 5.64218312561491, "learning_rate": 1.988717830523857e-05, "loss": 1.0578, "step": 1201 }, { "epoch": 0.15307958036837163, "grad_norm": 5.637619781913279, "learning_rate": 1.9886869153741873e-05, "loss": 1.0015, "step": 1202 }, { "epoch": 0.15320693442857825, "grad_norm": 5.581245639367872, "learning_rate": 1.9886559581669e-05, "loss": 1.0501, "step": 1203 }, { "epoch": 0.1533342884887849, "grad_norm": 4.895855236680835, "learning_rate": 1.9886249589033115e-05, "loss": 1.0239, "step": 1204 }, { "epoch": 0.15346164254899153, "grad_norm": 6.5685870323690345, "learning_rate": 1.988593917584741e-05, "loss": 1.0942, "step": 1205 }, { "epoch": 0.15358899660919814, "grad_norm": 8.102356330141617, "learning_rate": 1.9885628342125093e-05, "loss": 1.093, "step": 1206 }, { "epoch": 0.15371635066940478, "grad_norm": 4.9198943497393985, "learning_rate": 1.9885317087879378e-05, "loss": 1.049, "step": 1207 }, { "epoch": 0.15384370472961142, "grad_norm": 4.840514055806769, "learning_rate": 1.9885005413123515e-05, "loss": 0.9917, "step": 1208 }, { "epoch": 0.15397105878981804, "grad_norm": 5.329710962069987, "learning_rate": 1.9884693317870754e-05, "loss": 0.8854, "step": 1209 }, { "epoch": 0.15409841285002468, "grad_norm": 6.40699250174998, "learning_rate": 1.9884380802134374e-05, "loss": 0.9925, "step": 1210 }, { "epoch": 0.1542257669102313, "grad_norm": 4.920757832972157, "learning_rate": 1.988406786592767e-05, "loss": 0.959, "step": 1211 }, { "epoch": 0.15435312097043793, "grad_norm": 5.794855058912914, "learning_rate": 1.9883754509263952e-05, "loss": 1.1196, "step": 1212 }, { "epoch": 0.15448047503064458, "grad_norm": 5.76384991041534, "learning_rate": 1.9883440732156553e-05, "loss": 1.0452, "step": 1213 }, { "epoch": 0.1546078290908512, "grad_norm": 5.213327114429976, "learning_rate": 1.9883126534618818e-05, "loss": 0.9344, "step": 1214 }, { "epoch": 0.15473518315105783, "grad_norm": 6.551134712996828, "learning_rate": 1.988281191666411e-05, "loss": 1.0484, "step": 1215 }, { "epoch": 0.15486253721126447, "grad_norm": 4.463543399675435, "learning_rate": 1.988249687830582e-05, "loss": 0.9069, "step": 1216 }, { "epoch": 0.1549898912714711, "grad_norm": 4.702098037275322, "learning_rate": 1.9882181419557342e-05, "loss": 1.0329, "step": 1217 }, { "epoch": 0.15511724533167773, "grad_norm": 5.438172923574985, "learning_rate": 1.9881865540432104e-05, "loss": 0.9534, "step": 1218 }, { "epoch": 0.15524459939188437, "grad_norm": 5.295644431518071, "learning_rate": 1.9881549240943533e-05, "loss": 1.0289, "step": 1219 }, { "epoch": 0.15537195345209098, "grad_norm": 4.500410485089305, "learning_rate": 1.988123252110509e-05, "loss": 1.0132, "step": 1220 }, { "epoch": 0.15549930751229762, "grad_norm": 4.585819379334822, "learning_rate": 1.9880915380930245e-05, "loss": 1.0024, "step": 1221 }, { "epoch": 0.15562666157250427, "grad_norm": 7.193516344506464, "learning_rate": 1.9880597820432493e-05, "loss": 0.9475, "step": 1222 }, { "epoch": 0.15575401563271088, "grad_norm": 5.1443825760360244, "learning_rate": 1.988027983962534e-05, "loss": 1.0516, "step": 1223 }, { "epoch": 0.15588136969291752, "grad_norm": 5.090082076634423, "learning_rate": 1.9879961438522312e-05, "loss": 0.966, "step": 1224 }, { "epoch": 0.15600872375312416, "grad_norm": 4.790525527190118, "learning_rate": 1.987964261713695e-05, "loss": 0.8954, "step": 1225 }, { "epoch": 0.15613607781333078, "grad_norm": 5.236927104927023, "learning_rate": 1.9879323375482825e-05, "loss": 1.0287, "step": 1226 }, { "epoch": 0.15626343187353742, "grad_norm": 5.950369610245666, "learning_rate": 1.987900371357351e-05, "loss": 1.0874, "step": 1227 }, { "epoch": 0.15639078593374406, "grad_norm": 5.524607144711156, "learning_rate": 1.9878683631422605e-05, "loss": 0.9406, "step": 1228 }, { "epoch": 0.15651813999395067, "grad_norm": 5.079437180226869, "learning_rate": 1.987836312904373e-05, "loss": 1.039, "step": 1229 }, { "epoch": 0.1566454940541573, "grad_norm": 6.124208048789126, "learning_rate": 1.9878042206450515e-05, "loss": 0.9518, "step": 1230 }, { "epoch": 0.15677284811436396, "grad_norm": 4.603658564682635, "learning_rate": 1.9877720863656605e-05, "loss": 0.9797, "step": 1231 }, { "epoch": 0.15690020217457057, "grad_norm": 5.297661376914641, "learning_rate": 1.9877399100675684e-05, "loss": 1.0236, "step": 1232 }, { "epoch": 0.1570275562347772, "grad_norm": 5.3680721259125255, "learning_rate": 1.987707691752143e-05, "loss": 0.9727, "step": 1233 }, { "epoch": 0.15715491029498385, "grad_norm": 5.842960079883927, "learning_rate": 1.987675431420755e-05, "loss": 0.9885, "step": 1234 }, { "epoch": 0.15728226435519047, "grad_norm": 3.570953778995695, "learning_rate": 1.9876431290747766e-05, "loss": 0.9444, "step": 1235 }, { "epoch": 0.1574096184153971, "grad_norm": 4.451910556028384, "learning_rate": 1.987610784715582e-05, "loss": 0.9464, "step": 1236 }, { "epoch": 0.15753697247560375, "grad_norm": 5.669527939617677, "learning_rate": 1.9875783983445473e-05, "loss": 0.9582, "step": 1237 }, { "epoch": 0.15766432653581036, "grad_norm": 5.212123844470004, "learning_rate": 1.9875459699630503e-05, "loss": 0.9536, "step": 1238 }, { "epoch": 0.157791680596017, "grad_norm": 5.681033304276333, "learning_rate": 1.9875134995724697e-05, "loss": 0.9886, "step": 1239 }, { "epoch": 0.15791903465622364, "grad_norm": 4.781488350306441, "learning_rate": 1.9874809871741877e-05, "loss": 1.0747, "step": 1240 }, { "epoch": 0.15804638871643026, "grad_norm": 5.27408922477416, "learning_rate": 1.9874484327695862e-05, "loss": 1.0797, "step": 1241 }, { "epoch": 0.1581737427766369, "grad_norm": 5.110825638930353, "learning_rate": 1.9874158363600513e-05, "loss": 0.9206, "step": 1242 }, { "epoch": 0.15830109683684354, "grad_norm": 5.286355163870387, "learning_rate": 1.9873831979469687e-05, "loss": 0.9544, "step": 1243 }, { "epoch": 0.15842845089705015, "grad_norm": 5.876306266847982, "learning_rate": 1.9873505175317272e-05, "loss": 0.9467, "step": 1244 }, { "epoch": 0.1585558049572568, "grad_norm": 3.8290624599508156, "learning_rate": 1.987317795115717e-05, "loss": 0.99, "step": 1245 }, { "epoch": 0.15868315901746344, "grad_norm": 6.334550171788543, "learning_rate": 1.98728503070033e-05, "loss": 1.0207, "step": 1246 }, { "epoch": 0.15881051307767005, "grad_norm": 5.201014001722949, "learning_rate": 1.9872522242869598e-05, "loss": 1.0032, "step": 1247 }, { "epoch": 0.1589378671378767, "grad_norm": 5.339655506580464, "learning_rate": 1.987219375877002e-05, "loss": 1.0571, "step": 1248 }, { "epoch": 0.15906522119808333, "grad_norm": 5.917438223951082, "learning_rate": 1.9871864854718545e-05, "loss": 0.9298, "step": 1249 }, { "epoch": 0.15919257525828995, "grad_norm": 5.84159509430854, "learning_rate": 1.9871535530729154e-05, "loss": 1.1235, "step": 1250 }, { "epoch": 0.1593199293184966, "grad_norm": 6.647444797729966, "learning_rate": 1.9871205786815865e-05, "loss": 1.0961, "step": 1251 }, { "epoch": 0.15944728337870323, "grad_norm": 4.506879845567531, "learning_rate": 1.9870875622992697e-05, "loss": 1.0051, "step": 1252 }, { "epoch": 0.15957463743890984, "grad_norm": 4.978751044171506, "learning_rate": 1.9870545039273704e-05, "loss": 0.9519, "step": 1253 }, { "epoch": 0.15970199149911649, "grad_norm": 4.884386465067718, "learning_rate": 1.9870214035672945e-05, "loss": 1.0405, "step": 1254 }, { "epoch": 0.1598293455593231, "grad_norm": 6.01301752032264, "learning_rate": 1.9869882612204496e-05, "loss": 0.9768, "step": 1255 }, { "epoch": 0.15995669961952974, "grad_norm": 5.66422076826293, "learning_rate": 1.986955076888246e-05, "loss": 0.8278, "step": 1256 }, { "epoch": 0.16008405367973638, "grad_norm": 6.38059333314379, "learning_rate": 1.986921850572095e-05, "loss": 0.9132, "step": 1257 }, { "epoch": 0.160211407739943, "grad_norm": 5.136288561774867, "learning_rate": 1.9868885822734104e-05, "loss": 1.0093, "step": 1258 }, { "epoch": 0.16033876180014964, "grad_norm": 6.1340803451624435, "learning_rate": 1.986855271993607e-05, "loss": 0.9994, "step": 1259 }, { "epoch": 0.16046611586035628, "grad_norm": 6.610357838234759, "learning_rate": 1.9868219197341024e-05, "loss": 0.8919, "step": 1260 }, { "epoch": 0.1605934699205629, "grad_norm": 5.068832393028328, "learning_rate": 1.9867885254963147e-05, "loss": 0.958, "step": 1261 }, { "epoch": 0.16072082398076953, "grad_norm": 5.098652919809279, "learning_rate": 1.9867550892816646e-05, "loss": 1.0166, "step": 1262 }, { "epoch": 0.16084817804097618, "grad_norm": 5.54546086727413, "learning_rate": 1.9867216110915745e-05, "loss": 0.9558, "step": 1263 }, { "epoch": 0.1609755321011828, "grad_norm": 3.6506732894506726, "learning_rate": 1.986688090927469e-05, "loss": 0.9749, "step": 1264 }, { "epoch": 0.16110288616138943, "grad_norm": 6.630011336603153, "learning_rate": 1.9866545287907732e-05, "loss": 1.087, "step": 1265 }, { "epoch": 0.16123024022159607, "grad_norm": 5.1520144550405025, "learning_rate": 1.9866209246829152e-05, "loss": 1.0564, "step": 1266 }, { "epoch": 0.16135759428180269, "grad_norm": 4.567456623073952, "learning_rate": 1.9865872786053245e-05, "loss": 0.9943, "step": 1267 }, { "epoch": 0.16148494834200933, "grad_norm": 5.429662910241082, "learning_rate": 1.9865535905594326e-05, "loss": 1.0046, "step": 1268 }, { "epoch": 0.16161230240221597, "grad_norm": 4.745719989140955, "learning_rate": 1.986519860546672e-05, "loss": 0.9511, "step": 1269 }, { "epoch": 0.16173965646242258, "grad_norm": 4.907924293524879, "learning_rate": 1.986486088568478e-05, "loss": 0.947, "step": 1270 }, { "epoch": 0.16186701052262922, "grad_norm": 5.93928420715077, "learning_rate": 1.9864522746262867e-05, "loss": 1.0879, "step": 1271 }, { "epoch": 0.16199436458283586, "grad_norm": 4.6692044079111, "learning_rate": 1.986418418721537e-05, "loss": 0.9495, "step": 1272 }, { "epoch": 0.16212171864304248, "grad_norm": 6.435001221651496, "learning_rate": 1.986384520855669e-05, "loss": 0.9653, "step": 1273 }, { "epoch": 0.16224907270324912, "grad_norm": 4.8527476756639745, "learning_rate": 1.9863505810301246e-05, "loss": 1.0149, "step": 1274 }, { "epoch": 0.16237642676345576, "grad_norm": 5.268659764215143, "learning_rate": 1.9863165992463477e-05, "loss": 0.988, "step": 1275 }, { "epoch": 0.16250378082366237, "grad_norm": 6.928202404369275, "learning_rate": 1.986282575505783e-05, "loss": 1.0151, "step": 1276 }, { "epoch": 0.16263113488386902, "grad_norm": 5.708832025208047, "learning_rate": 1.9862485098098796e-05, "loss": 0.986, "step": 1277 }, { "epoch": 0.16275848894407566, "grad_norm": 4.293735314223316, "learning_rate": 1.986214402160085e-05, "loss": 0.9516, "step": 1278 }, { "epoch": 0.16288584300428227, "grad_norm": 5.128487865527736, "learning_rate": 1.9861802525578508e-05, "loss": 0.9776, "step": 1279 }, { "epoch": 0.1630131970644889, "grad_norm": 5.926519580527757, "learning_rate": 1.986146061004629e-05, "loss": 0.9454, "step": 1280 }, { "epoch": 0.16314055112469555, "grad_norm": 4.709791762168808, "learning_rate": 1.9861118275018755e-05, "loss": 1.055, "step": 1281 }, { "epoch": 0.16326790518490217, "grad_norm": 3.9447872001005515, "learning_rate": 1.9860775520510453e-05, "loss": 1.0019, "step": 1282 }, { "epoch": 0.1633952592451088, "grad_norm": 4.657867150816275, "learning_rate": 1.9860432346535966e-05, "loss": 0.9158, "step": 1283 }, { "epoch": 0.16352261330531545, "grad_norm": 6.558586770395333, "learning_rate": 1.9860088753109896e-05, "loss": 1.0146, "step": 1284 }, { "epoch": 0.16364996736552206, "grad_norm": 6.790804131467284, "learning_rate": 1.985974474024686e-05, "loss": 1.1179, "step": 1285 }, { "epoch": 0.1637773214257287, "grad_norm": 4.380667149497051, "learning_rate": 1.9859400307961486e-05, "loss": 1.029, "step": 1286 }, { "epoch": 0.16390467548593535, "grad_norm": 6.457342584561743, "learning_rate": 1.985905545626843e-05, "loss": 1.0326, "step": 1287 }, { "epoch": 0.16403202954614196, "grad_norm": 4.491102535418511, "learning_rate": 1.985871018518236e-05, "loss": 1.0289, "step": 1288 }, { "epoch": 0.1641593836063486, "grad_norm": 5.129222154149322, "learning_rate": 1.9858364494717966e-05, "loss": 1.0228, "step": 1289 }, { "epoch": 0.16428673766655524, "grad_norm": 5.162285434060186, "learning_rate": 1.9858018384889946e-05, "loss": 0.9045, "step": 1290 }, { "epoch": 0.16441409172676186, "grad_norm": 5.6943603054539915, "learning_rate": 1.9857671855713038e-05, "loss": 0.9674, "step": 1291 }, { "epoch": 0.1645414457869685, "grad_norm": 5.6519327284473695, "learning_rate": 1.9857324907201966e-05, "loss": 0.9993, "step": 1292 }, { "epoch": 0.16466879984717514, "grad_norm": 5.529623880960854, "learning_rate": 1.98569775393715e-05, "loss": 0.973, "step": 1293 }, { "epoch": 0.16479615390738175, "grad_norm": 4.634977146985319, "learning_rate": 1.9856629752236413e-05, "loss": 1.0061, "step": 1294 }, { "epoch": 0.1649235079675884, "grad_norm": 5.395211300534669, "learning_rate": 1.9856281545811497e-05, "loss": 0.9469, "step": 1295 }, { "epoch": 0.16505086202779504, "grad_norm": 4.268896392501476, "learning_rate": 1.9855932920111563e-05, "loss": 0.9921, "step": 1296 }, { "epoch": 0.16517821608800165, "grad_norm": 7.711568727329617, "learning_rate": 1.9855583875151453e-05, "loss": 0.9872, "step": 1297 }, { "epoch": 0.1653055701482083, "grad_norm": 5.241036528942064, "learning_rate": 1.9855234410946002e-05, "loss": 0.9546, "step": 1298 }, { "epoch": 0.16543292420841493, "grad_norm": 6.764661337078974, "learning_rate": 1.985488452751008e-05, "loss": 0.9521, "step": 1299 }, { "epoch": 0.16556027826862155, "grad_norm": 6.087105126077417, "learning_rate": 1.9854534224858574e-05, "loss": 1.0039, "step": 1300 }, { "epoch": 0.1656876323288282, "grad_norm": 5.684029327166587, "learning_rate": 1.9854183503006383e-05, "loss": 1.082, "step": 1301 }, { "epoch": 0.1658149863890348, "grad_norm": 4.707341632660841, "learning_rate": 1.9853832361968424e-05, "loss": 0.9413, "step": 1302 }, { "epoch": 0.16594234044924144, "grad_norm": 5.88976947430451, "learning_rate": 1.9853480801759637e-05, "loss": 1.0828, "step": 1303 }, { "epoch": 0.16606969450944808, "grad_norm": 4.565661654367643, "learning_rate": 1.9853128822394976e-05, "loss": 1.0482, "step": 1304 }, { "epoch": 0.1661970485696547, "grad_norm": 4.481935482188907, "learning_rate": 1.9852776423889414e-05, "loss": 1.1119, "step": 1305 }, { "epoch": 0.16632440262986134, "grad_norm": 11.635677561258317, "learning_rate": 1.9852423606257943e-05, "loss": 0.9503, "step": 1306 }, { "epoch": 0.16645175669006798, "grad_norm": 7.3166335722827265, "learning_rate": 1.9852070369515566e-05, "loss": 1.0055, "step": 1307 }, { "epoch": 0.1665791107502746, "grad_norm": 5.1385735125753165, "learning_rate": 1.9851716713677315e-05, "loss": 0.9379, "step": 1308 }, { "epoch": 0.16670646481048124, "grad_norm": 4.744954692274551, "learning_rate": 1.9851362638758236e-05, "loss": 1.0048, "step": 1309 }, { "epoch": 0.16683381887068788, "grad_norm": 4.680360774292839, "learning_rate": 1.9851008144773386e-05, "loss": 0.9329, "step": 1310 }, { "epoch": 0.1669611729308945, "grad_norm": 5.54574530474556, "learning_rate": 1.9850653231737844e-05, "loss": 1.0084, "step": 1311 }, { "epoch": 0.16708852699110113, "grad_norm": 5.629096341017364, "learning_rate": 1.985029789966671e-05, "loss": 0.9647, "step": 1312 }, { "epoch": 0.16721588105130777, "grad_norm": 4.247339201112461, "learning_rate": 1.98499421485751e-05, "loss": 1.018, "step": 1313 }, { "epoch": 0.1673432351115144, "grad_norm": 4.8500695135109835, "learning_rate": 1.984958597847815e-05, "loss": 1.0345, "step": 1314 }, { "epoch": 0.16747058917172103, "grad_norm": 5.242236255129088, "learning_rate": 1.9849229389391e-05, "loss": 1.0124, "step": 1315 }, { "epoch": 0.16759794323192767, "grad_norm": 4.494156115860417, "learning_rate": 1.984887238132883e-05, "loss": 0.9282, "step": 1316 }, { "epoch": 0.16772529729213428, "grad_norm": 5.527577705958565, "learning_rate": 1.9848514954306827e-05, "loss": 0.9766, "step": 1317 }, { "epoch": 0.16785265135234093, "grad_norm": 7.153181059331445, "learning_rate": 1.9848157108340186e-05, "loss": 0.9793, "step": 1318 }, { "epoch": 0.16798000541254757, "grad_norm": 4.274232333675409, "learning_rate": 1.984779884344414e-05, "loss": 0.8939, "step": 1319 }, { "epoch": 0.16810735947275418, "grad_norm": 6.091392647998473, "learning_rate": 1.9847440159633918e-05, "loss": 1.0784, "step": 1320 }, { "epoch": 0.16823471353296082, "grad_norm": 4.3504516717848, "learning_rate": 1.9847081056924788e-05, "loss": 1.0399, "step": 1321 }, { "epoch": 0.16836206759316746, "grad_norm": 5.874738646842187, "learning_rate": 1.984672153533202e-05, "loss": 1.0156, "step": 1322 }, { "epoch": 0.16848942165337408, "grad_norm": 6.273024606003035, "learning_rate": 1.9846361594870914e-05, "loss": 1.0532, "step": 1323 }, { "epoch": 0.16861677571358072, "grad_norm": 9.43738116258996, "learning_rate": 1.9846001235556775e-05, "loss": 0.9758, "step": 1324 }, { "epoch": 0.16874412977378736, "grad_norm": 3.951354463349465, "learning_rate": 1.984564045740493e-05, "loss": 1.0017, "step": 1325 }, { "epoch": 0.16887148383399397, "grad_norm": 6.057988641405209, "learning_rate": 1.984527926043074e-05, "loss": 1.0013, "step": 1326 }, { "epoch": 0.16899883789420062, "grad_norm": 5.121047938510769, "learning_rate": 1.9844917644649553e-05, "loss": 1.0206, "step": 1327 }, { "epoch": 0.16912619195440726, "grad_norm": 6.34730347577412, "learning_rate": 1.984455561007676e-05, "loss": 1.0719, "step": 1328 }, { "epoch": 0.16925354601461387, "grad_norm": 6.9857632411586525, "learning_rate": 1.984419315672776e-05, "loss": 1.1088, "step": 1329 }, { "epoch": 0.1693809000748205, "grad_norm": 4.054283636482987, "learning_rate": 1.9843830284617975e-05, "loss": 0.9645, "step": 1330 }, { "epoch": 0.16950825413502715, "grad_norm": 5.763261181254879, "learning_rate": 1.9843466993762836e-05, "loss": 1.0112, "step": 1331 }, { "epoch": 0.16963560819523377, "grad_norm": 5.006710508504241, "learning_rate": 1.98431032841778e-05, "loss": 1.0243, "step": 1332 }, { "epoch": 0.1697629622554404, "grad_norm": 5.33453967438594, "learning_rate": 1.9842739155878337e-05, "loss": 1.0415, "step": 1333 }, { "epoch": 0.16989031631564705, "grad_norm": 4.239488505334722, "learning_rate": 1.984237460887994e-05, "loss": 1.0452, "step": 1334 }, { "epoch": 0.17001767037585366, "grad_norm": 8.172692340070634, "learning_rate": 1.9842009643198113e-05, "loss": 0.9793, "step": 1335 }, { "epoch": 0.1701450244360603, "grad_norm": 5.34305572461287, "learning_rate": 1.984164425884838e-05, "loss": 1.0048, "step": 1336 }, { "epoch": 0.17027237849626695, "grad_norm": 5.792797865504691, "learning_rate": 1.984127845584629e-05, "loss": 0.9795, "step": 1337 }, { "epoch": 0.17039973255647356, "grad_norm": 4.318874126588256, "learning_rate": 1.9840912234207396e-05, "loss": 0.9736, "step": 1338 }, { "epoch": 0.1705270866166802, "grad_norm": 4.162413651770721, "learning_rate": 1.9840545593947286e-05, "loss": 1.0151, "step": 1339 }, { "epoch": 0.17065444067688684, "grad_norm": 6.5925908086697325, "learning_rate": 1.9840178535081548e-05, "loss": 0.904, "step": 1340 }, { "epoch": 0.17078179473709346, "grad_norm": 5.536949712479812, "learning_rate": 1.98398110576258e-05, "loss": 1.0522, "step": 1341 }, { "epoch": 0.1709091487973001, "grad_norm": 7.280052774258995, "learning_rate": 1.9839443161595668e-05, "loss": 1.0073, "step": 1342 }, { "epoch": 0.17103650285750674, "grad_norm": 5.838656403647462, "learning_rate": 1.9839074847006815e-05, "loss": 0.9766, "step": 1343 }, { "epoch": 0.17116385691771335, "grad_norm": 6.120541876944528, "learning_rate": 1.9838706113874896e-05, "loss": 1.003, "step": 1344 }, { "epoch": 0.17129121097792, "grad_norm": 6.272822405730764, "learning_rate": 1.9838336962215606e-05, "loss": 0.9179, "step": 1345 }, { "epoch": 0.1714185650381266, "grad_norm": 4.571611477394392, "learning_rate": 1.983796739204464e-05, "loss": 1.1381, "step": 1346 }, { "epoch": 0.17154591909833325, "grad_norm": 5.164721379109247, "learning_rate": 1.9837597403377726e-05, "loss": 1.0764, "step": 1347 }, { "epoch": 0.1716732731585399, "grad_norm": 5.382269800120944, "learning_rate": 1.98372269962306e-05, "loss": 1.099, "step": 1348 }, { "epoch": 0.1718006272187465, "grad_norm": 5.6010520628007985, "learning_rate": 1.9836856170619018e-05, "loss": 0.8125, "step": 1349 }, { "epoch": 0.17192798127895315, "grad_norm": 6.5416130825043, "learning_rate": 1.983648492655875e-05, "loss": 0.9736, "step": 1350 }, { "epoch": 0.1720553353391598, "grad_norm": 5.986232857622864, "learning_rate": 1.9836113264065598e-05, "loss": 1.1308, "step": 1351 }, { "epoch": 0.1721826893993664, "grad_norm": 3.721914787944601, "learning_rate": 1.983574118315537e-05, "loss": 0.9768, "step": 1352 }, { "epoch": 0.17231004345957304, "grad_norm": 5.081106208746815, "learning_rate": 1.983536868384389e-05, "loss": 1.0861, "step": 1353 }, { "epoch": 0.17243739751977968, "grad_norm": 4.766787127771113, "learning_rate": 1.9834995766147e-05, "loss": 0.9764, "step": 1354 }, { "epoch": 0.1725647515799863, "grad_norm": 5.844392628957778, "learning_rate": 1.9834622430080574e-05, "loss": 0.9675, "step": 1355 }, { "epoch": 0.17269210564019294, "grad_norm": 4.266773111287161, "learning_rate": 1.9834248675660484e-05, "loss": 0.9577, "step": 1356 }, { "epoch": 0.17281945970039958, "grad_norm": 5.339045366544977, "learning_rate": 1.9833874502902636e-05, "loss": 1.0691, "step": 1357 }, { "epoch": 0.1729468137606062, "grad_norm": 5.321239149867997, "learning_rate": 1.9833499911822944e-05, "loss": 1.02, "step": 1358 }, { "epoch": 0.17307416782081284, "grad_norm": 6.036001912839067, "learning_rate": 1.983312490243734e-05, "loss": 1.0136, "step": 1359 }, { "epoch": 0.17320152188101948, "grad_norm": 5.842316966880855, "learning_rate": 1.9832749474761782e-05, "loss": 0.9487, "step": 1360 }, { "epoch": 0.1733288759412261, "grad_norm": 5.268133726086745, "learning_rate": 1.9832373628812235e-05, "loss": 0.897, "step": 1361 }, { "epoch": 0.17345623000143273, "grad_norm": 10.765103043746514, "learning_rate": 1.9831997364604693e-05, "loss": 1.057, "step": 1362 }, { "epoch": 0.17358358406163937, "grad_norm": 6.986459988359291, "learning_rate": 1.983162068215515e-05, "loss": 0.984, "step": 1363 }, { "epoch": 0.173710938121846, "grad_norm": 6.54607260801127, "learning_rate": 1.9831243581479643e-05, "loss": 1.1114, "step": 1364 }, { "epoch": 0.17383829218205263, "grad_norm": 4.736502694914325, "learning_rate": 1.983086606259421e-05, "loss": 0.9659, "step": 1365 }, { "epoch": 0.17396564624225927, "grad_norm": 5.3548235459378, "learning_rate": 1.9830488125514907e-05, "loss": 0.9164, "step": 1366 }, { "epoch": 0.17409300030246588, "grad_norm": 5.012251530527317, "learning_rate": 1.983010977025781e-05, "loss": 1.0724, "step": 1367 }, { "epoch": 0.17422035436267252, "grad_norm": 4.104810144678136, "learning_rate": 1.982973099683902e-05, "loss": 1.0126, "step": 1368 }, { "epoch": 0.17434770842287917, "grad_norm": 4.606415189475125, "learning_rate": 1.9829351805274643e-05, "loss": 0.9828, "step": 1369 }, { "epoch": 0.17447506248308578, "grad_norm": 4.9605551264425625, "learning_rate": 1.9828972195580815e-05, "loss": 1.0372, "step": 1370 }, { "epoch": 0.17460241654329242, "grad_norm": 6.515749118272252, "learning_rate": 1.9828592167773676e-05, "loss": 0.9049, "step": 1371 }, { "epoch": 0.17472977060349906, "grad_norm": 5.611944008796001, "learning_rate": 1.9828211721869404e-05, "loss": 1.0385, "step": 1372 }, { "epoch": 0.17485712466370568, "grad_norm": 4.296467265811125, "learning_rate": 1.9827830857884173e-05, "loss": 1.0467, "step": 1373 }, { "epoch": 0.17498447872391232, "grad_norm": 5.582847839807333, "learning_rate": 1.9827449575834187e-05, "loss": 0.9349, "step": 1374 }, { "epoch": 0.17511183278411896, "grad_norm": 5.693181480389708, "learning_rate": 1.9827067875735667e-05, "loss": 0.8793, "step": 1375 }, { "epoch": 0.17523918684432557, "grad_norm": 5.261651325431034, "learning_rate": 1.982668575760485e-05, "loss": 0.9581, "step": 1376 }, { "epoch": 0.17536654090453221, "grad_norm": 4.977291802306023, "learning_rate": 1.982630322145799e-05, "loss": 0.9355, "step": 1377 }, { "epoch": 0.17549389496473886, "grad_norm": 8.424306090442627, "learning_rate": 1.9825920267311358e-05, "loss": 1.0181, "step": 1378 }, { "epoch": 0.17562124902494547, "grad_norm": 4.067377536948859, "learning_rate": 1.9825536895181245e-05, "loss": 0.9847, "step": 1379 }, { "epoch": 0.1757486030851521, "grad_norm": 5.434342969610268, "learning_rate": 1.982515310508396e-05, "loss": 1.0621, "step": 1380 }, { "epoch": 0.17587595714535875, "grad_norm": 3.951322725200578, "learning_rate": 1.9824768897035833e-05, "loss": 0.9415, "step": 1381 }, { "epoch": 0.17600331120556537, "grad_norm": 4.20572669754507, "learning_rate": 1.98243842710532e-05, "loss": 0.9017, "step": 1382 }, { "epoch": 0.176130665265772, "grad_norm": 5.642519645876438, "learning_rate": 1.9823999227152426e-05, "loss": 0.9919, "step": 1383 }, { "epoch": 0.17625801932597865, "grad_norm": 5.8060276543263845, "learning_rate": 1.9823613765349894e-05, "loss": 0.9357, "step": 1384 }, { "epoch": 0.17638537338618526, "grad_norm": 6.730220495639194, "learning_rate": 1.9823227885661994e-05, "loss": 1.0167, "step": 1385 }, { "epoch": 0.1765127274463919, "grad_norm": 4.98295281189637, "learning_rate": 1.982284158810515e-05, "loss": 0.9644, "step": 1386 }, { "epoch": 0.17664008150659855, "grad_norm": 5.611349539312602, "learning_rate": 1.982245487269579e-05, "loss": 0.9759, "step": 1387 }, { "epoch": 0.17676743556680516, "grad_norm": 5.830986568146951, "learning_rate": 1.982206773945036e-05, "loss": 0.9505, "step": 1388 }, { "epoch": 0.1768947896270118, "grad_norm": 4.7729098313991845, "learning_rate": 1.9821680188385334e-05, "loss": 1.0816, "step": 1389 }, { "epoch": 0.17702214368721844, "grad_norm": 4.199991298848176, "learning_rate": 1.982129221951719e-05, "loss": 1.1747, "step": 1390 }, { "epoch": 0.17714949774742506, "grad_norm": 5.562743963827968, "learning_rate": 1.9820903832862445e-05, "loss": 1.0474, "step": 1391 }, { "epoch": 0.1772768518076317, "grad_norm": 6.610761492520081, "learning_rate": 1.9820515028437612e-05, "loss": 1.0511, "step": 1392 }, { "epoch": 0.1774042058678383, "grad_norm": 4.700877720465981, "learning_rate": 1.9820125806259233e-05, "loss": 1.034, "step": 1393 }, { "epoch": 0.17753155992804495, "grad_norm": 5.507829641761889, "learning_rate": 1.981973616634386e-05, "loss": 0.9607, "step": 1394 }, { "epoch": 0.1776589139882516, "grad_norm": 4.55152367032817, "learning_rate": 1.9819346108708074e-05, "loss": 1.0074, "step": 1395 }, { "epoch": 0.1777862680484582, "grad_norm": 5.155387667593372, "learning_rate": 1.9818955633368464e-05, "loss": 0.9597, "step": 1396 }, { "epoch": 0.17791362210866485, "grad_norm": 5.73277489064217, "learning_rate": 1.981856474034164e-05, "loss": 1.0927, "step": 1397 }, { "epoch": 0.1780409761688715, "grad_norm": 4.071601648882491, "learning_rate": 1.9818173429644237e-05, "loss": 0.9982, "step": 1398 }, { "epoch": 0.1781683302290781, "grad_norm": 3.9397300581268513, "learning_rate": 1.9817781701292892e-05, "loss": 0.9403, "step": 1399 }, { "epoch": 0.17829568428928474, "grad_norm": 4.89777375854755, "learning_rate": 1.9817389555304274e-05, "loss": 0.9848, "step": 1400 }, { "epoch": 0.1784230383494914, "grad_norm": 5.384909433154051, "learning_rate": 1.9816996991695057e-05, "loss": 1.0004, "step": 1401 }, { "epoch": 0.178550392409698, "grad_norm": 6.171274817534474, "learning_rate": 1.9816604010481955e-05, "loss": 0.9742, "step": 1402 }, { "epoch": 0.17867774646990464, "grad_norm": 5.372669158161206, "learning_rate": 1.981621061168167e-05, "loss": 1.0258, "step": 1403 }, { "epoch": 0.17880510053011128, "grad_norm": 5.42572394959394, "learning_rate": 1.9815816795310945e-05, "loss": 0.8953, "step": 1404 }, { "epoch": 0.1789324545903179, "grad_norm": 4.888856025546814, "learning_rate": 1.981542256138653e-05, "loss": 0.9233, "step": 1405 }, { "epoch": 0.17905980865052454, "grad_norm": 5.561367305342423, "learning_rate": 1.9815027909925194e-05, "loss": 0.9601, "step": 1406 }, { "epoch": 0.17918716271073118, "grad_norm": 4.272674659833029, "learning_rate": 1.9814632840943728e-05, "loss": 1.0174, "step": 1407 }, { "epoch": 0.1793145167709378, "grad_norm": 4.094915326073295, "learning_rate": 1.9814237354458937e-05, "loss": 0.9695, "step": 1408 }, { "epoch": 0.17944187083114443, "grad_norm": 4.905545323784712, "learning_rate": 1.981384145048764e-05, "loss": 0.929, "step": 1409 }, { "epoch": 0.17956922489135108, "grad_norm": 4.367058003097394, "learning_rate": 1.9813445129046685e-05, "loss": 0.9274, "step": 1410 }, { "epoch": 0.1796965789515577, "grad_norm": 6.3573246680356394, "learning_rate": 1.9813048390152926e-05, "loss": 1.0155, "step": 1411 }, { "epoch": 0.17982393301176433, "grad_norm": 6.87006575640397, "learning_rate": 1.9812651233823245e-05, "loss": 0.9062, "step": 1412 }, { "epoch": 0.17995128707197097, "grad_norm": 4.697870202127253, "learning_rate": 1.9812253660074532e-05, "loss": 1.0905, "step": 1413 }, { "epoch": 0.18007864113217759, "grad_norm": 5.959781639656172, "learning_rate": 1.98118556689237e-05, "loss": 1.119, "step": 1414 }, { "epoch": 0.18020599519238423, "grad_norm": 4.111138375223385, "learning_rate": 1.9811457260387683e-05, "loss": 0.923, "step": 1415 }, { "epoch": 0.18033334925259087, "grad_norm": 4.006177185634729, "learning_rate": 1.9811058434483422e-05, "loss": 1.0849, "step": 1416 }, { "epoch": 0.18046070331279748, "grad_norm": 4.48631303225857, "learning_rate": 1.981065919122789e-05, "loss": 1.0728, "step": 1417 }, { "epoch": 0.18058805737300412, "grad_norm": 6.525586168407294, "learning_rate": 1.9810259530638064e-05, "loss": 1.0408, "step": 1418 }, { "epoch": 0.18071541143321077, "grad_norm": 6.020659799505925, "learning_rate": 1.980985945273095e-05, "loss": 1.0975, "step": 1419 }, { "epoch": 0.18084276549341738, "grad_norm": 6.885051800346678, "learning_rate": 1.9809458957523563e-05, "loss": 0.9118, "step": 1420 }, { "epoch": 0.18097011955362402, "grad_norm": 3.859705119358011, "learning_rate": 1.9809058045032942e-05, "loss": 1.0455, "step": 1421 }, { "epoch": 0.18109747361383066, "grad_norm": 5.034443854459602, "learning_rate": 1.980865671527614e-05, "loss": 0.9053, "step": 1422 }, { "epoch": 0.18122482767403728, "grad_norm": 4.970510494794013, "learning_rate": 1.9808254968270236e-05, "loss": 1.0684, "step": 1423 }, { "epoch": 0.18135218173424392, "grad_norm": 6.993515553435381, "learning_rate": 1.9807852804032306e-05, "loss": 1.0819, "step": 1424 }, { "epoch": 0.18147953579445056, "grad_norm": 4.3244256139826405, "learning_rate": 1.980745022257947e-05, "loss": 1.0832, "step": 1425 }, { "epoch": 0.18160688985465717, "grad_norm": 5.41282386427537, "learning_rate": 1.9807047223928847e-05, "loss": 1.0109, "step": 1426 }, { "epoch": 0.1817342439148638, "grad_norm": 5.5849581199995075, "learning_rate": 1.980664380809758e-05, "loss": 0.9701, "step": 1427 }, { "epoch": 0.18186159797507045, "grad_norm": 7.5150214763849705, "learning_rate": 1.980623997510283e-05, "loss": 1.1214, "step": 1428 }, { "epoch": 0.18198895203527707, "grad_norm": 5.378983019888339, "learning_rate": 1.9805835724961783e-05, "loss": 1.0311, "step": 1429 }, { "epoch": 0.1821163060954837, "grad_norm": 5.328501578975879, "learning_rate": 1.9805431057691627e-05, "loss": 1.089, "step": 1430 }, { "epoch": 0.18224366015569035, "grad_norm": 4.827672449525467, "learning_rate": 1.9805025973309577e-05, "loss": 0.9638, "step": 1431 }, { "epoch": 0.18237101421589696, "grad_norm": 4.960158495677945, "learning_rate": 1.980462047183287e-05, "loss": 0.994, "step": 1432 }, { "epoch": 0.1824983682761036, "grad_norm": 4.417898594090962, "learning_rate": 1.9804214553278747e-05, "loss": 0.9635, "step": 1433 }, { "epoch": 0.18262572233631025, "grad_norm": 4.3511745752527755, "learning_rate": 1.9803808217664483e-05, "loss": 0.9231, "step": 1434 }, { "epoch": 0.18275307639651686, "grad_norm": 7.574369841195433, "learning_rate": 1.9803401465007363e-05, "loss": 0.9365, "step": 1435 }, { "epoch": 0.1828804304567235, "grad_norm": 4.817720034473403, "learning_rate": 1.9802994295324685e-05, "loss": 0.9657, "step": 1436 }, { "epoch": 0.18300778451693012, "grad_norm": 3.9337316979442223, "learning_rate": 1.980258670863377e-05, "loss": 1.0757, "step": 1437 }, { "epoch": 0.18313513857713676, "grad_norm": 4.912953650419165, "learning_rate": 1.980217870495196e-05, "loss": 0.8668, "step": 1438 }, { "epoch": 0.1832624926373434, "grad_norm": 6.3601703568201255, "learning_rate": 1.980177028429661e-05, "loss": 1.0216, "step": 1439 }, { "epoch": 0.18338984669755, "grad_norm": 4.361051722376627, "learning_rate": 1.980136144668509e-05, "loss": 1.0007, "step": 1440 }, { "epoch": 0.18351720075775665, "grad_norm": 4.831064656441857, "learning_rate": 1.98009521921348e-05, "loss": 1.0569, "step": 1441 }, { "epoch": 0.1836445548179633, "grad_norm": 4.673866077337553, "learning_rate": 1.9800542520663136e-05, "loss": 1.1306, "step": 1442 }, { "epoch": 0.1837719088781699, "grad_norm": 5.025317562564007, "learning_rate": 1.980013243228754e-05, "loss": 1.0482, "step": 1443 }, { "epoch": 0.18389926293837655, "grad_norm": 4.917699005025452, "learning_rate": 1.979972192702544e-05, "loss": 0.9926, "step": 1444 }, { "epoch": 0.1840266169985832, "grad_norm": 6.695090006422005, "learning_rate": 1.9799311004894314e-05, "loss": 1.0492, "step": 1445 }, { "epoch": 0.1841539710587898, "grad_norm": 5.783826429176605, "learning_rate": 1.9798899665911636e-05, "loss": 0.9291, "step": 1446 }, { "epoch": 0.18428132511899645, "grad_norm": 8.583389528948576, "learning_rate": 1.97984879100949e-05, "loss": 1.0729, "step": 1447 }, { "epoch": 0.1844086791792031, "grad_norm": 4.9845657551297196, "learning_rate": 1.9798075737461627e-05, "loss": 1.081, "step": 1448 }, { "epoch": 0.1845360332394097, "grad_norm": 4.630384249468881, "learning_rate": 1.9797663148029352e-05, "loss": 0.9889, "step": 1449 }, { "epoch": 0.18466338729961634, "grad_norm": 4.459600725990378, "learning_rate": 1.9797250141815617e-05, "loss": 1.0447, "step": 1450 }, { "epoch": 0.18479074135982299, "grad_norm": 7.9994144225384165, "learning_rate": 1.9796836718838e-05, "loss": 1.0084, "step": 1451 }, { "epoch": 0.1849180954200296, "grad_norm": 5.037288099049157, "learning_rate": 1.9796422879114082e-05, "loss": 0.9369, "step": 1452 }, { "epoch": 0.18504544948023624, "grad_norm": 5.12163116638082, "learning_rate": 1.9796008622661472e-05, "loss": 1.0629, "step": 1453 }, { "epoch": 0.18517280354044288, "grad_norm": 4.711496399073382, "learning_rate": 1.9795593949497786e-05, "loss": 1.0271, "step": 1454 }, { "epoch": 0.1853001576006495, "grad_norm": 6.076948429563655, "learning_rate": 1.979517885964067e-05, "loss": 0.9624, "step": 1455 }, { "epoch": 0.18542751166085614, "grad_norm": 4.85950570375367, "learning_rate": 1.979476335310778e-05, "loss": 1.0016, "step": 1456 }, { "epoch": 0.18555486572106278, "grad_norm": 6.515755558306762, "learning_rate": 1.9794347429916786e-05, "loss": 0.9468, "step": 1457 }, { "epoch": 0.1856822197812694, "grad_norm": 4.261883670402554, "learning_rate": 1.9793931090085385e-05, "loss": 0.9449, "step": 1458 }, { "epoch": 0.18580957384147603, "grad_norm": 5.087873755370306, "learning_rate": 1.9793514333631287e-05, "loss": 1.0691, "step": 1459 }, { "epoch": 0.18593692790168267, "grad_norm": 6.254327115367058, "learning_rate": 1.9793097160572223e-05, "loss": 1.0433, "step": 1460 }, { "epoch": 0.1860642819618893, "grad_norm": 6.42593203660469, "learning_rate": 1.9792679570925933e-05, "loss": 1.0038, "step": 1461 }, { "epoch": 0.18619163602209593, "grad_norm": 4.263729582608785, "learning_rate": 1.9792261564710188e-05, "loss": 1.041, "step": 1462 }, { "epoch": 0.18631899008230257, "grad_norm": 4.486672480798122, "learning_rate": 1.9791843141942763e-05, "loss": 1.0097, "step": 1463 }, { "epoch": 0.18644634414250918, "grad_norm": 8.109797883813664, "learning_rate": 1.979142430264146e-05, "loss": 0.9008, "step": 1464 }, { "epoch": 0.18657369820271583, "grad_norm": 5.566391858534027, "learning_rate": 1.97910050468241e-05, "loss": 0.9353, "step": 1465 }, { "epoch": 0.18670105226292247, "grad_norm": 5.093553410580432, "learning_rate": 1.979058537450851e-05, "loss": 0.9952, "step": 1466 }, { "epoch": 0.18682840632312908, "grad_norm": 6.860038429797663, "learning_rate": 1.979016528571255e-05, "loss": 0.9609, "step": 1467 }, { "epoch": 0.18695576038333572, "grad_norm": 6.188720582735326, "learning_rate": 1.9789744780454082e-05, "loss": 1.0192, "step": 1468 }, { "epoch": 0.18708311444354236, "grad_norm": 5.187779614369765, "learning_rate": 1.9789323858751e-05, "loss": 1.1023, "step": 1469 }, { "epoch": 0.18721046850374898, "grad_norm": 5.060137161989241, "learning_rate": 1.978890252062121e-05, "loss": 1.0415, "step": 1470 }, { "epoch": 0.18733782256395562, "grad_norm": 4.245005646287378, "learning_rate": 1.9788480766082626e-05, "loss": 1.0162, "step": 1471 }, { "epoch": 0.18746517662416226, "grad_norm": 5.212867024991071, "learning_rate": 1.9788058595153202e-05, "loss": 0.8992, "step": 1472 }, { "epoch": 0.18759253068436887, "grad_norm": 3.5576216105774727, "learning_rate": 1.978763600785089e-05, "loss": 0.9837, "step": 1473 }, { "epoch": 0.18771988474457552, "grad_norm": 4.995126447174946, "learning_rate": 1.9787213004193665e-05, "loss": 0.967, "step": 1474 }, { "epoch": 0.18784723880478216, "grad_norm": 5.481986109719116, "learning_rate": 1.9786789584199523e-05, "loss": 0.9039, "step": 1475 }, { "epoch": 0.18797459286498877, "grad_norm": 5.599221052763061, "learning_rate": 1.9786365747886475e-05, "loss": 1.0468, "step": 1476 }, { "epoch": 0.1881019469251954, "grad_norm": 4.9890705819427374, "learning_rate": 1.9785941495272553e-05, "loss": 1.0024, "step": 1477 }, { "epoch": 0.18822930098540205, "grad_norm": 6.845984246635087, "learning_rate": 1.9785516826375805e-05, "loss": 0.9939, "step": 1478 }, { "epoch": 0.18835665504560867, "grad_norm": 4.7652065406059565, "learning_rate": 1.978509174121429e-05, "loss": 0.946, "step": 1479 }, { "epoch": 0.1884840091058153, "grad_norm": 7.010725251690263, "learning_rate": 1.978466623980609e-05, "loss": 1.0261, "step": 1480 }, { "epoch": 0.18861136316602195, "grad_norm": 5.342474712763604, "learning_rate": 1.9784240322169316e-05, "loss": 1.0171, "step": 1481 }, { "epoch": 0.18873871722622856, "grad_norm": 6.2207481969521, "learning_rate": 1.9783813988322076e-05, "loss": 1.0655, "step": 1482 }, { "epoch": 0.1888660712864352, "grad_norm": 6.051037206093783, "learning_rate": 1.9783387238282513e-05, "loss": 0.9079, "step": 1483 }, { "epoch": 0.18899342534664182, "grad_norm": 5.941004953369172, "learning_rate": 1.9782960072068772e-05, "loss": 1.0933, "step": 1484 }, { "epoch": 0.18912077940684846, "grad_norm": 3.9146807214716772, "learning_rate": 1.978253248969903e-05, "loss": 1.0632, "step": 1485 }, { "epoch": 0.1892481334670551, "grad_norm": 5.4431079577859105, "learning_rate": 1.9782104491191475e-05, "loss": 0.9911, "step": 1486 }, { "epoch": 0.18937548752726172, "grad_norm": 4.052988037995278, "learning_rate": 1.9781676076564316e-05, "loss": 1.0377, "step": 1487 }, { "epoch": 0.18950284158746836, "grad_norm": 5.62963947124253, "learning_rate": 1.978124724583577e-05, "loss": 0.9062, "step": 1488 }, { "epoch": 0.189630195647675, "grad_norm": 5.138572399027655, "learning_rate": 1.978081799902409e-05, "loss": 0.9558, "step": 1489 }, { "epoch": 0.1897575497078816, "grad_norm": 4.889826407754536, "learning_rate": 1.9780388336147524e-05, "loss": 1.0003, "step": 1490 }, { "epoch": 0.18988490376808825, "grad_norm": 7.62897875429576, "learning_rate": 1.9779958257224355e-05, "loss": 1.0335, "step": 1491 }, { "epoch": 0.1900122578282949, "grad_norm": 6.335914922220099, "learning_rate": 1.9779527762272877e-05, "loss": 1.0124, "step": 1492 }, { "epoch": 0.1901396118885015, "grad_norm": 4.470530335373219, "learning_rate": 1.9779096851311406e-05, "loss": 1.093, "step": 1493 }, { "epoch": 0.19026696594870815, "grad_norm": 8.02603348111266, "learning_rate": 1.977866552435827e-05, "loss": 1.0535, "step": 1494 }, { "epoch": 0.1903943200089148, "grad_norm": 4.870751901973264, "learning_rate": 1.9778233781431814e-05, "loss": 1.0418, "step": 1495 }, { "epoch": 0.1905216740691214, "grad_norm": 4.832118681946968, "learning_rate": 1.977780162255041e-05, "loss": 1.0292, "step": 1496 }, { "epoch": 0.19064902812932805, "grad_norm": 5.930854692540798, "learning_rate": 1.9777369047732438e-05, "loss": 0.9462, "step": 1497 }, { "epoch": 0.1907763821895347, "grad_norm": 5.495050110245704, "learning_rate": 1.9776936056996297e-05, "loss": 0.8484, "step": 1498 }, { "epoch": 0.1909037362497413, "grad_norm": 5.508620355758505, "learning_rate": 1.977650265036041e-05, "loss": 1.0204, "step": 1499 }, { "epoch": 0.19103109030994794, "grad_norm": 5.195661444167357, "learning_rate": 1.9776068827843214e-05, "loss": 0.9216, "step": 1500 }, { "epoch": 0.19115844437015458, "grad_norm": 7.5291732520655374, "learning_rate": 1.9775634589463158e-05, "loss": 0.9515, "step": 1501 }, { "epoch": 0.1912857984303612, "grad_norm": 3.888663536687444, "learning_rate": 1.977519993523872e-05, "loss": 0.9452, "step": 1502 }, { "epoch": 0.19141315249056784, "grad_norm": 5.3393483019284895, "learning_rate": 1.9774764865188388e-05, "loss": 0.9975, "step": 1503 }, { "epoch": 0.19154050655077448, "grad_norm": 7.013216892528038, "learning_rate": 1.977432937933067e-05, "loss": 0.9489, "step": 1504 }, { "epoch": 0.1916678606109811, "grad_norm": 4.091945570836384, "learning_rate": 1.9773893477684086e-05, "loss": 1.0263, "step": 1505 }, { "epoch": 0.19179521467118774, "grad_norm": 6.133647497396183, "learning_rate": 1.977345716026718e-05, "loss": 0.9503, "step": 1506 }, { "epoch": 0.19192256873139438, "grad_norm": 6.061068965688803, "learning_rate": 1.9773020427098522e-05, "loss": 0.996, "step": 1507 }, { "epoch": 0.192049922791601, "grad_norm": 5.329895262413773, "learning_rate": 1.9772583278196677e-05, "loss": 0.988, "step": 1508 }, { "epoch": 0.19217727685180763, "grad_norm": 6.2580468828024545, "learning_rate": 1.977214571358025e-05, "loss": 0.9768, "step": 1509 }, { "epoch": 0.19230463091201427, "grad_norm": 5.9261243566210045, "learning_rate": 1.9771707733267852e-05, "loss": 0.9682, "step": 1510 }, { "epoch": 0.1924319849722209, "grad_norm": 5.140494208715624, "learning_rate": 1.977126933727811e-05, "loss": 1.08, "step": 1511 }, { "epoch": 0.19255933903242753, "grad_norm": 6.7300714252643825, "learning_rate": 1.977083052562968e-05, "loss": 1.11, "step": 1512 }, { "epoch": 0.19268669309263417, "grad_norm": 5.585036667965532, "learning_rate": 1.977039129834122e-05, "loss": 1.1374, "step": 1513 }, { "epoch": 0.19281404715284078, "grad_norm": 4.049696480009359, "learning_rate": 1.9769951655431426e-05, "loss": 0.9615, "step": 1514 }, { "epoch": 0.19294140121304743, "grad_norm": 5.976439949723937, "learning_rate": 1.976951159691899e-05, "loss": 1.0128, "step": 1515 }, { "epoch": 0.19306875527325407, "grad_norm": 4.616711174007855, "learning_rate": 1.9769071122822634e-05, "loss": 1.0134, "step": 1516 }, { "epoch": 0.19319610933346068, "grad_norm": 6.802833628690229, "learning_rate": 1.97686302331611e-05, "loss": 0.8562, "step": 1517 }, { "epoch": 0.19332346339366732, "grad_norm": 4.702031512207982, "learning_rate": 1.9768188927953134e-05, "loss": 1.0063, "step": 1518 }, { "epoch": 0.19345081745387396, "grad_norm": 4.7894770830185385, "learning_rate": 1.9767747207217516e-05, "loss": 0.9469, "step": 1519 }, { "epoch": 0.19357817151408058, "grad_norm": 5.886225263924752, "learning_rate": 1.9767305070973033e-05, "loss": 1.0466, "step": 1520 }, { "epoch": 0.19370552557428722, "grad_norm": 4.56563617096565, "learning_rate": 1.9766862519238493e-05, "loss": 1.0864, "step": 1521 }, { "epoch": 0.19383287963449386, "grad_norm": 5.274761480208912, "learning_rate": 1.9766419552032723e-05, "loss": 1.0344, "step": 1522 }, { "epoch": 0.19396023369470047, "grad_norm": 6.4716598673917485, "learning_rate": 1.9765976169374566e-05, "loss": 1.0106, "step": 1523 }, { "epoch": 0.19408758775490711, "grad_norm": 6.49377231098414, "learning_rate": 1.9765532371282882e-05, "loss": 1.0763, "step": 1524 }, { "epoch": 0.19421494181511376, "grad_norm": 5.465689014999976, "learning_rate": 1.976508815777655e-05, "loss": 0.9861, "step": 1525 }, { "epoch": 0.19434229587532037, "grad_norm": 5.770104464373687, "learning_rate": 1.976464352887447e-05, "loss": 1.0394, "step": 1526 }, { "epoch": 0.194469649935527, "grad_norm": 7.618250626931756, "learning_rate": 1.9764198484595553e-05, "loss": 1.0921, "step": 1527 }, { "epoch": 0.19459700399573365, "grad_norm": 4.240896460573713, "learning_rate": 1.9763753024958724e-05, "loss": 0.9663, "step": 1528 }, { "epoch": 0.19472435805594027, "grad_norm": 4.724200807830366, "learning_rate": 1.9763307149982945e-05, "loss": 1.1107, "step": 1529 }, { "epoch": 0.1948517121161469, "grad_norm": 6.047163924379785, "learning_rate": 1.9762860859687173e-05, "loss": 0.9791, "step": 1530 }, { "epoch": 0.19497906617635352, "grad_norm": 5.820781427733166, "learning_rate": 1.9762414154090398e-05, "loss": 0.9669, "step": 1531 }, { "epoch": 0.19510642023656016, "grad_norm": 6.023807657745205, "learning_rate": 1.976196703321162e-05, "loss": 0.953, "step": 1532 }, { "epoch": 0.1952337742967668, "grad_norm": 6.153987255679557, "learning_rate": 1.9761519497069863e-05, "loss": 0.9781, "step": 1533 }, { "epoch": 0.19536112835697342, "grad_norm": 6.374491054190222, "learning_rate": 1.9761071545684158e-05, "loss": 0.9647, "step": 1534 }, { "epoch": 0.19548848241718006, "grad_norm": 4.914819316472806, "learning_rate": 1.9760623179073568e-05, "loss": 1.0278, "step": 1535 }, { "epoch": 0.1956158364773867, "grad_norm": 5.046922051650349, "learning_rate": 1.9760174397257158e-05, "loss": 0.9767, "step": 1536 }, { "epoch": 0.19574319053759331, "grad_norm": 6.035067442206626, "learning_rate": 1.9759725200254027e-05, "loss": 0.9667, "step": 1537 }, { "epoch": 0.19587054459779996, "grad_norm": 7.227809225227867, "learning_rate": 1.9759275588083273e-05, "loss": 0.8844, "step": 1538 }, { "epoch": 0.1959978986580066, "grad_norm": 5.3397633812304175, "learning_rate": 1.975882556076403e-05, "loss": 0.88, "step": 1539 }, { "epoch": 0.1961252527182132, "grad_norm": 5.160049056182221, "learning_rate": 1.975837511831544e-05, "loss": 0.9729, "step": 1540 }, { "epoch": 0.19625260677841985, "grad_norm": 5.5355201230080056, "learning_rate": 1.9757924260756665e-05, "loss": 0.983, "step": 1541 }, { "epoch": 0.1963799608386265, "grad_norm": 5.863808218740776, "learning_rate": 1.975747298810688e-05, "loss": 0.9425, "step": 1542 }, { "epoch": 0.1965073148988331, "grad_norm": 4.844791423453396, "learning_rate": 1.9757021300385288e-05, "loss": 0.9033, "step": 1543 }, { "epoch": 0.19663466895903975, "grad_norm": 5.3108868113088405, "learning_rate": 1.9756569197611098e-05, "loss": 1.0034, "step": 1544 }, { "epoch": 0.1967620230192464, "grad_norm": 5.669919856613489, "learning_rate": 1.975611667980354e-05, "loss": 1.0133, "step": 1545 }, { "epoch": 0.196889377079453, "grad_norm": 5.563726225720538, "learning_rate": 1.9755663746981873e-05, "loss": 1.0517, "step": 1546 }, { "epoch": 0.19701673113965965, "grad_norm": 4.65256759143498, "learning_rate": 1.9755210399165358e-05, "loss": 0.9638, "step": 1547 }, { "epoch": 0.1971440851998663, "grad_norm": 5.692109304765883, "learning_rate": 1.9754756636373277e-05, "loss": 1.0341, "step": 1548 }, { "epoch": 0.1972714392600729, "grad_norm": 5.938750125384786, "learning_rate": 1.975430245862494e-05, "loss": 1.0953, "step": 1549 }, { "epoch": 0.19739879332027954, "grad_norm": 4.856150646003446, "learning_rate": 1.9753847865939657e-05, "loss": 0.9406, "step": 1550 }, { "epoch": 0.19752614738048618, "grad_norm": 4.081784765659373, "learning_rate": 1.9753392858336776e-05, "loss": 1.0257, "step": 1551 }, { "epoch": 0.1976535014406928, "grad_norm": 5.313787865208813, "learning_rate": 1.975293743583565e-05, "loss": 1.0489, "step": 1552 }, { "epoch": 0.19778085550089944, "grad_norm": 5.256694158959123, "learning_rate": 1.9752481598455648e-05, "loss": 1.0289, "step": 1553 }, { "epoch": 0.19790820956110608, "grad_norm": 3.9844003713959366, "learning_rate": 1.9752025346216164e-05, "loss": 1.0425, "step": 1554 }, { "epoch": 0.1980355636213127, "grad_norm": 6.3238206329291895, "learning_rate": 1.975156867913661e-05, "loss": 0.9651, "step": 1555 }, { "epoch": 0.19816291768151933, "grad_norm": 4.594948125801704, "learning_rate": 1.97511115972364e-05, "loss": 1.0036, "step": 1556 }, { "epoch": 0.19829027174172598, "grad_norm": 5.107193694114138, "learning_rate": 1.9750654100534992e-05, "loss": 1.0419, "step": 1557 }, { "epoch": 0.1984176258019326, "grad_norm": 5.134129835650002, "learning_rate": 1.9750196189051837e-05, "loss": 0.9756, "step": 1558 }, { "epoch": 0.19854497986213923, "grad_norm": 4.923212553870282, "learning_rate": 1.974973786280642e-05, "loss": 0.9265, "step": 1559 }, { "epoch": 0.19867233392234587, "grad_norm": 3.993883702005666, "learning_rate": 1.9749279121818235e-05, "loss": 0.9821, "step": 1560 }, { "epoch": 0.1987996879825525, "grad_norm": 5.936873553507101, "learning_rate": 1.97488199661068e-05, "loss": 1.1268, "step": 1561 }, { "epoch": 0.19892704204275913, "grad_norm": 5.895892714705748, "learning_rate": 1.9748360395691644e-05, "loss": 1.0248, "step": 1562 }, { "epoch": 0.19905439610296577, "grad_norm": 6.590647469234013, "learning_rate": 1.9747900410592314e-05, "loss": 0.8957, "step": 1563 }, { "epoch": 0.19918175016317238, "grad_norm": 4.850838673233757, "learning_rate": 1.9747440010828384e-05, "loss": 0.9782, "step": 1564 }, { "epoch": 0.19930910422337902, "grad_norm": 4.491532730813909, "learning_rate": 1.974697919641943e-05, "loss": 1.0488, "step": 1565 }, { "epoch": 0.19943645828358567, "grad_norm": 6.2761674501680424, "learning_rate": 1.974651796738506e-05, "loss": 0.9976, "step": 1566 }, { "epoch": 0.19956381234379228, "grad_norm": 3.8229999723721324, "learning_rate": 1.9746056323744897e-05, "loss": 1.0592, "step": 1567 }, { "epoch": 0.19969116640399892, "grad_norm": 6.720524043585452, "learning_rate": 1.9745594265518574e-05, "loss": 0.9593, "step": 1568 }, { "epoch": 0.19981852046420556, "grad_norm": 4.296872336646902, "learning_rate": 1.9745131792725748e-05, "loss": 0.9543, "step": 1569 }, { "epoch": 0.19994587452441218, "grad_norm": 3.8372903524284334, "learning_rate": 1.974466890538609e-05, "loss": 0.9131, "step": 1570 }, { "epoch": 0.20007322858461882, "grad_norm": 5.553138656470616, "learning_rate": 1.9744205603519293e-05, "loss": 0.9076, "step": 1571 }, { "epoch": 0.20020058264482546, "grad_norm": 6.149281231816345, "learning_rate": 1.9743741887145067e-05, "loss": 1.1001, "step": 1572 }, { "epoch": 0.20032793670503207, "grad_norm": 4.25695904218789, "learning_rate": 1.9743277756283133e-05, "loss": 0.9958, "step": 1573 }, { "epoch": 0.20045529076523871, "grad_norm": 5.06832475301277, "learning_rate": 1.974281321095324e-05, "loss": 0.9478, "step": 1574 }, { "epoch": 0.20058264482544533, "grad_norm": 5.257481285421065, "learning_rate": 1.9742348251175146e-05, "loss": 0.9822, "step": 1575 }, { "epoch": 0.20070999888565197, "grad_norm": 5.073481575950944, "learning_rate": 1.974188287696863e-05, "loss": 0.9306, "step": 1576 }, { "epoch": 0.2008373529458586, "grad_norm": 5.268382632298335, "learning_rate": 1.974141708835349e-05, "loss": 1.0149, "step": 1577 }, { "epoch": 0.20096470700606522, "grad_norm": 9.723198856770377, "learning_rate": 1.9740950885349536e-05, "loss": 1.0157, "step": 1578 }, { "epoch": 0.20109206106627187, "grad_norm": 5.560161827806057, "learning_rate": 1.9740484267976608e-05, "loss": 0.9826, "step": 1579 }, { "epoch": 0.2012194151264785, "grad_norm": 5.714445234524424, "learning_rate": 1.974001723625455e-05, "loss": 0.9877, "step": 1580 }, { "epoch": 0.20134676918668512, "grad_norm": 4.86436044752818, "learning_rate": 1.9739549790203224e-05, "loss": 1.0302, "step": 1581 }, { "epoch": 0.20147412324689176, "grad_norm": 5.9878916952751995, "learning_rate": 1.973908192984252e-05, "loss": 0.9946, "step": 1582 }, { "epoch": 0.2016014773070984, "grad_norm": 6.879338230864136, "learning_rate": 1.9738613655192345e-05, "loss": 0.9916, "step": 1583 }, { "epoch": 0.20172883136730502, "grad_norm": 4.571181099017479, "learning_rate": 1.973814496627261e-05, "loss": 1.1194, "step": 1584 }, { "epoch": 0.20185618542751166, "grad_norm": 5.342571284702611, "learning_rate": 1.9737675863103257e-05, "loss": 0.9541, "step": 1585 }, { "epoch": 0.2019835394877183, "grad_norm": 5.99493798663698, "learning_rate": 1.9737206345704244e-05, "loss": 1.0448, "step": 1586 }, { "epoch": 0.2021108935479249, "grad_norm": 5.206679954039697, "learning_rate": 1.9736736414095538e-05, "loss": 0.9584, "step": 1587 }, { "epoch": 0.20223824760813155, "grad_norm": 6.131071545787127, "learning_rate": 1.973626606829713e-05, "loss": 0.9996, "step": 1588 }, { "epoch": 0.2023656016683382, "grad_norm": 5.296658401786897, "learning_rate": 1.9735795308329037e-05, "loss": 0.9626, "step": 1589 }, { "epoch": 0.2024929557285448, "grad_norm": 4.480860805236891, "learning_rate": 1.973532413421127e-05, "loss": 0.8541, "step": 1590 }, { "epoch": 0.20262030978875145, "grad_norm": 5.458814740488239, "learning_rate": 1.973485254596388e-05, "loss": 0.942, "step": 1591 }, { "epoch": 0.2027476638489581, "grad_norm": 5.60844851522458, "learning_rate": 1.9734380543606932e-05, "loss": 1.0469, "step": 1592 }, { "epoch": 0.2028750179091647, "grad_norm": 3.9807266586051577, "learning_rate": 1.9733908127160495e-05, "loss": 0.9929, "step": 1593 }, { "epoch": 0.20300237196937135, "grad_norm": 5.214944879929989, "learning_rate": 1.973343529664467e-05, "loss": 0.8852, "step": 1594 }, { "epoch": 0.203129726029578, "grad_norm": 5.241255425764933, "learning_rate": 1.9732962052079575e-05, "loss": 0.9598, "step": 1595 }, { "epoch": 0.2032570800897846, "grad_norm": 5.1585897050212255, "learning_rate": 1.9732488393485332e-05, "loss": 1.0306, "step": 1596 }, { "epoch": 0.20338443414999124, "grad_norm": 6.224674029413985, "learning_rate": 1.9732014320882096e-05, "loss": 0.9012, "step": 1597 }, { "epoch": 0.20351178821019789, "grad_norm": 4.540211622372655, "learning_rate": 1.9731539834290034e-05, "loss": 1.0735, "step": 1598 }, { "epoch": 0.2036391422704045, "grad_norm": 5.2255767211946855, "learning_rate": 1.9731064933729324e-05, "loss": 0.8904, "step": 1599 }, { "epoch": 0.20376649633061114, "grad_norm": 5.265525749018251, "learning_rate": 1.9730589619220177e-05, "loss": 0.9952, "step": 1600 }, { "epoch": 0.20389385039081778, "grad_norm": 4.648503766268739, "learning_rate": 1.9730113890782804e-05, "loss": 1.0357, "step": 1601 }, { "epoch": 0.2040212044510244, "grad_norm": 5.008336936877452, "learning_rate": 1.9729637748437448e-05, "loss": 0.9865, "step": 1602 }, { "epoch": 0.20414855851123104, "grad_norm": 4.832030263174414, "learning_rate": 1.972916119220436e-05, "loss": 1.0622, "step": 1603 }, { "epoch": 0.20427591257143768, "grad_norm": 8.765669824278218, "learning_rate": 1.9728684222103812e-05, "loss": 0.9047, "step": 1604 }, { "epoch": 0.2044032666316443, "grad_norm": 5.656930787651489, "learning_rate": 1.9728206838156098e-05, "loss": 0.9878, "step": 1605 }, { "epoch": 0.20453062069185093, "grad_norm": 3.652824427343459, "learning_rate": 1.9727729040381517e-05, "loss": 0.906, "step": 1606 }, { "epoch": 0.20465797475205758, "grad_norm": 4.8632365397041175, "learning_rate": 1.97272508288004e-05, "loss": 1.0788, "step": 1607 }, { "epoch": 0.2047853288122642, "grad_norm": 6.17770195662139, "learning_rate": 1.972677220343309e-05, "loss": 1.0285, "step": 1608 }, { "epoch": 0.20491268287247083, "grad_norm": 5.537076995055236, "learning_rate": 1.972629316429995e-05, "loss": 0.9512, "step": 1609 }, { "epoch": 0.20504003693267747, "grad_norm": 4.737506296762595, "learning_rate": 1.972581371142135e-05, "loss": 0.9401, "step": 1610 }, { "epoch": 0.20516739099288409, "grad_norm": 5.31851788920953, "learning_rate": 1.9725333844817688e-05, "loss": 1.0683, "step": 1611 }, { "epoch": 0.20529474505309073, "grad_norm": 5.594261103461401, "learning_rate": 1.972485356450938e-05, "loss": 0.8846, "step": 1612 }, { "epoch": 0.20542209911329737, "grad_norm": 5.099392439612177, "learning_rate": 1.972437287051685e-05, "loss": 1.0609, "step": 1613 }, { "epoch": 0.20554945317350398, "grad_norm": 6.0986471208048725, "learning_rate": 1.9723891762860557e-05, "loss": 1.0413, "step": 1614 }, { "epoch": 0.20567680723371062, "grad_norm": 5.832802375970654, "learning_rate": 1.9723410241560958e-05, "loss": 1.043, "step": 1615 }, { "epoch": 0.20580416129391726, "grad_norm": 5.085532270422708, "learning_rate": 1.972292830663854e-05, "loss": 0.9361, "step": 1616 }, { "epoch": 0.20593151535412388, "grad_norm": 7.483943281683596, "learning_rate": 1.9722445958113803e-05, "loss": 1.0771, "step": 1617 }, { "epoch": 0.20605886941433052, "grad_norm": 7.7429546203268815, "learning_rate": 1.9721963196007263e-05, "loss": 0.9862, "step": 1618 }, { "epoch": 0.20618622347453716, "grad_norm": 4.275373003118664, "learning_rate": 1.972148002033946e-05, "loss": 0.9736, "step": 1619 }, { "epoch": 0.20631357753474378, "grad_norm": 4.442061108412303, "learning_rate": 1.9720996431130946e-05, "loss": 0.962, "step": 1620 }, { "epoch": 0.20644093159495042, "grad_norm": 4.36612542464768, "learning_rate": 1.9720512428402293e-05, "loss": 0.9597, "step": 1621 }, { "epoch": 0.20656828565515703, "grad_norm": 5.255502269780313, "learning_rate": 1.9720028012174094e-05, "loss": 0.9574, "step": 1622 }, { "epoch": 0.20669563971536367, "grad_norm": 5.427818009431217, "learning_rate": 1.9719543182466944e-05, "loss": 1.0017, "step": 1623 }, { "epoch": 0.2068229937755703, "grad_norm": 5.819726533003532, "learning_rate": 1.9719057939301477e-05, "loss": 0.9151, "step": 1624 }, { "epoch": 0.20695034783577693, "grad_norm": 5.716644399764548, "learning_rate": 1.9718572282698335e-05, "loss": 1.0101, "step": 1625 }, { "epoch": 0.20707770189598357, "grad_norm": 4.933284650167188, "learning_rate": 1.971808621267817e-05, "loss": 0.9385, "step": 1626 }, { "epoch": 0.2072050559561902, "grad_norm": 4.628939626066382, "learning_rate": 1.9717599729261666e-05, "loss": 1.0506, "step": 1627 }, { "epoch": 0.20733241001639682, "grad_norm": 5.3615858672677215, "learning_rate": 1.971711283246951e-05, "loss": 1.0117, "step": 1628 }, { "epoch": 0.20745976407660346, "grad_norm": 6.214288232361231, "learning_rate": 1.971662552232242e-05, "loss": 0.9271, "step": 1629 }, { "epoch": 0.2075871181368101, "grad_norm": 6.244529771655492, "learning_rate": 1.971613779884113e-05, "loss": 1.1197, "step": 1630 }, { "epoch": 0.20771447219701672, "grad_norm": 6.029004879854435, "learning_rate": 1.9715649662046378e-05, "loss": 1.0605, "step": 1631 }, { "epoch": 0.20784182625722336, "grad_norm": 5.489495215773398, "learning_rate": 1.971516111195893e-05, "loss": 0.9659, "step": 1632 }, { "epoch": 0.20796918031743, "grad_norm": 5.247503867981655, "learning_rate": 1.971467214859957e-05, "loss": 0.8921, "step": 1633 }, { "epoch": 0.20809653437763662, "grad_norm": 7.338572393551121, "learning_rate": 1.97141827719891e-05, "loss": 1.0235, "step": 1634 }, { "epoch": 0.20822388843784326, "grad_norm": 6.658093694099257, "learning_rate": 1.971369298214834e-05, "loss": 1.0838, "step": 1635 }, { "epoch": 0.2083512424980499, "grad_norm": 4.663091357163081, "learning_rate": 1.9713202779098118e-05, "loss": 1.0795, "step": 1636 }, { "epoch": 0.2084785965582565, "grad_norm": 5.371071910466963, "learning_rate": 1.9712712162859287e-05, "loss": 1.1829, "step": 1637 }, { "epoch": 0.20860595061846315, "grad_norm": 5.334058751998837, "learning_rate": 1.9712221133452722e-05, "loss": 1.0557, "step": 1638 }, { "epoch": 0.2087333046786698, "grad_norm": 3.8505080445218827, "learning_rate": 1.971172969089931e-05, "loss": 0.9079, "step": 1639 }, { "epoch": 0.2088606587388764, "grad_norm": 5.877852700967215, "learning_rate": 1.9711237835219955e-05, "loss": 0.9663, "step": 1640 }, { "epoch": 0.20898801279908305, "grad_norm": 5.391358610350945, "learning_rate": 1.9710745566435578e-05, "loss": 0.993, "step": 1641 }, { "epoch": 0.2091153668592897, "grad_norm": 5.2566674900155075, "learning_rate": 1.9710252884567124e-05, "loss": 1.0123, "step": 1642 }, { "epoch": 0.2092427209194963, "grad_norm": 4.4334306997826625, "learning_rate": 1.970975978963555e-05, "loss": 0.928, "step": 1643 }, { "epoch": 0.20937007497970295, "grad_norm": 4.981977214882108, "learning_rate": 1.970926628166183e-05, "loss": 1.0223, "step": 1644 }, { "epoch": 0.2094974290399096, "grad_norm": 4.961837278454065, "learning_rate": 1.9708772360666958e-05, "loss": 0.9146, "step": 1645 }, { "epoch": 0.2096247831001162, "grad_norm": 5.469682363322358, "learning_rate": 1.970827802667194e-05, "loss": 0.9955, "step": 1646 }, { "epoch": 0.20975213716032284, "grad_norm": 7.40464849097527, "learning_rate": 1.9707783279697816e-05, "loss": 0.9915, "step": 1647 }, { "epoch": 0.20987949122052948, "grad_norm": 4.157092245708658, "learning_rate": 1.9707288119765625e-05, "loss": 1.0057, "step": 1648 }, { "epoch": 0.2100068452807361, "grad_norm": 5.203421186201981, "learning_rate": 1.9706792546896425e-05, "loss": 1.0078, "step": 1649 }, { "epoch": 0.21013419934094274, "grad_norm": 4.969316942031503, "learning_rate": 1.9706296561111308e-05, "loss": 1.0448, "step": 1650 }, { "epoch": 0.21026155340114938, "grad_norm": 4.850189456975334, "learning_rate": 1.9705800162431365e-05, "loss": 1.0313, "step": 1651 }, { "epoch": 0.210388907461356, "grad_norm": 6.328925489493655, "learning_rate": 1.9705303350877714e-05, "loss": 1.047, "step": 1652 }, { "epoch": 0.21051626152156264, "grad_norm": 5.55827205153297, "learning_rate": 1.970480612647149e-05, "loss": 1.0637, "step": 1653 }, { "epoch": 0.21064361558176928, "grad_norm": 6.3722659961571955, "learning_rate": 1.9704308489233846e-05, "loss": 1.047, "step": 1654 }, { "epoch": 0.2107709696419759, "grad_norm": 4.8103105593092375, "learning_rate": 1.9703810439185946e-05, "loss": 1.0598, "step": 1655 }, { "epoch": 0.21089832370218253, "grad_norm": 8.339356140604846, "learning_rate": 1.970331197634898e-05, "loss": 0.9091, "step": 1656 }, { "epoch": 0.21102567776238917, "grad_norm": 4.713283176254165, "learning_rate": 1.970281310074415e-05, "loss": 1.0329, "step": 1657 }, { "epoch": 0.2111530318225958, "grad_norm": 5.932060239417839, "learning_rate": 1.9702313812392683e-05, "loss": 1.0818, "step": 1658 }, { "epoch": 0.21128038588280243, "grad_norm": 5.183738355768398, "learning_rate": 1.9701814111315813e-05, "loss": 1.0107, "step": 1659 }, { "epoch": 0.21140773994300907, "grad_norm": 5.967140180385663, "learning_rate": 1.9701313997534798e-05, "loss": 0.9086, "step": 1660 }, { "epoch": 0.21153509400321568, "grad_norm": 4.887698336818495, "learning_rate": 1.9700813471070907e-05, "loss": 1.0905, "step": 1661 }, { "epoch": 0.21166244806342233, "grad_norm": 6.2653132715471616, "learning_rate": 1.9700312531945444e-05, "loss": 0.963, "step": 1662 }, { "epoch": 0.21178980212362897, "grad_norm": 6.325315553536174, "learning_rate": 1.9699811180179704e-05, "loss": 0.9581, "step": 1663 }, { "epoch": 0.21191715618383558, "grad_norm": 4.819017233161328, "learning_rate": 1.9699309415795027e-05, "loss": 1.023, "step": 1664 }, { "epoch": 0.21204451024404222, "grad_norm": 5.740437392739419, "learning_rate": 1.9698807238812748e-05, "loss": 0.9332, "step": 1665 }, { "epoch": 0.21217186430424884, "grad_norm": 5.980074858946459, "learning_rate": 1.9698304649254235e-05, "loss": 1.0194, "step": 1666 }, { "epoch": 0.21229921836445548, "grad_norm": 4.768733977244646, "learning_rate": 1.9697801647140865e-05, "loss": 1.0535, "step": 1667 }, { "epoch": 0.21242657242466212, "grad_norm": 5.7997365299760535, "learning_rate": 1.9697298232494037e-05, "loss": 0.9816, "step": 1668 }, { "epoch": 0.21255392648486873, "grad_norm": 5.853278729669982, "learning_rate": 1.969679440533516e-05, "loss": 0.9141, "step": 1669 }, { "epoch": 0.21268128054507537, "grad_norm": 5.938993888444609, "learning_rate": 1.9696290165685674e-05, "loss": 0.9569, "step": 1670 }, { "epoch": 0.21280863460528202, "grad_norm": 5.824831771058527, "learning_rate": 1.9695785513567024e-05, "loss": 1.076, "step": 1671 }, { "epoch": 0.21293598866548863, "grad_norm": 9.641584104438895, "learning_rate": 1.969528044900068e-05, "loss": 1.0234, "step": 1672 }, { "epoch": 0.21306334272569527, "grad_norm": 3.97168845182061, "learning_rate": 1.969477497200812e-05, "loss": 0.9538, "step": 1673 }, { "epoch": 0.2131906967859019, "grad_norm": 8.086281884272529, "learning_rate": 1.9694269082610856e-05, "loss": 0.9032, "step": 1674 }, { "epoch": 0.21331805084610853, "grad_norm": 6.182984833608822, "learning_rate": 1.9693762780830404e-05, "loss": 0.9367, "step": 1675 }, { "epoch": 0.21344540490631517, "grad_norm": 6.781903072851365, "learning_rate": 1.96932560666883e-05, "loss": 1.0375, "step": 1676 }, { "epoch": 0.2135727589665218, "grad_norm": 4.736966773952431, "learning_rate": 1.96927489402061e-05, "loss": 0.9725, "step": 1677 }, { "epoch": 0.21370011302672842, "grad_norm": 5.598186757942846, "learning_rate": 1.9692241401405376e-05, "loss": 0.9808, "step": 1678 }, { "epoch": 0.21382746708693506, "grad_norm": 5.052495985449508, "learning_rate": 1.9691733450307723e-05, "loss": 0.9624, "step": 1679 }, { "epoch": 0.2139548211471417, "grad_norm": 6.730913657850143, "learning_rate": 1.9691225086934743e-05, "loss": 0.9979, "step": 1680 }, { "epoch": 0.21408217520734832, "grad_norm": 6.456859330175002, "learning_rate": 1.969071631130806e-05, "loss": 1.0151, "step": 1681 }, { "epoch": 0.21420952926755496, "grad_norm": 5.2489839206361335, "learning_rate": 1.9690207123449322e-05, "loss": 1.0065, "step": 1682 }, { "epoch": 0.2143368833277616, "grad_norm": 5.993312287482855, "learning_rate": 1.968969752338019e-05, "loss": 1.0205, "step": 1683 }, { "epoch": 0.21446423738796822, "grad_norm": 6.105879854601475, "learning_rate": 1.968918751112233e-05, "loss": 0.9294, "step": 1684 }, { "epoch": 0.21459159144817486, "grad_norm": 4.22773700972384, "learning_rate": 1.968867708669745e-05, "loss": 0.9431, "step": 1685 }, { "epoch": 0.2147189455083815, "grad_norm": 4.66537789305537, "learning_rate": 1.968816625012726e-05, "loss": 0.9277, "step": 1686 }, { "epoch": 0.2148462995685881, "grad_norm": 5.7675083464920505, "learning_rate": 1.968765500143349e-05, "loss": 1.0131, "step": 1687 }, { "epoch": 0.21497365362879475, "grad_norm": 4.252982047072925, "learning_rate": 1.9687143340637885e-05, "loss": 0.9675, "step": 1688 }, { "epoch": 0.2151010076890014, "grad_norm": 4.236290755223603, "learning_rate": 1.9686631267762216e-05, "loss": 0.9622, "step": 1689 }, { "epoch": 0.215228361749208, "grad_norm": 6.703800763121683, "learning_rate": 1.968611878282826e-05, "loss": 1.0032, "step": 1690 }, { "epoch": 0.21535571580941465, "grad_norm": 3.940146147442758, "learning_rate": 1.968560588585782e-05, "loss": 0.9759, "step": 1691 }, { "epoch": 0.2154830698696213, "grad_norm": 4.799271273927135, "learning_rate": 1.9685092576872717e-05, "loss": 1.0655, "step": 1692 }, { "epoch": 0.2156104239298279, "grad_norm": 6.680157489861707, "learning_rate": 1.9684578855894783e-05, "loss": 0.9126, "step": 1693 }, { "epoch": 0.21573777799003455, "grad_norm": 4.623934442616402, "learning_rate": 1.968406472294587e-05, "loss": 0.8562, "step": 1694 }, { "epoch": 0.2158651320502412, "grad_norm": 6.912497891913811, "learning_rate": 1.9683550178047852e-05, "loss": 1.0111, "step": 1695 }, { "epoch": 0.2159924861104478, "grad_norm": 4.770563687140798, "learning_rate": 1.9683035221222617e-05, "loss": 0.8775, "step": 1696 }, { "epoch": 0.21611984017065444, "grad_norm": 6.298702154748113, "learning_rate": 1.9682519852492066e-05, "loss": 0.9678, "step": 1697 }, { "epoch": 0.21624719423086108, "grad_norm": 4.4266732538319795, "learning_rate": 1.9682004071878128e-05, "loss": 0.9295, "step": 1698 }, { "epoch": 0.2163745482910677, "grad_norm": 4.86071601836413, "learning_rate": 1.968148787940274e-05, "loss": 0.9146, "step": 1699 }, { "epoch": 0.21650190235127434, "grad_norm": 4.516499048667697, "learning_rate": 1.9680971275087862e-05, "loss": 1.0534, "step": 1700 }, { "epoch": 0.21662925641148098, "grad_norm": 3.7220862033214726, "learning_rate": 1.968045425895547e-05, "loss": 0.9642, "step": 1701 }, { "epoch": 0.2167566104716876, "grad_norm": 6.026022429395702, "learning_rate": 1.9679936831027558e-05, "loss": 1.0535, "step": 1702 }, { "epoch": 0.21688396453189424, "grad_norm": 6.490947875880115, "learning_rate": 1.9679418991326134e-05, "loss": 1.0937, "step": 1703 }, { "epoch": 0.21701131859210088, "grad_norm": 5.384847093145054, "learning_rate": 1.967890073987323e-05, "loss": 0.9663, "step": 1704 }, { "epoch": 0.2171386726523075, "grad_norm": 6.540822875724549, "learning_rate": 1.9678382076690883e-05, "loss": 0.9928, "step": 1705 }, { "epoch": 0.21726602671251413, "grad_norm": 5.064704509223324, "learning_rate": 1.9677863001801167e-05, "loss": 1.0063, "step": 1706 }, { "epoch": 0.21739338077272077, "grad_norm": 5.422335646351518, "learning_rate": 1.967734351522616e-05, "loss": 0.9905, "step": 1707 }, { "epoch": 0.2175207348329274, "grad_norm": 4.819619597306839, "learning_rate": 1.9676823616987958e-05, "loss": 1.0503, "step": 1708 }, { "epoch": 0.21764808889313403, "grad_norm": 4.844071309907478, "learning_rate": 1.967630330710868e-05, "loss": 1.0092, "step": 1709 }, { "epoch": 0.21777544295334067, "grad_norm": 6.182490314826213, "learning_rate": 1.967578258561045e-05, "loss": 0.9271, "step": 1710 }, { "epoch": 0.21790279701354728, "grad_norm": 4.720366952500596, "learning_rate": 1.9675261452515434e-05, "loss": 0.9913, "step": 1711 }, { "epoch": 0.21803015107375393, "grad_norm": 5.772697432269532, "learning_rate": 1.967473990784579e-05, "loss": 0.8759, "step": 1712 }, { "epoch": 0.21815750513396054, "grad_norm": 5.444777255796088, "learning_rate": 1.967421795162371e-05, "loss": 0.9836, "step": 1713 }, { "epoch": 0.21828485919416718, "grad_norm": 5.684695044720064, "learning_rate": 1.9673695583871392e-05, "loss": 0.9398, "step": 1714 }, { "epoch": 0.21841221325437382, "grad_norm": 5.116815311786866, "learning_rate": 1.9673172804611057e-05, "loss": 0.8838, "step": 1715 }, { "epoch": 0.21853956731458044, "grad_norm": 5.017741199544419, "learning_rate": 1.9672649613864946e-05, "loss": 0.988, "step": 1716 }, { "epoch": 0.21866692137478708, "grad_norm": 6.147091011630903, "learning_rate": 1.967212601165531e-05, "loss": 1.1042, "step": 1717 }, { "epoch": 0.21879427543499372, "grad_norm": 4.417358464217229, "learning_rate": 1.9671601998004436e-05, "loss": 0.9115, "step": 1718 }, { "epoch": 0.21892162949520033, "grad_norm": 6.183131669998234, "learning_rate": 1.96710775729346e-05, "loss": 1.053, "step": 1719 }, { "epoch": 0.21904898355540697, "grad_norm": 5.625086126727791, "learning_rate": 1.9670552736468117e-05, "loss": 1.0128, "step": 1720 }, { "epoch": 0.21917633761561361, "grad_norm": 4.117668804143339, "learning_rate": 1.9670027488627314e-05, "loss": 1.0047, "step": 1721 }, { "epoch": 0.21930369167582023, "grad_norm": 5.02721456437311, "learning_rate": 1.966950182943453e-05, "loss": 0.9749, "step": 1722 }, { "epoch": 0.21943104573602687, "grad_norm": 5.166146877763162, "learning_rate": 1.966897575891213e-05, "loss": 0.8736, "step": 1723 }, { "epoch": 0.2195583997962335, "grad_norm": 5.609421073703104, "learning_rate": 1.9668449277082492e-05, "loss": 0.8433, "step": 1724 }, { "epoch": 0.21968575385644012, "grad_norm": 5.286167721833049, "learning_rate": 1.966792238396801e-05, "loss": 0.8466, "step": 1725 }, { "epoch": 0.21981310791664677, "grad_norm": 4.961751171277027, "learning_rate": 1.9667395079591096e-05, "loss": 0.9572, "step": 1726 }, { "epoch": 0.2199404619768534, "grad_norm": 5.827625570298563, "learning_rate": 1.9666867363974187e-05, "loss": 1.0829, "step": 1727 }, { "epoch": 0.22006781603706002, "grad_norm": 6.267381835602661, "learning_rate": 1.9666339237139723e-05, "loss": 1.0272, "step": 1728 }, { "epoch": 0.22019517009726666, "grad_norm": 4.6393897470833165, "learning_rate": 1.966581069911018e-05, "loss": 0.941, "step": 1729 }, { "epoch": 0.2203225241574733, "grad_norm": 4.630764230227775, "learning_rate": 1.9665281749908034e-05, "loss": 1.0309, "step": 1730 }, { "epoch": 0.22044987821767992, "grad_norm": 5.536201975914104, "learning_rate": 1.966475238955579e-05, "loss": 1.0152, "step": 1731 }, { "epoch": 0.22057723227788656, "grad_norm": 4.906899767570099, "learning_rate": 1.9664222618075958e-05, "loss": 1.0782, "step": 1732 }, { "epoch": 0.2207045863380932, "grad_norm": 5.744214007840506, "learning_rate": 1.9663692435491084e-05, "loss": 0.9878, "step": 1733 }, { "epoch": 0.22083194039829981, "grad_norm": 4.846756918555281, "learning_rate": 1.966316184182372e-05, "loss": 1.0303, "step": 1734 }, { "epoch": 0.22095929445850646, "grad_norm": 3.883458903060236, "learning_rate": 1.966263083709643e-05, "loss": 1.0827, "step": 1735 }, { "epoch": 0.2210866485187131, "grad_norm": 5.049937733181349, "learning_rate": 1.966209942133181e-05, "loss": 1.0007, "step": 1736 }, { "epoch": 0.2212140025789197, "grad_norm": 3.77498605990836, "learning_rate": 1.966156759455246e-05, "loss": 0.9559, "step": 1737 }, { "epoch": 0.22134135663912635, "grad_norm": 6.192356246123623, "learning_rate": 1.9661035356781007e-05, "loss": 0.9042, "step": 1738 }, { "epoch": 0.221468710699333, "grad_norm": 7.090534035783035, "learning_rate": 1.9660502708040094e-05, "loss": 1.0616, "step": 1739 }, { "epoch": 0.2215960647595396, "grad_norm": 5.8361028726987545, "learning_rate": 1.965996964835237e-05, "loss": 1.0419, "step": 1740 }, { "epoch": 0.22172341881974625, "grad_norm": 5.605022984235532, "learning_rate": 1.9659436177740518e-05, "loss": 0.9657, "step": 1741 }, { "epoch": 0.2218507728799529, "grad_norm": 5.7205697588158575, "learning_rate": 1.965890229622723e-05, "loss": 0.9856, "step": 1742 }, { "epoch": 0.2219781269401595, "grad_norm": 5.185543196797189, "learning_rate": 1.9658368003835223e-05, "loss": 1.0353, "step": 1743 }, { "epoch": 0.22210548100036615, "grad_norm": 9.360388597426082, "learning_rate": 1.965783330058721e-05, "loss": 1.1027, "step": 1744 }, { "epoch": 0.2222328350605728, "grad_norm": 7.419286400959521, "learning_rate": 1.9657298186505952e-05, "loss": 1.0728, "step": 1745 }, { "epoch": 0.2223601891207794, "grad_norm": 6.056021463434238, "learning_rate": 1.96567626616142e-05, "loss": 1.0296, "step": 1746 }, { "epoch": 0.22248754318098604, "grad_norm": 4.617228809565627, "learning_rate": 1.9656226725934745e-05, "loss": 0.9012, "step": 1747 }, { "epoch": 0.22261489724119268, "grad_norm": 4.107763634438576, "learning_rate": 1.965569037949038e-05, "loss": 0.9253, "step": 1748 }, { "epoch": 0.2227422513013993, "grad_norm": 6.390962251377734, "learning_rate": 1.9655153622303918e-05, "loss": 1.0192, "step": 1749 }, { "epoch": 0.22286960536160594, "grad_norm": 4.793158368103816, "learning_rate": 1.9654616454398194e-05, "loss": 0.9457, "step": 1750 }, { "epoch": 0.22299695942181258, "grad_norm": 4.688639998095213, "learning_rate": 1.9654078875796064e-05, "loss": 0.9644, "step": 1751 }, { "epoch": 0.2231243134820192, "grad_norm": 5.905266573942803, "learning_rate": 1.9653540886520387e-05, "loss": 0.9638, "step": 1752 }, { "epoch": 0.22325166754222583, "grad_norm": 5.442243589670223, "learning_rate": 1.9653002486594057e-05, "loss": 1.0104, "step": 1753 }, { "epoch": 0.22337902160243248, "grad_norm": 4.397132693549584, "learning_rate": 1.965246367603997e-05, "loss": 0.9889, "step": 1754 }, { "epoch": 0.2235063756626391, "grad_norm": 5.403560785443919, "learning_rate": 1.965192445488105e-05, "loss": 0.9676, "step": 1755 }, { "epoch": 0.22363372972284573, "grad_norm": 6.487656831407575, "learning_rate": 1.9651384823140237e-05, "loss": 1.1338, "step": 1756 }, { "epoch": 0.22376108378305234, "grad_norm": 5.990041097203875, "learning_rate": 1.9650844780840475e-05, "loss": 1.0659, "step": 1757 }, { "epoch": 0.22388843784325899, "grad_norm": 6.69981491060953, "learning_rate": 1.9650304328004752e-05, "loss": 0.9826, "step": 1758 }, { "epoch": 0.22401579190346563, "grad_norm": 5.43961722710271, "learning_rate": 1.9649763464656052e-05, "loss": 0.9696, "step": 1759 }, { "epoch": 0.22414314596367224, "grad_norm": 5.3903222925085155, "learning_rate": 1.9649222190817382e-05, "loss": 1.0604, "step": 1760 }, { "epoch": 0.22427050002387888, "grad_norm": 6.023288987205302, "learning_rate": 1.9648680506511763e-05, "loss": 0.9395, "step": 1761 }, { "epoch": 0.22439785408408552, "grad_norm": 6.729129031364457, "learning_rate": 1.9648138411762245e-05, "loss": 1.0276, "step": 1762 }, { "epoch": 0.22452520814429214, "grad_norm": 5.303370631142946, "learning_rate": 1.9647595906591884e-05, "loss": 0.9666, "step": 1763 }, { "epoch": 0.22465256220449878, "grad_norm": 4.003173046434199, "learning_rate": 1.964705299102376e-05, "loss": 0.9826, "step": 1764 }, { "epoch": 0.22477991626470542, "grad_norm": 4.822631673720893, "learning_rate": 1.9646509665080967e-05, "loss": 0.9451, "step": 1765 }, { "epoch": 0.22490727032491203, "grad_norm": 11.400257950926209, "learning_rate": 1.9645965928786615e-05, "loss": 0.9215, "step": 1766 }, { "epoch": 0.22503462438511868, "grad_norm": 12.665581171471478, "learning_rate": 1.9645421782163838e-05, "loss": 0.8822, "step": 1767 }, { "epoch": 0.22516197844532532, "grad_norm": 6.241716923360686, "learning_rate": 1.964487722523578e-05, "loss": 0.9472, "step": 1768 }, { "epoch": 0.22528933250553193, "grad_norm": 4.949288889351066, "learning_rate": 1.9644332258025604e-05, "loss": 0.8934, "step": 1769 }, { "epoch": 0.22541668656573857, "grad_norm": 4.900761541238467, "learning_rate": 1.96437868805565e-05, "loss": 1.0991, "step": 1770 }, { "epoch": 0.2255440406259452, "grad_norm": 6.7477896568198945, "learning_rate": 1.9643241092851664e-05, "loss": 0.9397, "step": 1771 }, { "epoch": 0.22567139468615183, "grad_norm": 4.4663847952956806, "learning_rate": 1.964269489493431e-05, "loss": 0.9228, "step": 1772 }, { "epoch": 0.22579874874635847, "grad_norm": 4.780268150692939, "learning_rate": 1.9642148286827674e-05, "loss": 1.0703, "step": 1773 }, { "epoch": 0.2259261028065651, "grad_norm": 3.7715650378964707, "learning_rate": 1.964160126855501e-05, "loss": 0.9436, "step": 1774 }, { "epoch": 0.22605345686677172, "grad_norm": 4.992705937599941, "learning_rate": 1.964105384013959e-05, "loss": 1.0159, "step": 1775 }, { "epoch": 0.22618081092697837, "grad_norm": 4.893049230375066, "learning_rate": 1.9640506001604693e-05, "loss": 0.9875, "step": 1776 }, { "epoch": 0.226308164987185, "grad_norm": 5.730724700849259, "learning_rate": 1.9639957752973632e-05, "loss": 0.9165, "step": 1777 }, { "epoch": 0.22643551904739162, "grad_norm": 5.960767591062262, "learning_rate": 1.963940909426972e-05, "loss": 1.0084, "step": 1778 }, { "epoch": 0.22656287310759826, "grad_norm": 5.127943728705067, "learning_rate": 1.9638860025516305e-05, "loss": 0.9076, "step": 1779 }, { "epoch": 0.2266902271678049, "grad_norm": 6.308130546075289, "learning_rate": 1.963831054673674e-05, "loss": 0.9977, "step": 1780 }, { "epoch": 0.22681758122801152, "grad_norm": 7.182944803137629, "learning_rate": 1.96377606579544e-05, "loss": 1.0313, "step": 1781 }, { "epoch": 0.22694493528821816, "grad_norm": 4.015812375932955, "learning_rate": 1.9637210359192673e-05, "loss": 0.9307, "step": 1782 }, { "epoch": 0.2270722893484248, "grad_norm": 4.165843704331801, "learning_rate": 1.9636659650474973e-05, "loss": 0.9104, "step": 1783 }, { "epoch": 0.2271996434086314, "grad_norm": 5.612109524539742, "learning_rate": 1.9636108531824725e-05, "loss": 1.0234, "step": 1784 }, { "epoch": 0.22732699746883805, "grad_norm": 6.010012537582954, "learning_rate": 1.9635557003265374e-05, "loss": 0.8986, "step": 1785 }, { "epoch": 0.2274543515290447, "grad_norm": 6.456680316007987, "learning_rate": 1.9635005064820377e-05, "loss": 0.9593, "step": 1786 }, { "epoch": 0.2275817055892513, "grad_norm": 7.727566645127223, "learning_rate": 1.9634452716513215e-05, "loss": 1.0032, "step": 1787 }, { "epoch": 0.22770905964945795, "grad_norm": 5.337484691160166, "learning_rate": 1.9633899958367384e-05, "loss": 0.9968, "step": 1788 }, { "epoch": 0.2278364137096646, "grad_norm": 6.736345679884345, "learning_rate": 1.9633346790406402e-05, "loss": 0.9217, "step": 1789 }, { "epoch": 0.2279637677698712, "grad_norm": 5.726731364218154, "learning_rate": 1.9632793212653795e-05, "loss": 0.9961, "step": 1790 }, { "epoch": 0.22809112183007785, "grad_norm": 4.315122829558817, "learning_rate": 1.9632239225133116e-05, "loss": 0.9914, "step": 1791 }, { "epoch": 0.2282184758902845, "grad_norm": 5.034835959132728, "learning_rate": 1.9631684827867926e-05, "loss": 1.0849, "step": 1792 }, { "epoch": 0.2283458299504911, "grad_norm": 13.966661267215047, "learning_rate": 1.9631130020881806e-05, "loss": 1.0368, "step": 1793 }, { "epoch": 0.22847318401069774, "grad_norm": 5.465619221025032, "learning_rate": 1.963057480419837e-05, "loss": 0.9769, "step": 1794 }, { "epoch": 0.22860053807090439, "grad_norm": 4.252969714040979, "learning_rate": 1.9630019177841224e-05, "loss": 1.0516, "step": 1795 }, { "epoch": 0.228727892131111, "grad_norm": 5.119678432678006, "learning_rate": 1.9629463141834008e-05, "loss": 0.9679, "step": 1796 }, { "epoch": 0.22885524619131764, "grad_norm": 6.089616361413847, "learning_rate": 1.9628906696200375e-05, "loss": 1.0558, "step": 1797 }, { "epoch": 0.22898260025152428, "grad_norm": 3.8992771016674848, "learning_rate": 1.9628349840963997e-05, "loss": 1.0169, "step": 1798 }, { "epoch": 0.2291099543117309, "grad_norm": 4.4152406993673186, "learning_rate": 1.9627792576148558e-05, "loss": 0.9963, "step": 1799 }, { "epoch": 0.22923730837193754, "grad_norm": 4.569440187853551, "learning_rate": 1.9627234901777768e-05, "loss": 1.0469, "step": 1800 }, { "epoch": 0.22936466243214418, "grad_norm": 4.675970929345057, "learning_rate": 1.9626676817875343e-05, "loss": 0.9363, "step": 1801 }, { "epoch": 0.2294920164923508, "grad_norm": 6.711610771396723, "learning_rate": 1.9626118324465035e-05, "loss": 0.9359, "step": 1802 }, { "epoch": 0.22961937055255743, "grad_norm": 5.1523400480938175, "learning_rate": 1.9625559421570587e-05, "loss": 0.9964, "step": 1803 }, { "epoch": 0.22974672461276405, "grad_norm": 4.124765562851582, "learning_rate": 1.9625000109215787e-05, "loss": 0.9782, "step": 1804 }, { "epoch": 0.2298740786729707, "grad_norm": 5.117340877651308, "learning_rate": 1.962444038742442e-05, "loss": 0.9557, "step": 1805 }, { "epoch": 0.23000143273317733, "grad_norm": 5.100902941188911, "learning_rate": 1.96238802562203e-05, "loss": 1.0152, "step": 1806 }, { "epoch": 0.23012878679338394, "grad_norm": 7.678748631254456, "learning_rate": 1.962331971562725e-05, "loss": 0.9388, "step": 1807 }, { "epoch": 0.23025614085359059, "grad_norm": 4.230898623055076, "learning_rate": 1.9622758765669117e-05, "loss": 1.0215, "step": 1808 }, { "epoch": 0.23038349491379723, "grad_norm": 5.386940755826735, "learning_rate": 1.9622197406369764e-05, "loss": 1.0412, "step": 1809 }, { "epoch": 0.23051084897400384, "grad_norm": 4.193235881463476, "learning_rate": 1.962163563775307e-05, "loss": 0.9781, "step": 1810 }, { "epoch": 0.23063820303421048, "grad_norm": 8.132097636089389, "learning_rate": 1.962107345984293e-05, "loss": 0.9979, "step": 1811 }, { "epoch": 0.23076555709441712, "grad_norm": 5.556965678651519, "learning_rate": 1.962051087266326e-05, "loss": 0.9099, "step": 1812 }, { "epoch": 0.23089291115462374, "grad_norm": 6.530311918651575, "learning_rate": 1.9619947876237996e-05, "loss": 0.9762, "step": 1813 }, { "epoch": 0.23102026521483038, "grad_norm": 5.109577139959268, "learning_rate": 1.9619384470591082e-05, "loss": 1.0315, "step": 1814 }, { "epoch": 0.23114761927503702, "grad_norm": 5.855383406024149, "learning_rate": 1.9618820655746488e-05, "loss": 0.9857, "step": 1815 }, { "epoch": 0.23127497333524363, "grad_norm": 4.980952029248604, "learning_rate": 1.961825643172819e-05, "loss": 1.132, "step": 1816 }, { "epoch": 0.23140232739545027, "grad_norm": 3.5464613080863323, "learning_rate": 1.96176917985602e-05, "loss": 0.855, "step": 1817 }, { "epoch": 0.23152968145565692, "grad_norm": 5.782907939000674, "learning_rate": 1.9617126756266533e-05, "loss": 1.062, "step": 1818 }, { "epoch": 0.23165703551586353, "grad_norm": 5.962578743788539, "learning_rate": 1.961656130487122e-05, "loss": 0.9667, "step": 1819 }, { "epoch": 0.23178438957607017, "grad_norm": 6.864858464548359, "learning_rate": 1.9615995444398326e-05, "loss": 1.0125, "step": 1820 }, { "epoch": 0.2319117436362768, "grad_norm": 5.593194518399508, "learning_rate": 1.9615429174871915e-05, "loss": 0.9814, "step": 1821 }, { "epoch": 0.23203909769648343, "grad_norm": 4.107295797592486, "learning_rate": 1.961486249631607e-05, "loss": 0.9323, "step": 1822 }, { "epoch": 0.23216645175669007, "grad_norm": 6.355699531477541, "learning_rate": 1.9614295408754908e-05, "loss": 0.9149, "step": 1823 }, { "epoch": 0.2322938058168967, "grad_norm": 6.169294604965106, "learning_rate": 1.961372791221254e-05, "loss": 1.0927, "step": 1824 }, { "epoch": 0.23242115987710332, "grad_norm": 5.650601933916082, "learning_rate": 1.961316000671312e-05, "loss": 0.9797, "step": 1825 }, { "epoch": 0.23254851393730996, "grad_norm": 5.9618915879234455, "learning_rate": 1.9612591692280798e-05, "loss": 0.9699, "step": 1826 }, { "epoch": 0.2326758679975166, "grad_norm": 5.440465710283945, "learning_rate": 1.961202296893975e-05, "loss": 1.0496, "step": 1827 }, { "epoch": 0.23280322205772322, "grad_norm": 4.8719408266955835, "learning_rate": 1.961145383671417e-05, "loss": 1.0317, "step": 1828 }, { "epoch": 0.23293057611792986, "grad_norm": 6.786863473513039, "learning_rate": 1.961088429562827e-05, "loss": 1.0223, "step": 1829 }, { "epoch": 0.2330579301781365, "grad_norm": 5.715195181543214, "learning_rate": 1.9610314345706275e-05, "loss": 1.091, "step": 1830 }, { "epoch": 0.23318528423834312, "grad_norm": 5.537056154671483, "learning_rate": 1.9609743986972427e-05, "loss": 1.0621, "step": 1831 }, { "epoch": 0.23331263829854976, "grad_norm": 5.431294901742344, "learning_rate": 1.9609173219450998e-05, "loss": 0.9379, "step": 1832 }, { "epoch": 0.2334399923587564, "grad_norm": 4.635917562217512, "learning_rate": 1.960860204316626e-05, "loss": 1.034, "step": 1833 }, { "epoch": 0.233567346418963, "grad_norm": 5.937687921059879, "learning_rate": 1.960803045814251e-05, "loss": 0.9789, "step": 1834 }, { "epoch": 0.23369470047916965, "grad_norm": 4.805508377539422, "learning_rate": 1.9607458464404065e-05, "loss": 0.9514, "step": 1835 }, { "epoch": 0.2338220545393763, "grad_norm": 5.356028944192273, "learning_rate": 1.9606886061975258e-05, "loss": 0.8895, "step": 1836 }, { "epoch": 0.2339494085995829, "grad_norm": 4.521239278589203, "learning_rate": 1.960631325088044e-05, "loss": 1.1037, "step": 1837 }, { "epoch": 0.23407676265978955, "grad_norm": 4.408012308259316, "learning_rate": 1.9605740031143972e-05, "loss": 0.9899, "step": 1838 }, { "epoch": 0.2342041167199962, "grad_norm": 6.913754351801961, "learning_rate": 1.9605166402790242e-05, "loss": 0.8878, "step": 1839 }, { "epoch": 0.2343314707802028, "grad_norm": 5.3691903587638405, "learning_rate": 1.960459236584365e-05, "loss": 0.9946, "step": 1840 }, { "epoch": 0.23445882484040945, "grad_norm": 4.323790435116697, "learning_rate": 1.9604017920328613e-05, "loss": 0.944, "step": 1841 }, { "epoch": 0.2345861789006161, "grad_norm": 5.879725077942632, "learning_rate": 1.9603443066269575e-05, "loss": 0.9666, "step": 1842 }, { "epoch": 0.2347135329608227, "grad_norm": 6.673614632504906, "learning_rate": 1.960286780369098e-05, "loss": 0.9177, "step": 1843 }, { "epoch": 0.23484088702102934, "grad_norm": 5.059346665131627, "learning_rate": 1.96022921326173e-05, "loss": 0.9912, "step": 1844 }, { "epoch": 0.23496824108123598, "grad_norm": 5.094966438939124, "learning_rate": 1.9601716053073034e-05, "loss": 0.9824, "step": 1845 }, { "epoch": 0.2350955951414426, "grad_norm": 4.978471566473348, "learning_rate": 1.9601139565082677e-05, "loss": 1.0614, "step": 1846 }, { "epoch": 0.23522294920164924, "grad_norm": 6.046248529200191, "learning_rate": 1.9600562668670756e-05, "loss": 1.0039, "step": 1847 }, { "epoch": 0.23535030326185585, "grad_norm": 4.59692065788742, "learning_rate": 1.959998536386181e-05, "loss": 0.997, "step": 1848 }, { "epoch": 0.2354776573220625, "grad_norm": 4.304352956742825, "learning_rate": 1.9599407650680397e-05, "loss": 1.0137, "step": 1849 }, { "epoch": 0.23560501138226914, "grad_norm": 5.236432118220893, "learning_rate": 1.9598829529151096e-05, "loss": 0.9861, "step": 1850 }, { "epoch": 0.23573236544247575, "grad_norm": 4.803484113145805, "learning_rate": 1.9598250999298495e-05, "loss": 0.9934, "step": 1851 }, { "epoch": 0.2358597195026824, "grad_norm": 4.582977535424973, "learning_rate": 1.9597672061147207e-05, "loss": 0.9304, "step": 1852 }, { "epoch": 0.23598707356288903, "grad_norm": 4.685459150452763, "learning_rate": 1.9597092714721858e-05, "loss": 0.9713, "step": 1853 }, { "epoch": 0.23611442762309565, "grad_norm": 4.903927781934567, "learning_rate": 1.9596512960047092e-05, "loss": 0.914, "step": 1854 }, { "epoch": 0.2362417816833023, "grad_norm": 4.935888099725095, "learning_rate": 1.9595932797147573e-05, "loss": 0.9847, "step": 1855 }, { "epoch": 0.23636913574350893, "grad_norm": 4.720197645025626, "learning_rate": 1.959535222604798e-05, "loss": 1.0138, "step": 1856 }, { "epoch": 0.23649648980371554, "grad_norm": 7.660251365899786, "learning_rate": 1.959477124677301e-05, "loss": 0.928, "step": 1857 }, { "epoch": 0.23662384386392218, "grad_norm": 6.028727107533511, "learning_rate": 1.9594189859347376e-05, "loss": 1.0456, "step": 1858 }, { "epoch": 0.23675119792412883, "grad_norm": 5.285424382864863, "learning_rate": 1.9593608063795808e-05, "loss": 1.0047, "step": 1859 }, { "epoch": 0.23687855198433544, "grad_norm": 4.413545885489942, "learning_rate": 1.9593025860143058e-05, "loss": 0.9942, "step": 1860 }, { "epoch": 0.23700590604454208, "grad_norm": 6.422270899540137, "learning_rate": 1.9592443248413896e-05, "loss": 0.9535, "step": 1861 }, { "epoch": 0.23713326010474872, "grad_norm": 4.863549110971738, "learning_rate": 1.9591860228633093e-05, "loss": 0.8156, "step": 1862 }, { "epoch": 0.23726061416495534, "grad_norm": 5.639933789453035, "learning_rate": 1.959127680082546e-05, "loss": 0.9758, "step": 1863 }, { "epoch": 0.23738796822516198, "grad_norm": 5.272758021907773, "learning_rate": 1.9590692965015818e-05, "loss": 0.9138, "step": 1864 }, { "epoch": 0.23751532228536862, "grad_norm": 5.844045050205219, "learning_rate": 1.9590108721228994e-05, "loss": 1.0062, "step": 1865 }, { "epoch": 0.23764267634557523, "grad_norm": 5.212810494302638, "learning_rate": 1.958952406948985e-05, "loss": 1.0041, "step": 1866 }, { "epoch": 0.23777003040578187, "grad_norm": 4.192451169042947, "learning_rate": 1.9588939009823246e-05, "loss": 1.0265, "step": 1867 }, { "epoch": 0.23789738446598852, "grad_norm": 7.087079512352341, "learning_rate": 1.9588353542254076e-05, "loss": 1.1435, "step": 1868 }, { "epoch": 0.23802473852619513, "grad_norm": 4.905834010280595, "learning_rate": 1.9587767666807245e-05, "loss": 1.0076, "step": 1869 }, { "epoch": 0.23815209258640177, "grad_norm": 5.806849861160403, "learning_rate": 1.9587181383507678e-05, "loss": 1.0408, "step": 1870 }, { "epoch": 0.2382794466466084, "grad_norm": 4.947380706135362, "learning_rate": 1.958659469238031e-05, "loss": 0.9148, "step": 1871 }, { "epoch": 0.23840680070681503, "grad_norm": 4.263103256718209, "learning_rate": 1.9586007593450098e-05, "loss": 0.8835, "step": 1872 }, { "epoch": 0.23853415476702167, "grad_norm": 4.573667680364965, "learning_rate": 1.958542008674202e-05, "loss": 1.1058, "step": 1873 }, { "epoch": 0.2386615088272283, "grad_norm": 4.741512948188774, "learning_rate": 1.9584832172281064e-05, "loss": 1.0385, "step": 1874 }, { "epoch": 0.23878886288743492, "grad_norm": 5.806566388694644, "learning_rate": 1.9584243850092246e-05, "loss": 0.9899, "step": 1875 }, { "epoch": 0.23891621694764156, "grad_norm": 7.7527082079816125, "learning_rate": 1.9583655120200586e-05, "loss": 1.1014, "step": 1876 }, { "epoch": 0.2390435710078482, "grad_norm": 5.3338465245025795, "learning_rate": 1.9583065982631128e-05, "loss": 1.0165, "step": 1877 }, { "epoch": 0.23917092506805482, "grad_norm": 4.761253878751353, "learning_rate": 1.9582476437408937e-05, "loss": 1.1034, "step": 1878 }, { "epoch": 0.23929827912826146, "grad_norm": 6.169684144379374, "learning_rate": 1.958188648455909e-05, "loss": 0.9707, "step": 1879 }, { "epoch": 0.2394256331884681, "grad_norm": 5.183876702549582, "learning_rate": 1.9581296124106682e-05, "loss": 0.9681, "step": 1880 }, { "epoch": 0.23955298724867471, "grad_norm": 3.8059335661646267, "learning_rate": 1.9580705356076826e-05, "loss": 0.8875, "step": 1881 }, { "epoch": 0.23968034130888136, "grad_norm": 4.421657597350017, "learning_rate": 1.9580114180494655e-05, "loss": 0.86, "step": 1882 }, { "epoch": 0.239807695369088, "grad_norm": 6.634587673780053, "learning_rate": 1.9579522597385315e-05, "loss": 1.0404, "step": 1883 }, { "epoch": 0.2399350494292946, "grad_norm": 4.316824251536347, "learning_rate": 1.9578930606773975e-05, "loss": 0.9788, "step": 1884 }, { "epoch": 0.24006240348950125, "grad_norm": 4.444924175014784, "learning_rate": 1.957833820868581e-05, "loss": 1.0692, "step": 1885 }, { "epoch": 0.2401897575497079, "grad_norm": 5.789228552780425, "learning_rate": 1.9577745403146026e-05, "loss": 1.043, "step": 1886 }, { "epoch": 0.2403171116099145, "grad_norm": 5.8358917438801985, "learning_rate": 1.9577152190179837e-05, "loss": 0.9655, "step": 1887 }, { "epoch": 0.24044446567012115, "grad_norm": 3.9383483457232096, "learning_rate": 1.9576558569812484e-05, "loss": 0.9138, "step": 1888 }, { "epoch": 0.2405718197303278, "grad_norm": 4.634923238602164, "learning_rate": 1.957596454206921e-05, "loss": 1.0484, "step": 1889 }, { "epoch": 0.2406991737905344, "grad_norm": 6.206385603515261, "learning_rate": 1.9575370106975288e-05, "loss": 0.9826, "step": 1890 }, { "epoch": 0.24082652785074105, "grad_norm": 4.170206829403069, "learning_rate": 1.9574775264556005e-05, "loss": 0.9325, "step": 1891 }, { "epoch": 0.2409538819109477, "grad_norm": 5.2007134608348675, "learning_rate": 1.9574180014836668e-05, "loss": 0.9554, "step": 1892 }, { "epoch": 0.2410812359711543, "grad_norm": 6.073407615154317, "learning_rate": 1.9573584357842592e-05, "loss": 1.0859, "step": 1893 }, { "epoch": 0.24120859003136094, "grad_norm": 5.51343611815531, "learning_rate": 1.9572988293599124e-05, "loss": 1.0487, "step": 1894 }, { "epoch": 0.24133594409156756, "grad_norm": 5.406242238298257, "learning_rate": 1.9572391822131606e-05, "loss": 1.0153, "step": 1895 }, { "epoch": 0.2414632981517742, "grad_norm": 3.6163605966280175, "learning_rate": 1.9571794943465424e-05, "loss": 0.9876, "step": 1896 }, { "epoch": 0.24159065221198084, "grad_norm": 7.0188141436885605, "learning_rate": 1.9571197657625967e-05, "loss": 0.9274, "step": 1897 }, { "epoch": 0.24171800627218745, "grad_norm": 5.502167881358521, "learning_rate": 1.957059996463864e-05, "loss": 1.1126, "step": 1898 }, { "epoch": 0.2418453603323941, "grad_norm": 5.245404730091922, "learning_rate": 1.9570001864528863e-05, "loss": 0.9728, "step": 1899 }, { "epoch": 0.24197271439260074, "grad_norm": 6.017843417046496, "learning_rate": 1.956940335732209e-05, "loss": 0.9613, "step": 1900 }, { "epoch": 0.24210006845280735, "grad_norm": 5.594564708859503, "learning_rate": 1.9568804443043774e-05, "loss": 1.0798, "step": 1901 }, { "epoch": 0.242227422513014, "grad_norm": 5.132520043156988, "learning_rate": 1.956820512171939e-05, "loss": 0.9427, "step": 1902 }, { "epoch": 0.24235477657322063, "grad_norm": 6.049832352766647, "learning_rate": 1.956760539337444e-05, "loss": 1.0261, "step": 1903 }, { "epoch": 0.24248213063342725, "grad_norm": 5.616791054605702, "learning_rate": 1.9567005258034423e-05, "loss": 0.9481, "step": 1904 }, { "epoch": 0.2426094846936339, "grad_norm": 6.4642734241444035, "learning_rate": 1.9566404715724884e-05, "loss": 0.9793, "step": 1905 }, { "epoch": 0.24273683875384053, "grad_norm": 6.313155640649643, "learning_rate": 1.9565803766471355e-05, "loss": 1.035, "step": 1906 }, { "epoch": 0.24286419281404714, "grad_norm": 7.304047484077455, "learning_rate": 1.9565202410299415e-05, "loss": 1.0329, "step": 1907 }, { "epoch": 0.24299154687425378, "grad_norm": 6.860041766248072, "learning_rate": 1.9564600647234628e-05, "loss": 0.9587, "step": 1908 }, { "epoch": 0.24311890093446042, "grad_norm": 6.482658729007516, "learning_rate": 1.9563998477302604e-05, "loss": 0.9059, "step": 1909 }, { "epoch": 0.24324625499466704, "grad_norm": 6.916423778022661, "learning_rate": 1.9563395900528956e-05, "loss": 0.8524, "step": 1910 }, { "epoch": 0.24337360905487368, "grad_norm": 4.723823699658888, "learning_rate": 1.956279291693931e-05, "loss": 1.0333, "step": 1911 }, { "epoch": 0.24350096311508032, "grad_norm": 8.42233790326866, "learning_rate": 1.9562189526559333e-05, "loss": 0.9848, "step": 1912 }, { "epoch": 0.24362831717528693, "grad_norm": 4.949374635284291, "learning_rate": 1.9561585729414675e-05, "loss": 0.9388, "step": 1913 }, { "epoch": 0.24375567123549358, "grad_norm": 5.121845297875669, "learning_rate": 1.9560981525531027e-05, "loss": 1.0829, "step": 1914 }, { "epoch": 0.24388302529570022, "grad_norm": 4.631129559020309, "learning_rate": 1.9560376914934098e-05, "loss": 0.8969, "step": 1915 }, { "epoch": 0.24401037935590683, "grad_norm": 5.316361403994358, "learning_rate": 1.9559771897649592e-05, "loss": 1.0067, "step": 1916 }, { "epoch": 0.24413773341611347, "grad_norm": 6.211563635400579, "learning_rate": 1.9559166473703265e-05, "loss": 0.8899, "step": 1917 }, { "epoch": 0.24426508747632011, "grad_norm": 6.407445284356433, "learning_rate": 1.9558560643120855e-05, "loss": 0.951, "step": 1918 }, { "epoch": 0.24439244153652673, "grad_norm": 5.549064784254947, "learning_rate": 1.9557954405928142e-05, "loss": 0.967, "step": 1919 }, { "epoch": 0.24451979559673337, "grad_norm": 5.021470320988695, "learning_rate": 1.955734776215091e-05, "loss": 1.0486, "step": 1920 }, { "epoch": 0.24464714965694, "grad_norm": 6.492243026330688, "learning_rate": 1.955674071181497e-05, "loss": 0.9918, "step": 1921 }, { "epoch": 0.24477450371714662, "grad_norm": 6.736913640652619, "learning_rate": 1.955613325494614e-05, "loss": 0.9308, "step": 1922 }, { "epoch": 0.24490185777735327, "grad_norm": 5.582164850898567, "learning_rate": 1.955552539157026e-05, "loss": 0.9773, "step": 1923 }, { "epoch": 0.2450292118375599, "grad_norm": 4.496928968496731, "learning_rate": 1.9554917121713198e-05, "loss": 1.0006, "step": 1924 }, { "epoch": 0.24515656589776652, "grad_norm": 4.202497811302691, "learning_rate": 1.955430844540082e-05, "loss": 0.9419, "step": 1925 }, { "epoch": 0.24528391995797316, "grad_norm": 7.195137013501995, "learning_rate": 1.9553699362659016e-05, "loss": 0.9848, "step": 1926 }, { "epoch": 0.2454112740181798, "grad_norm": 5.762608511807093, "learning_rate": 1.9553089873513702e-05, "loss": 0.9671, "step": 1927 }, { "epoch": 0.24553862807838642, "grad_norm": 6.201908174791442, "learning_rate": 1.9552479977990802e-05, "loss": 0.9007, "step": 1928 }, { "epoch": 0.24566598213859306, "grad_norm": 6.22489587168181, "learning_rate": 1.955186967611626e-05, "loss": 0.9826, "step": 1929 }, { "epoch": 0.2457933361987997, "grad_norm": 7.069124878674373, "learning_rate": 1.955125896791604e-05, "loss": 1.1846, "step": 1930 }, { "epoch": 0.2459206902590063, "grad_norm": 5.212998927546745, "learning_rate": 1.955064785341612e-05, "loss": 0.8786, "step": 1931 }, { "epoch": 0.24604804431921296, "grad_norm": 4.894650103929042, "learning_rate": 1.9550036332642496e-05, "loss": 0.9747, "step": 1932 }, { "epoch": 0.2461753983794196, "grad_norm": 6.111738523392531, "learning_rate": 1.954942440562118e-05, "loss": 0.8886, "step": 1933 }, { "epoch": 0.2463027524396262, "grad_norm": 5.085283416081644, "learning_rate": 1.9548812072378208e-05, "loss": 1.0079, "step": 1934 }, { "epoch": 0.24643010649983285, "grad_norm": 7.121943838967712, "learning_rate": 1.954819933293962e-05, "loss": 1.0411, "step": 1935 }, { "epoch": 0.2465574605600395, "grad_norm": 4.765921361494564, "learning_rate": 1.954758618733148e-05, "loss": 0.978, "step": 1936 }, { "epoch": 0.2466848146202461, "grad_norm": 5.157511608796732, "learning_rate": 1.954697263557988e-05, "loss": 0.9591, "step": 1937 }, { "epoch": 0.24681216868045275, "grad_norm": 4.500125035562163, "learning_rate": 1.9546358677710917e-05, "loss": 0.9751, "step": 1938 }, { "epoch": 0.24693952274065936, "grad_norm": 5.388541794880216, "learning_rate": 1.9545744313750706e-05, "loss": 0.9856, "step": 1939 }, { "epoch": 0.247066876800866, "grad_norm": 5.033008339265417, "learning_rate": 1.954512954372538e-05, "loss": 0.9379, "step": 1940 }, { "epoch": 0.24719423086107264, "grad_norm": 5.048770702042888, "learning_rate": 1.954451436766109e-05, "loss": 1.0365, "step": 1941 }, { "epoch": 0.24732158492127926, "grad_norm": 5.618927623754484, "learning_rate": 1.954389878558401e-05, "loss": 0.9645, "step": 1942 }, { "epoch": 0.2474489389814859, "grad_norm": 6.407757539845989, "learning_rate": 1.954328279752032e-05, "loss": 1.1237, "step": 1943 }, { "epoch": 0.24757629304169254, "grad_norm": 6.006274121858267, "learning_rate": 1.9542666403496232e-05, "loss": 1.0167, "step": 1944 }, { "epoch": 0.24770364710189915, "grad_norm": 5.895080985693145, "learning_rate": 1.954204960353796e-05, "loss": 1.0035, "step": 1945 }, { "epoch": 0.2478310011621058, "grad_norm": 4.438704206380312, "learning_rate": 1.954143239767174e-05, "loss": 0.972, "step": 1946 }, { "epoch": 0.24795835522231244, "grad_norm": 4.267205808272698, "learning_rate": 1.9540814785923832e-05, "loss": 1.0307, "step": 1947 }, { "epoch": 0.24808570928251905, "grad_norm": 5.4795965260572705, "learning_rate": 1.954019676832051e-05, "loss": 0.9458, "step": 1948 }, { "epoch": 0.2482130633427257, "grad_norm": 5.286442839000025, "learning_rate": 1.9539578344888057e-05, "loss": 0.9832, "step": 1949 }, { "epoch": 0.24834041740293233, "grad_norm": 5.362946768691932, "learning_rate": 1.9538959515652786e-05, "loss": 1.0164, "step": 1950 }, { "epoch": 0.24846777146313895, "grad_norm": 5.207804457197322, "learning_rate": 1.9538340280641018e-05, "loss": 0.9356, "step": 1951 }, { "epoch": 0.2485951255233456, "grad_norm": 6.356636376204565, "learning_rate": 1.9537720639879096e-05, "loss": 1.0118, "step": 1952 }, { "epoch": 0.24872247958355223, "grad_norm": 4.992868296448806, "learning_rate": 1.953710059339338e-05, "loss": 1.0178, "step": 1953 }, { "epoch": 0.24884983364375884, "grad_norm": 5.213643939691587, "learning_rate": 1.9536480141210242e-05, "loss": 0.9866, "step": 1954 }, { "epoch": 0.24897718770396549, "grad_norm": 4.929728899266538, "learning_rate": 1.953585928335608e-05, "loss": 1.0418, "step": 1955 }, { "epoch": 0.24910454176417213, "grad_norm": 6.206567841810802, "learning_rate": 1.95352380198573e-05, "loss": 1.0212, "step": 1956 }, { "epoch": 0.24923189582437874, "grad_norm": 4.618441080749805, "learning_rate": 1.9534616350740336e-05, "loss": 1.0703, "step": 1957 }, { "epoch": 0.24935924988458538, "grad_norm": 9.527918699338851, "learning_rate": 1.953399427603163e-05, "loss": 1.0498, "step": 1958 }, { "epoch": 0.24948660394479202, "grad_norm": 4.475756507734566, "learning_rate": 1.953337179575764e-05, "loss": 0.9581, "step": 1959 }, { "epoch": 0.24961395800499864, "grad_norm": 5.777107362895541, "learning_rate": 1.953274890994485e-05, "loss": 1.068, "step": 1960 }, { "epoch": 0.24974131206520528, "grad_norm": 5.472723688857767, "learning_rate": 1.953212561861976e-05, "loss": 1.0794, "step": 1961 }, { "epoch": 0.24986866612541192, "grad_norm": 4.63711363630185, "learning_rate": 1.9531501921808877e-05, "loss": 0.9387, "step": 1962 }, { "epoch": 0.24999602018561853, "grad_norm": 5.787593845735321, "learning_rate": 1.9530877819538736e-05, "loss": 0.9188, "step": 1963 }, { "epoch": 0.25012337424582515, "grad_norm": 5.269182491991631, "learning_rate": 1.9530253311835884e-05, "loss": 0.9789, "step": 1964 }, { "epoch": 0.2502507283060318, "grad_norm": 4.672931462385088, "learning_rate": 1.9529628398726892e-05, "loss": 1.0085, "step": 1965 }, { "epoch": 0.25037808236623843, "grad_norm": 5.314662358638423, "learning_rate": 1.9529003080238337e-05, "loss": 0.9308, "step": 1966 }, { "epoch": 0.25050543642644507, "grad_norm": 4.658522083550714, "learning_rate": 1.9528377356396825e-05, "loss": 0.9868, "step": 1967 }, { "epoch": 0.2506327904866517, "grad_norm": 4.732259850713794, "learning_rate": 1.9527751227228964e-05, "loss": 0.9688, "step": 1968 }, { "epoch": 0.25076014454685835, "grad_norm": 4.549413437815234, "learning_rate": 1.95271246927614e-05, "loss": 0.9979, "step": 1969 }, { "epoch": 0.25088749860706494, "grad_norm": 5.563897118149959, "learning_rate": 1.9526497753020776e-05, "loss": 0.9984, "step": 1970 }, { "epoch": 0.2510148526672716, "grad_norm": 4.3159957207206885, "learning_rate": 1.952587040803377e-05, "loss": 1.0247, "step": 1971 }, { "epoch": 0.2511422067274782, "grad_norm": 3.989694432769271, "learning_rate": 1.9525242657827063e-05, "loss": 0.981, "step": 1972 }, { "epoch": 0.25126956078768486, "grad_norm": 4.9109895980746545, "learning_rate": 1.9524614502427358e-05, "loss": 1.0609, "step": 1973 }, { "epoch": 0.2513969148478915, "grad_norm": 6.58293704357173, "learning_rate": 1.9523985941861376e-05, "loss": 0.9695, "step": 1974 }, { "epoch": 0.25152426890809815, "grad_norm": 5.057107950233266, "learning_rate": 1.952335697615586e-05, "loss": 0.9499, "step": 1975 }, { "epoch": 0.25165162296830473, "grad_norm": 5.667578717304745, "learning_rate": 1.952272760533756e-05, "loss": 1.0221, "step": 1976 }, { "epoch": 0.2517789770285114, "grad_norm": 7.079981410977373, "learning_rate": 1.9522097829433252e-05, "loss": 0.9485, "step": 1977 }, { "epoch": 0.251906331088718, "grad_norm": 5.338151642733225, "learning_rate": 1.9521467648469728e-05, "loss": 0.9187, "step": 1978 }, { "epoch": 0.25203368514892466, "grad_norm": 5.3896529496096806, "learning_rate": 1.9520837062473788e-05, "loss": 0.9913, "step": 1979 }, { "epoch": 0.2521610392091313, "grad_norm": 6.123413503278127, "learning_rate": 1.9520206071472264e-05, "loss": 0.9032, "step": 1980 }, { "epoch": 0.25228839326933794, "grad_norm": 5.591531153636707, "learning_rate": 1.9519574675491995e-05, "loss": 0.9196, "step": 1981 }, { "epoch": 0.2524157473295445, "grad_norm": 7.5871264295038205, "learning_rate": 1.951894287455984e-05, "loss": 0.9486, "step": 1982 }, { "epoch": 0.25254310138975117, "grad_norm": 5.71256743209739, "learning_rate": 1.951831066870267e-05, "loss": 0.9122, "step": 1983 }, { "epoch": 0.2526704554499578, "grad_norm": 5.781656446861163, "learning_rate": 1.9517678057947385e-05, "loss": 0.8943, "step": 1984 }, { "epoch": 0.25279780951016445, "grad_norm": 5.738200478605293, "learning_rate": 1.9517045042320893e-05, "loss": 0.8755, "step": 1985 }, { "epoch": 0.2529251635703711, "grad_norm": 6.186663946830765, "learning_rate": 1.951641162185012e-05, "loss": 1.0455, "step": 1986 }, { "epoch": 0.25305251763057773, "grad_norm": 4.660019136315867, "learning_rate": 1.9515777796562016e-05, "loss": 0.9795, "step": 1987 }, { "epoch": 0.2531798716907843, "grad_norm": 4.994360222611413, "learning_rate": 1.951514356648354e-05, "loss": 0.9546, "step": 1988 }, { "epoch": 0.25330722575099096, "grad_norm": 5.91407604543528, "learning_rate": 1.951450893164167e-05, "loss": 0.99, "step": 1989 }, { "epoch": 0.2534345798111976, "grad_norm": 5.697067429917377, "learning_rate": 1.9513873892063403e-05, "loss": 0.9586, "step": 1990 }, { "epoch": 0.25356193387140424, "grad_norm": 5.019744037318567, "learning_rate": 1.9513238447775757e-05, "loss": 0.9406, "step": 1991 }, { "epoch": 0.2536892879316109, "grad_norm": 5.232808621741993, "learning_rate": 1.951260259880576e-05, "loss": 0.9392, "step": 1992 }, { "epoch": 0.2538166419918175, "grad_norm": 6.66558555738008, "learning_rate": 1.9511966345180457e-05, "loss": 0.9115, "step": 1993 }, { "epoch": 0.2539439960520241, "grad_norm": 4.94620991499485, "learning_rate": 1.9511329686926922e-05, "loss": 1.0263, "step": 1994 }, { "epoch": 0.25407135011223075, "grad_norm": 5.9253145158562175, "learning_rate": 1.951069262407223e-05, "loss": 1.0708, "step": 1995 }, { "epoch": 0.2541987041724374, "grad_norm": 4.381045061615913, "learning_rate": 1.9510055156643485e-05, "loss": 0.8709, "step": 1996 }, { "epoch": 0.25432605823264404, "grad_norm": 7.938926268266305, "learning_rate": 1.95094172846678e-05, "loss": 1.0671, "step": 1997 }, { "epoch": 0.2544534122928507, "grad_norm": 7.953716143689362, "learning_rate": 1.9508779008172314e-05, "loss": 0.9639, "step": 1998 }, { "epoch": 0.2545807663530573, "grad_norm": 7.371930016817276, "learning_rate": 1.950814032718418e-05, "loss": 0.9721, "step": 1999 }, { "epoch": 0.2547081204132639, "grad_norm": 5.248665594633744, "learning_rate": 1.9507501241730557e-05, "loss": 0.9776, "step": 2000 }, { "epoch": 0.25483547447347055, "grad_norm": 5.744965712907299, "learning_rate": 1.950686175183864e-05, "loss": 1.0409, "step": 2001 }, { "epoch": 0.2549628285336772, "grad_norm": 5.022489134879771, "learning_rate": 1.950622185753563e-05, "loss": 0.9637, "step": 2002 }, { "epoch": 0.25509018259388383, "grad_norm": 4.256987269512674, "learning_rate": 1.9505581558848747e-05, "loss": 0.9848, "step": 2003 }, { "epoch": 0.25521753665409047, "grad_norm": 5.743151691477282, "learning_rate": 1.9504940855805227e-05, "loss": 0.9223, "step": 2004 }, { "epoch": 0.2553448907142971, "grad_norm": 5.6296157548595875, "learning_rate": 1.9504299748432328e-05, "loss": 0.9437, "step": 2005 }, { "epoch": 0.2554722447745037, "grad_norm": 5.183459075351116, "learning_rate": 1.950365823675732e-05, "loss": 0.9489, "step": 2006 }, { "epoch": 0.25559959883471034, "grad_norm": 4.723628673666008, "learning_rate": 1.9503016320807495e-05, "loss": 1.0171, "step": 2007 }, { "epoch": 0.255726952894917, "grad_norm": 4.789530844758245, "learning_rate": 1.9502374000610152e-05, "loss": 1.0385, "step": 2008 }, { "epoch": 0.2558543069551236, "grad_norm": 6.038662955229844, "learning_rate": 1.950173127619262e-05, "loss": 0.9552, "step": 2009 }, { "epoch": 0.25598166101533026, "grad_norm": 5.222860031317926, "learning_rate": 1.9501088147582243e-05, "loss": 0.8481, "step": 2010 }, { "epoch": 0.25610901507553685, "grad_norm": 6.446247564747698, "learning_rate": 1.950044461480637e-05, "loss": 0.9903, "step": 2011 }, { "epoch": 0.2562363691357435, "grad_norm": 4.804986414101216, "learning_rate": 1.9499800677892386e-05, "loss": 0.9854, "step": 2012 }, { "epoch": 0.25636372319595013, "grad_norm": 5.666979650193999, "learning_rate": 1.9499156336867677e-05, "loss": 0.9222, "step": 2013 }, { "epoch": 0.2564910772561568, "grad_norm": 6.352340819471099, "learning_rate": 1.9498511591759653e-05, "loss": 0.9644, "step": 2014 }, { "epoch": 0.2566184313163634, "grad_norm": 4.626991049601858, "learning_rate": 1.9497866442595744e-05, "loss": 0.9152, "step": 2015 }, { "epoch": 0.25674578537657006, "grad_norm": 4.667148610614885, "learning_rate": 1.949722088940339e-05, "loss": 0.9716, "step": 2016 }, { "epoch": 0.25687313943677664, "grad_norm": 4.562568559523414, "learning_rate": 1.9496574932210056e-05, "loss": 0.9077, "step": 2017 }, { "epoch": 0.2570004934969833, "grad_norm": 4.833598469444722, "learning_rate": 1.9495928571043213e-05, "loss": 0.9008, "step": 2018 }, { "epoch": 0.2571278475571899, "grad_norm": 4.208463660510322, "learning_rate": 1.949528180593037e-05, "loss": 0.9618, "step": 2019 }, { "epoch": 0.25725520161739657, "grad_norm": 4.80389665747538, "learning_rate": 1.9494634636899023e-05, "loss": 1.0306, "step": 2020 }, { "epoch": 0.2573825556776032, "grad_norm": 5.772586413926603, "learning_rate": 1.9493987063976715e-05, "loss": 0.9685, "step": 2021 }, { "epoch": 0.25750990973780985, "grad_norm": 4.485702265062965, "learning_rate": 1.949333908719099e-05, "loss": 1.0295, "step": 2022 }, { "epoch": 0.25763726379801644, "grad_norm": 4.000994320309323, "learning_rate": 1.9492690706569406e-05, "loss": 1.0433, "step": 2023 }, { "epoch": 0.2577646178582231, "grad_norm": 6.682145872084098, "learning_rate": 1.949204192213955e-05, "loss": 0.9644, "step": 2024 }, { "epoch": 0.2578919719184297, "grad_norm": 4.093824546076993, "learning_rate": 1.949139273392902e-05, "loss": 0.9027, "step": 2025 }, { "epoch": 0.25801932597863636, "grad_norm": 5.737812228215539, "learning_rate": 1.949074314196543e-05, "loss": 1.076, "step": 2026 }, { "epoch": 0.258146680038843, "grad_norm": 5.741275637100357, "learning_rate": 1.9490093146276413e-05, "loss": 1.1259, "step": 2027 }, { "epoch": 0.25827403409904964, "grad_norm": 4.069759748711396, "learning_rate": 1.9489442746889623e-05, "loss": 0.9098, "step": 2028 }, { "epoch": 0.25840138815925623, "grad_norm": 6.913165604344778, "learning_rate": 1.948879194383272e-05, "loss": 1.0538, "step": 2029 }, { "epoch": 0.25852874221946287, "grad_norm": 4.0628089200454385, "learning_rate": 1.9488140737133397e-05, "loss": 0.9584, "step": 2030 }, { "epoch": 0.2586560962796695, "grad_norm": 4.649874630130122, "learning_rate": 1.948748912681935e-05, "loss": 0.9008, "step": 2031 }, { "epoch": 0.25878345033987615, "grad_norm": 6.504962493999473, "learning_rate": 1.94868371129183e-05, "loss": 0.9865, "step": 2032 }, { "epoch": 0.2589108044000828, "grad_norm": 4.524654876867488, "learning_rate": 1.948618469545798e-05, "loss": 1.0607, "step": 2033 }, { "epoch": 0.25903815846028944, "grad_norm": 7.1720050475852215, "learning_rate": 1.9485531874466145e-05, "loss": 1.0816, "step": 2034 }, { "epoch": 0.259165512520496, "grad_norm": 5.875656091295152, "learning_rate": 1.9484878649970563e-05, "loss": 0.9564, "step": 2035 }, { "epoch": 0.25929286658070266, "grad_norm": 4.292422453369248, "learning_rate": 1.9484225021999032e-05, "loss": 0.9985, "step": 2036 }, { "epoch": 0.2594202206409093, "grad_norm": 5.771045814026921, "learning_rate": 1.9483570990579342e-05, "loss": 1.0955, "step": 2037 }, { "epoch": 0.25954757470111595, "grad_norm": 5.2503463994143065, "learning_rate": 1.948291655573932e-05, "loss": 1.0243, "step": 2038 }, { "epoch": 0.2596749287613226, "grad_norm": 5.007229156558939, "learning_rate": 1.948226171750681e-05, "loss": 0.9475, "step": 2039 }, { "epoch": 0.25980228282152923, "grad_norm": 5.632431546475503, "learning_rate": 1.948160647590966e-05, "loss": 1.0223, "step": 2040 }, { "epoch": 0.2599296368817358, "grad_norm": 9.10566900385757, "learning_rate": 1.9480950830975752e-05, "loss": 1.0305, "step": 2041 }, { "epoch": 0.26005699094194246, "grad_norm": 5.6965521573960505, "learning_rate": 1.9480294782732966e-05, "loss": 1.0181, "step": 2042 }, { "epoch": 0.2601843450021491, "grad_norm": 4.369382248780932, "learning_rate": 1.9479638331209218e-05, "loss": 1.0361, "step": 2043 }, { "epoch": 0.26031169906235574, "grad_norm": 3.7265955125797974, "learning_rate": 1.947898147643243e-05, "loss": 0.9158, "step": 2044 }, { "epoch": 0.2604390531225624, "grad_norm": 4.451581935230915, "learning_rate": 1.9478324218430544e-05, "loss": 0.9377, "step": 2045 }, { "epoch": 0.260566407182769, "grad_norm": 5.931155378674257, "learning_rate": 1.947766655723152e-05, "loss": 0.9793, "step": 2046 }, { "epoch": 0.2606937612429756, "grad_norm": 4.903956563635871, "learning_rate": 1.947700849286333e-05, "loss": 0.9661, "step": 2047 }, { "epoch": 0.26082111530318225, "grad_norm": 6.194315537887149, "learning_rate": 1.9476350025353975e-05, "loss": 0.8985, "step": 2048 }, { "epoch": 0.2609484693633889, "grad_norm": 4.716346583384696, "learning_rate": 1.9475691154731458e-05, "loss": 1.0179, "step": 2049 }, { "epoch": 0.26107582342359553, "grad_norm": 5.822480523751272, "learning_rate": 1.9475031881023807e-05, "loss": 0.9497, "step": 2050 }, { "epoch": 0.2612031774838022, "grad_norm": 4.490659769092194, "learning_rate": 1.9474372204259072e-05, "loss": 0.9742, "step": 2051 }, { "epoch": 0.2613305315440088, "grad_norm": 4.625444339336894, "learning_rate": 1.947371212446531e-05, "loss": 0.9551, "step": 2052 }, { "epoch": 0.2614578856042154, "grad_norm": 5.441190147079479, "learning_rate": 1.9473051641670606e-05, "loss": 1.0847, "step": 2053 }, { "epoch": 0.26158523966442204, "grad_norm": 5.68205621116733, "learning_rate": 1.947239075590305e-05, "loss": 0.8826, "step": 2054 }, { "epoch": 0.2617125937246287, "grad_norm": 5.107477891662221, "learning_rate": 1.9471729467190757e-05, "loss": 0.9449, "step": 2055 }, { "epoch": 0.2618399477848353, "grad_norm": 5.246364606396695, "learning_rate": 1.947106777556186e-05, "loss": 0.9755, "step": 2056 }, { "epoch": 0.26196730184504197, "grad_norm": 5.182116553129121, "learning_rate": 1.94704056810445e-05, "loss": 0.9489, "step": 2057 }, { "epoch": 0.26209465590524855, "grad_norm": 4.90976273470358, "learning_rate": 1.9469743183666852e-05, "loss": 1.0466, "step": 2058 }, { "epoch": 0.2622220099654552, "grad_norm": 8.779726378189636, "learning_rate": 1.9469080283457087e-05, "loss": 0.9995, "step": 2059 }, { "epoch": 0.26234936402566184, "grad_norm": 3.972650848655067, "learning_rate": 1.9468416980443413e-05, "loss": 0.9669, "step": 2060 }, { "epoch": 0.2624767180858685, "grad_norm": 6.234750683777755, "learning_rate": 1.946775327465404e-05, "loss": 0.8686, "step": 2061 }, { "epoch": 0.2626040721460751, "grad_norm": 5.671675917649363, "learning_rate": 1.9467089166117207e-05, "loss": 0.9811, "step": 2062 }, { "epoch": 0.26273142620628176, "grad_norm": 5.635781869201618, "learning_rate": 1.9466424654861158e-05, "loss": 0.9497, "step": 2063 }, { "epoch": 0.26285878026648835, "grad_norm": 5.679230431553005, "learning_rate": 1.9465759740914166e-05, "loss": 0.9809, "step": 2064 }, { "epoch": 0.262986134326695, "grad_norm": 5.675753021695117, "learning_rate": 1.946509442430451e-05, "loss": 0.9852, "step": 2065 }, { "epoch": 0.26311348838690163, "grad_norm": 4.959731067934566, "learning_rate": 1.9464428705060497e-05, "loss": 1.0174, "step": 2066 }, { "epoch": 0.26324084244710827, "grad_norm": 4.304803587080349, "learning_rate": 1.9463762583210446e-05, "loss": 1.0509, "step": 2067 }, { "epoch": 0.2633681965073149, "grad_norm": 7.089286038239846, "learning_rate": 1.946309605878269e-05, "loss": 0.8987, "step": 2068 }, { "epoch": 0.26349555056752155, "grad_norm": 5.526170715301709, "learning_rate": 1.9462429131805582e-05, "loss": 0.9999, "step": 2069 }, { "epoch": 0.26362290462772814, "grad_norm": 4.33938950600082, "learning_rate": 1.9461761802307494e-05, "loss": 0.9302, "step": 2070 }, { "epoch": 0.2637502586879348, "grad_norm": 5.172476281194202, "learning_rate": 1.9461094070316816e-05, "loss": 0.8788, "step": 2071 }, { "epoch": 0.2638776127481414, "grad_norm": 7.461355467268366, "learning_rate": 1.946042593586195e-05, "loss": 0.9263, "step": 2072 }, { "epoch": 0.26400496680834806, "grad_norm": 4.231338369328546, "learning_rate": 1.9459757398971314e-05, "loss": 0.9991, "step": 2073 }, { "epoch": 0.2641323208685547, "grad_norm": 8.699708552247195, "learning_rate": 1.945908845967335e-05, "loss": 1.0096, "step": 2074 }, { "epoch": 0.26425967492876135, "grad_norm": 4.364700619283261, "learning_rate": 1.9458419117996516e-05, "loss": 0.8847, "step": 2075 }, { "epoch": 0.26438702898896793, "grad_norm": 7.3584693950833895, "learning_rate": 1.9457749373969284e-05, "loss": 0.9253, "step": 2076 }, { "epoch": 0.2645143830491746, "grad_norm": 4.702713755120298, "learning_rate": 1.945707922762014e-05, "loss": 0.9356, "step": 2077 }, { "epoch": 0.2646417371093812, "grad_norm": 5.385745286292173, "learning_rate": 1.94564086789776e-05, "loss": 0.9187, "step": 2078 }, { "epoch": 0.26476909116958786, "grad_norm": 5.6318644705959615, "learning_rate": 1.9455737728070177e-05, "loss": 1.0098, "step": 2079 }, { "epoch": 0.2648964452297945, "grad_norm": 5.453807536697538, "learning_rate": 1.9455066374926425e-05, "loss": 0.8969, "step": 2080 }, { "epoch": 0.26502379929000114, "grad_norm": 7.346760623744158, "learning_rate": 1.945439461957489e-05, "loss": 1.0585, "step": 2081 }, { "epoch": 0.2651511533502077, "grad_norm": 4.826254670665253, "learning_rate": 1.9453722462044157e-05, "loss": 0.9824, "step": 2082 }, { "epoch": 0.26527850741041437, "grad_norm": 5.021486084268962, "learning_rate": 1.9453049902362812e-05, "loss": 1.0138, "step": 2083 }, { "epoch": 0.265405861470621, "grad_norm": 6.190231799097874, "learning_rate": 1.945237694055947e-05, "loss": 0.9568, "step": 2084 }, { "epoch": 0.26553321553082765, "grad_norm": 5.8248848179247155, "learning_rate": 1.9451703576662758e-05, "loss": 0.9281, "step": 2085 }, { "epoch": 0.2656605695910343, "grad_norm": 5.697414601561821, "learning_rate": 1.945102981070132e-05, "loss": 1.0025, "step": 2086 }, { "epoch": 0.26578792365124093, "grad_norm": 5.596524024919309, "learning_rate": 1.9450355642703812e-05, "loss": 0.964, "step": 2087 }, { "epoch": 0.2659152777114475, "grad_norm": 6.7048323442063795, "learning_rate": 1.944968107269892e-05, "loss": 0.92, "step": 2088 }, { "epoch": 0.26604263177165416, "grad_norm": 4.041925769379109, "learning_rate": 1.9449006100715334e-05, "loss": 1.06, "step": 2089 }, { "epoch": 0.2661699858318608, "grad_norm": 4.404246855661088, "learning_rate": 1.9448330726781767e-05, "loss": 0.9563, "step": 2090 }, { "epoch": 0.26629733989206744, "grad_norm": 5.6408880706053415, "learning_rate": 1.9447654950926953e-05, "loss": 0.8927, "step": 2091 }, { "epoch": 0.2664246939522741, "grad_norm": 4.602432039288525, "learning_rate": 1.944697877317963e-05, "loss": 0.9112, "step": 2092 }, { "epoch": 0.2665520480124807, "grad_norm": 6.002397693936305, "learning_rate": 1.9446302193568573e-05, "loss": 0.9994, "step": 2093 }, { "epoch": 0.2666794020726873, "grad_norm": 5.333935564530883, "learning_rate": 1.9445625212122557e-05, "loss": 0.9706, "step": 2094 }, { "epoch": 0.26680675613289395, "grad_norm": 4.394012339154899, "learning_rate": 1.9444947828870378e-05, "loss": 0.9955, "step": 2095 }, { "epoch": 0.2669341101931006, "grad_norm": 4.777318597261054, "learning_rate": 1.9444270043840854e-05, "loss": 0.8549, "step": 2096 }, { "epoch": 0.26706146425330723, "grad_norm": 5.132405954457194, "learning_rate": 1.9443591857062817e-05, "loss": 0.9659, "step": 2097 }, { "epoch": 0.2671888183135139, "grad_norm": 20.38579324937467, "learning_rate": 1.9442913268565117e-05, "loss": 0.921, "step": 2098 }, { "epoch": 0.26731617237372046, "grad_norm": 7.045345030350159, "learning_rate": 1.9442234278376614e-05, "loss": 0.9305, "step": 2099 }, { "epoch": 0.2674435264339271, "grad_norm": 5.591244269542286, "learning_rate": 1.9441554886526205e-05, "loss": 1.0025, "step": 2100 }, { "epoch": 0.26757088049413374, "grad_norm": 4.347419341591126, "learning_rate": 1.944087509304278e-05, "loss": 0.9737, "step": 2101 }, { "epoch": 0.2676982345543404, "grad_norm": 6.371188950660585, "learning_rate": 1.9440194897955254e-05, "loss": 1.0623, "step": 2102 }, { "epoch": 0.267825588614547, "grad_norm": 6.076203265548938, "learning_rate": 1.943951430129257e-05, "loss": 0.96, "step": 2103 }, { "epoch": 0.26795294267475367, "grad_norm": 6.038994910884709, "learning_rate": 1.9438833303083677e-05, "loss": 0.9492, "step": 2104 }, { "epoch": 0.26808029673496026, "grad_norm": 4.045103650173802, "learning_rate": 1.9438151903357544e-05, "loss": 0.9108, "step": 2105 }, { "epoch": 0.2682076507951669, "grad_norm": 4.164069141719023, "learning_rate": 1.9437470102143154e-05, "loss": 0.9377, "step": 2106 }, { "epoch": 0.26833500485537354, "grad_norm": 5.60331990062065, "learning_rate": 1.9436787899469516e-05, "loss": 0.9973, "step": 2107 }, { "epoch": 0.2684623589155802, "grad_norm": 6.443835353933519, "learning_rate": 1.9436105295365646e-05, "loss": 0.975, "step": 2108 }, { "epoch": 0.2685897129757868, "grad_norm": 4.604516730945988, "learning_rate": 1.943542228986058e-05, "loss": 0.9554, "step": 2109 }, { "epoch": 0.26871706703599346, "grad_norm": 5.836052215551749, "learning_rate": 1.9434738882983373e-05, "loss": 0.9645, "step": 2110 }, { "epoch": 0.26884442109620005, "grad_norm": 4.024787631859274, "learning_rate": 1.94340550747631e-05, "loss": 0.9828, "step": 2111 }, { "epoch": 0.2689717751564067, "grad_norm": 5.9467104497646295, "learning_rate": 1.9433370865228845e-05, "loss": 0.9566, "step": 2112 }, { "epoch": 0.26909912921661333, "grad_norm": 5.602568085242719, "learning_rate": 1.943268625440972e-05, "loss": 0.9917, "step": 2113 }, { "epoch": 0.26922648327682, "grad_norm": 5.070369302816449, "learning_rate": 1.9432001242334838e-05, "loss": 0.8965, "step": 2114 }, { "epoch": 0.2693538373370266, "grad_norm": 5.521602514654055, "learning_rate": 1.9431315829033344e-05, "loss": 0.9536, "step": 2115 }, { "epoch": 0.26948119139723326, "grad_norm": 9.027116403725293, "learning_rate": 1.9430630014534393e-05, "loss": 1.002, "step": 2116 }, { "epoch": 0.26960854545743984, "grad_norm": 4.626692591071444, "learning_rate": 1.9429943798867163e-05, "loss": 0.9336, "step": 2117 }, { "epoch": 0.2697358995176465, "grad_norm": 4.604174974989263, "learning_rate": 1.942925718206084e-05, "loss": 1.0236, "step": 2118 }, { "epoch": 0.2698632535778531, "grad_norm": 5.1544051135494735, "learning_rate": 1.9428570164144638e-05, "loss": 0.9576, "step": 2119 }, { "epoch": 0.26999060763805977, "grad_norm": 4.422244861892009, "learning_rate": 1.9427882745147774e-05, "loss": 0.913, "step": 2120 }, { "epoch": 0.2701179616982664, "grad_norm": 4.751228374461091, "learning_rate": 1.9427194925099494e-05, "loss": 1.0159, "step": 2121 }, { "epoch": 0.27024531575847305, "grad_norm": 6.1472368437362395, "learning_rate": 1.9426506704029057e-05, "loss": 1.0347, "step": 2122 }, { "epoch": 0.27037266981867963, "grad_norm": 7.381197119902135, "learning_rate": 1.942581808196574e-05, "loss": 0.946, "step": 2123 }, { "epoch": 0.2705000238788863, "grad_norm": 4.629822974229692, "learning_rate": 1.9425129058938833e-05, "loss": 0.9148, "step": 2124 }, { "epoch": 0.2706273779390929, "grad_norm": 6.2872656109237735, "learning_rate": 1.942443963497765e-05, "loss": 0.8569, "step": 2125 }, { "epoch": 0.27075473199929956, "grad_norm": 4.445213598347837, "learning_rate": 1.9423749810111515e-05, "loss": 0.9109, "step": 2126 }, { "epoch": 0.2708820860595062, "grad_norm": 4.725868363286199, "learning_rate": 1.9423059584369777e-05, "loss": 0.8506, "step": 2127 }, { "epoch": 0.27100944011971284, "grad_norm": 4.728926642736469, "learning_rate": 1.942236895778179e-05, "loss": 0.9154, "step": 2128 }, { "epoch": 0.2711367941799194, "grad_norm": 5.349078639187127, "learning_rate": 1.942167793037694e-05, "loss": 0.9821, "step": 2129 }, { "epoch": 0.27126414824012607, "grad_norm": 6.218273662170755, "learning_rate": 1.9420986502184622e-05, "loss": 0.9623, "step": 2130 }, { "epoch": 0.2713915023003327, "grad_norm": 5.325666738686139, "learning_rate": 1.9420294673234243e-05, "loss": 0.8549, "step": 2131 }, { "epoch": 0.27151885636053935, "grad_norm": 5.920119697288612, "learning_rate": 1.9419602443555235e-05, "loss": 0.9248, "step": 2132 }, { "epoch": 0.271646210420746, "grad_norm": 4.745675176051024, "learning_rate": 1.941890981317705e-05, "loss": 0.9967, "step": 2133 }, { "epoch": 0.27177356448095263, "grad_norm": 5.462550196646151, "learning_rate": 1.9418216782129143e-05, "loss": 0.9816, "step": 2134 }, { "epoch": 0.2719009185411592, "grad_norm": 5.622792467759951, "learning_rate": 1.9417523350440998e-05, "loss": 0.9734, "step": 2135 }, { "epoch": 0.27202827260136586, "grad_norm": 6.308646357172296, "learning_rate": 1.941682951814212e-05, "loss": 0.9461, "step": 2136 }, { "epoch": 0.2721556266615725, "grad_norm": 4.48245570350797, "learning_rate": 1.941613528526201e-05, "loss": 0.9579, "step": 2137 }, { "epoch": 0.27228298072177914, "grad_norm": 4.87663163487821, "learning_rate": 1.941544065183021e-05, "loss": 0.9087, "step": 2138 }, { "epoch": 0.2724103347819858, "grad_norm": 4.2627433013836535, "learning_rate": 1.9414745617876266e-05, "loss": 0.9046, "step": 2139 }, { "epoch": 0.2725376888421924, "grad_norm": 5.896345603803443, "learning_rate": 1.9414050183429746e-05, "loss": 1.0447, "step": 2140 }, { "epoch": 0.272665042902399, "grad_norm": 5.08166793474578, "learning_rate": 1.9413354348520228e-05, "loss": 0.8864, "step": 2141 }, { "epoch": 0.27279239696260565, "grad_norm": 3.5575958762314617, "learning_rate": 1.941265811317732e-05, "loss": 1.0043, "step": 2142 }, { "epoch": 0.2729197510228123, "grad_norm": 5.786739897608854, "learning_rate": 1.941196147743063e-05, "loss": 1.0257, "step": 2143 }, { "epoch": 0.27304710508301894, "grad_norm": 5.147892878297467, "learning_rate": 1.9411264441309796e-05, "loss": 1.0505, "step": 2144 }, { "epoch": 0.2731744591432256, "grad_norm": 5.728107072351583, "learning_rate": 1.9410567004844473e-05, "loss": 0.9318, "step": 2145 }, { "epoch": 0.27330181320343216, "grad_norm": 4.038316786970015, "learning_rate": 1.9409869168064324e-05, "loss": 0.8729, "step": 2146 }, { "epoch": 0.2734291672636388, "grad_norm": 7.130208571113339, "learning_rate": 1.9409170930999034e-05, "loss": 0.8927, "step": 2147 }, { "epoch": 0.27355652132384545, "grad_norm": 4.537345206822918, "learning_rate": 1.940847229367831e-05, "loss": 0.9948, "step": 2148 }, { "epoch": 0.2736838753840521, "grad_norm": 4.963497240388744, "learning_rate": 1.9407773256131864e-05, "loss": 0.9192, "step": 2149 }, { "epoch": 0.27381122944425873, "grad_norm": 5.19684136632701, "learning_rate": 1.9407073818389443e-05, "loss": 0.8569, "step": 2150 }, { "epoch": 0.27393858350446537, "grad_norm": 5.640266047756749, "learning_rate": 1.940637398048079e-05, "loss": 1.0116, "step": 2151 }, { "epoch": 0.27406593756467196, "grad_norm": 4.243294755790774, "learning_rate": 1.9405673742435677e-05, "loss": 0.9527, "step": 2152 }, { "epoch": 0.2741932916248786, "grad_norm": 5.02708860044787, "learning_rate": 1.94049731042839e-05, "loss": 0.9728, "step": 2153 }, { "epoch": 0.27432064568508524, "grad_norm": 5.839142803878154, "learning_rate": 1.940427206605525e-05, "loss": 0.8718, "step": 2154 }, { "epoch": 0.2744479997452919, "grad_norm": 4.133955741494076, "learning_rate": 1.940357062777956e-05, "loss": 0.9685, "step": 2155 }, { "epoch": 0.2745753538054985, "grad_norm": 5.065511302389787, "learning_rate": 1.940286878948666e-05, "loss": 0.9478, "step": 2156 }, { "epoch": 0.27470270786570516, "grad_norm": 4.505612581965912, "learning_rate": 1.9402166551206408e-05, "loss": 0.9988, "step": 2157 }, { "epoch": 0.27483006192591175, "grad_norm": 4.575460758875095, "learning_rate": 1.9401463912968685e-05, "loss": 0.9066, "step": 2158 }, { "epoch": 0.2749574159861184, "grad_norm": 4.836540911718069, "learning_rate": 1.9400760874803366e-05, "loss": 1.0354, "step": 2159 }, { "epoch": 0.27508477004632503, "grad_norm": 5.825454877520664, "learning_rate": 1.9400057436740365e-05, "loss": 0.9798, "step": 2160 }, { "epoch": 0.2752121241065317, "grad_norm": 4.282066991218745, "learning_rate": 1.9399353598809607e-05, "loss": 0.9718, "step": 2161 }, { "epoch": 0.2753394781667383, "grad_norm": 5.652867101181758, "learning_rate": 1.939864936104103e-05, "loss": 0.9801, "step": 2162 }, { "epoch": 0.27546683222694496, "grad_norm": 6.888994158005449, "learning_rate": 1.939794472346459e-05, "loss": 0.9607, "step": 2163 }, { "epoch": 0.27559418628715154, "grad_norm": 6.213630140580125, "learning_rate": 1.9397239686110265e-05, "loss": 0.9854, "step": 2164 }, { "epoch": 0.2757215403473582, "grad_norm": 5.6043437187655085, "learning_rate": 1.9396534249008045e-05, "loss": 0.9612, "step": 2165 }, { "epoch": 0.2758488944075648, "grad_norm": 9.74711996726469, "learning_rate": 1.9395828412187935e-05, "loss": 0.9139, "step": 2166 }, { "epoch": 0.27597624846777147, "grad_norm": 6.243065611147912, "learning_rate": 1.9395122175679963e-05, "loss": 0.9308, "step": 2167 }, { "epoch": 0.2761036025279781, "grad_norm": 4.795528458958782, "learning_rate": 1.9394415539514176e-05, "loss": 1.0395, "step": 2168 }, { "epoch": 0.27623095658818475, "grad_norm": 5.3907500763908915, "learning_rate": 1.939370850372063e-05, "loss": 0.8726, "step": 2169 }, { "epoch": 0.27635831064839134, "grad_norm": 4.109812666064779, "learning_rate": 1.9393001068329397e-05, "loss": 0.8617, "step": 2170 }, { "epoch": 0.276485664708598, "grad_norm": 4.106566887506217, "learning_rate": 1.939229323337058e-05, "loss": 1.0532, "step": 2171 }, { "epoch": 0.2766130187688046, "grad_norm": 6.404471099242713, "learning_rate": 1.939158499887428e-05, "loss": 1.0061, "step": 2172 }, { "epoch": 0.27674037282901126, "grad_norm": 4.63695383480242, "learning_rate": 1.939087636487063e-05, "loss": 0.9687, "step": 2173 }, { "epoch": 0.2768677268892179, "grad_norm": 6.040798080972259, "learning_rate": 1.9390167331389775e-05, "loss": 0.8687, "step": 2174 }, { "epoch": 0.27699508094942454, "grad_norm": 5.314279953708837, "learning_rate": 1.938945789846187e-05, "loss": 1.0041, "step": 2175 }, { "epoch": 0.27712243500963113, "grad_norm": 6.906600459946274, "learning_rate": 1.93887480661171e-05, "loss": 1.028, "step": 2176 }, { "epoch": 0.27724978906983777, "grad_norm": 4.511376517390788, "learning_rate": 1.9388037834385657e-05, "loss": 0.9285, "step": 2177 }, { "epoch": 0.2773771431300444, "grad_norm": 4.932406151908792, "learning_rate": 1.938732720329776e-05, "loss": 0.8798, "step": 2178 }, { "epoch": 0.27750449719025105, "grad_norm": 4.798830509767417, "learning_rate": 1.938661617288363e-05, "loss": 1.0224, "step": 2179 }, { "epoch": 0.2776318512504577, "grad_norm": 4.0022023813604415, "learning_rate": 1.938590474317352e-05, "loss": 0.9788, "step": 2180 }, { "epoch": 0.27775920531066434, "grad_norm": 7.220675389198151, "learning_rate": 1.9385192914197683e-05, "loss": 1.0862, "step": 2181 }, { "epoch": 0.2778865593708709, "grad_norm": 6.0366433493495455, "learning_rate": 1.9384480685986413e-05, "loss": 0.9735, "step": 2182 }, { "epoch": 0.27801391343107756, "grad_norm": 5.040262242109857, "learning_rate": 1.938376805857e-05, "loss": 1.071, "step": 2183 }, { "epoch": 0.2781412674912842, "grad_norm": 5.827009569868901, "learning_rate": 1.9383055031978758e-05, "loss": 0.9307, "step": 2184 }, { "epoch": 0.27826862155149085, "grad_norm": 5.67942924909352, "learning_rate": 1.938234160624302e-05, "loss": 0.9089, "step": 2185 }, { "epoch": 0.2783959756116975, "grad_norm": 3.6466410296257834, "learning_rate": 1.9381627781393133e-05, "loss": 0.8968, "step": 2186 }, { "epoch": 0.27852332967190413, "grad_norm": 5.40414387686617, "learning_rate": 1.9380913557459466e-05, "loss": 0.9997, "step": 2187 }, { "epoch": 0.2786506837321107, "grad_norm": 4.81303600942812, "learning_rate": 1.9380198934472395e-05, "loss": 0.8685, "step": 2188 }, { "epoch": 0.27877803779231736, "grad_norm": 4.868552615041732, "learning_rate": 1.9379483912462326e-05, "loss": 0.9996, "step": 2189 }, { "epoch": 0.278905391852524, "grad_norm": 5.896146983867609, "learning_rate": 1.937876849145967e-05, "loss": 1.0238, "step": 2190 }, { "epoch": 0.27903274591273064, "grad_norm": 5.272075381085948, "learning_rate": 1.9378052671494868e-05, "loss": 1.0004, "step": 2191 }, { "epoch": 0.2791600999729373, "grad_norm": 6.049733356243118, "learning_rate": 1.9377336452598357e-05, "loss": 0.9743, "step": 2192 }, { "epoch": 0.27928745403314387, "grad_norm": 4.897660237875231, "learning_rate": 1.9376619834800615e-05, "loss": 0.981, "step": 2193 }, { "epoch": 0.2794148080933505, "grad_norm": 7.611368470431305, "learning_rate": 1.9375902818132123e-05, "loss": 0.8759, "step": 2194 }, { "epoch": 0.27954216215355715, "grad_norm": 3.924073716220743, "learning_rate": 1.937518540262338e-05, "loss": 0.9978, "step": 2195 }, { "epoch": 0.2796695162137638, "grad_norm": 5.295565012755607, "learning_rate": 1.9374467588304906e-05, "loss": 1.0047, "step": 2196 }, { "epoch": 0.27979687027397043, "grad_norm": 5.1462553317153805, "learning_rate": 1.9373749375207235e-05, "loss": 0.9304, "step": 2197 }, { "epoch": 0.2799242243341771, "grad_norm": 5.6895952662041696, "learning_rate": 1.9373030763360923e-05, "loss": 1.026, "step": 2198 }, { "epoch": 0.28005157839438366, "grad_norm": 4.357188780674366, "learning_rate": 1.9372311752796535e-05, "loss": 1.0325, "step": 2199 }, { "epoch": 0.2801789324545903, "grad_norm": 3.9551517884729583, "learning_rate": 1.9371592343544655e-05, "loss": 0.9527, "step": 2200 }, { "epoch": 0.28030628651479694, "grad_norm": 4.443566272655604, "learning_rate": 1.937087253563589e-05, "loss": 0.9822, "step": 2201 }, { "epoch": 0.2804336405750036, "grad_norm": 4.52590163716275, "learning_rate": 1.937015232910086e-05, "loss": 0.887, "step": 2202 }, { "epoch": 0.2805609946352102, "grad_norm": 5.936427290536799, "learning_rate": 1.93694317239702e-05, "loss": 1.0584, "step": 2203 }, { "epoch": 0.28068834869541687, "grad_norm": 5.113348019880136, "learning_rate": 1.9368710720274562e-05, "loss": 0.9538, "step": 2204 }, { "epoch": 0.28081570275562345, "grad_norm": 4.1197774945516485, "learning_rate": 1.9367989318044617e-05, "loss": 0.9214, "step": 2205 }, { "epoch": 0.2809430568158301, "grad_norm": 4.789527658894194, "learning_rate": 1.9367267517311057e-05, "loss": 1.0283, "step": 2206 }, { "epoch": 0.28107041087603674, "grad_norm": 5.414703287183188, "learning_rate": 1.936654531810458e-05, "loss": 0.9595, "step": 2207 }, { "epoch": 0.2811977649362434, "grad_norm": 7.921686474966427, "learning_rate": 1.9365822720455915e-05, "loss": 1.0014, "step": 2208 }, { "epoch": 0.28132511899645, "grad_norm": 4.842456275691338, "learning_rate": 1.9365099724395796e-05, "loss": 0.8966, "step": 2209 }, { "epoch": 0.28145247305665666, "grad_norm": 5.303629212021097, "learning_rate": 1.9364376329954978e-05, "loss": 0.8819, "step": 2210 }, { "epoch": 0.28157982711686325, "grad_norm": 6.149078684701823, "learning_rate": 1.9363652537164234e-05, "loss": 0.8942, "step": 2211 }, { "epoch": 0.2817071811770699, "grad_norm": 5.423607453255336, "learning_rate": 1.9362928346054356e-05, "loss": 0.9286, "step": 2212 }, { "epoch": 0.28183453523727653, "grad_norm": 4.809743116343621, "learning_rate": 1.9362203756656145e-05, "loss": 0.9433, "step": 2213 }, { "epoch": 0.28196188929748317, "grad_norm": 7.38052885068019, "learning_rate": 1.936147876900043e-05, "loss": 0.9791, "step": 2214 }, { "epoch": 0.2820892433576898, "grad_norm": 5.0009165877396216, "learning_rate": 1.9360753383118048e-05, "loss": 0.9792, "step": 2215 }, { "epoch": 0.28221659741789645, "grad_norm": 4.934895661554527, "learning_rate": 1.9360027599039855e-05, "loss": 0.8957, "step": 2216 }, { "epoch": 0.28234395147810304, "grad_norm": 3.738109046533342, "learning_rate": 1.9359301416796726e-05, "loss": 1.0307, "step": 2217 }, { "epoch": 0.2824713055383097, "grad_norm": 4.571091179658738, "learning_rate": 1.9358574836419553e-05, "loss": 0.9712, "step": 2218 }, { "epoch": 0.2825986595985163, "grad_norm": 5.992609240475571, "learning_rate": 1.9357847857939243e-05, "loss": 0.9582, "step": 2219 }, { "epoch": 0.28272601365872296, "grad_norm": 7.0343406835707825, "learning_rate": 1.9357120481386723e-05, "loss": 0.9064, "step": 2220 }, { "epoch": 0.2828533677189296, "grad_norm": 5.247914944727081, "learning_rate": 1.9356392706792936e-05, "loss": 1.0293, "step": 2221 }, { "epoch": 0.28298072177913625, "grad_norm": 5.24755911624317, "learning_rate": 1.9355664534188833e-05, "loss": 0.9523, "step": 2222 }, { "epoch": 0.28310807583934283, "grad_norm": 3.77058204670675, "learning_rate": 1.9354935963605395e-05, "loss": 0.9416, "step": 2223 }, { "epoch": 0.2832354298995495, "grad_norm": 6.23519180798187, "learning_rate": 1.9354206995073613e-05, "loss": 1.019, "step": 2224 }, { "epoch": 0.2833627839597561, "grad_norm": 4.455861746418782, "learning_rate": 1.93534776286245e-05, "loss": 0.8951, "step": 2225 }, { "epoch": 0.28349013801996276, "grad_norm": 4.962991124216106, "learning_rate": 1.935274786428908e-05, "loss": 0.9558, "step": 2226 }, { "epoch": 0.2836174920801694, "grad_norm": 5.452644042745204, "learning_rate": 1.9352017702098393e-05, "loss": 0.9673, "step": 2227 }, { "epoch": 0.28374484614037604, "grad_norm": 5.497365667446201, "learning_rate": 1.9351287142083505e-05, "loss": 0.9622, "step": 2228 }, { "epoch": 0.2838722002005826, "grad_norm": 6.877771824491955, "learning_rate": 1.935055618427549e-05, "loss": 1.014, "step": 2229 }, { "epoch": 0.28399955426078927, "grad_norm": 5.326496669712743, "learning_rate": 1.9349824828705443e-05, "loss": 1.0212, "step": 2230 }, { "epoch": 0.2841269083209959, "grad_norm": 4.521232528758232, "learning_rate": 1.9349093075404474e-05, "loss": 0.8721, "step": 2231 }, { "epoch": 0.28425426238120255, "grad_norm": 4.886713285618447, "learning_rate": 1.934836092440371e-05, "loss": 0.8945, "step": 2232 }, { "epoch": 0.2843816164414092, "grad_norm": 4.478013048283324, "learning_rate": 1.9347628375734306e-05, "loss": 0.9652, "step": 2233 }, { "epoch": 0.28450897050161583, "grad_norm": 4.655486178633574, "learning_rate": 1.9346895429427407e-05, "loss": 0.9227, "step": 2234 }, { "epoch": 0.2846363245618224, "grad_norm": 5.331421310695552, "learning_rate": 1.9346162085514204e-05, "loss": 1.057, "step": 2235 }, { "epoch": 0.28476367862202906, "grad_norm": 5.038371856437863, "learning_rate": 1.9345428344025883e-05, "loss": 0.9051, "step": 2236 }, { "epoch": 0.2848910326822357, "grad_norm": 4.5173737726694645, "learning_rate": 1.934469420499367e-05, "loss": 1.008, "step": 2237 }, { "epoch": 0.28501838674244234, "grad_norm": 4.290270136795548, "learning_rate": 1.934395966844878e-05, "loss": 0.9827, "step": 2238 }, { "epoch": 0.285145740802649, "grad_norm": 4.956704179270817, "learning_rate": 1.9343224734422472e-05, "loss": 0.9234, "step": 2239 }, { "epoch": 0.28527309486285557, "grad_norm": 4.379443065124714, "learning_rate": 1.9342489402945997e-05, "loss": 1.0263, "step": 2240 }, { "epoch": 0.2854004489230622, "grad_norm": 6.018437983742971, "learning_rate": 1.934175367405065e-05, "loss": 0.9777, "step": 2241 }, { "epoch": 0.28552780298326885, "grad_norm": 5.067939943808318, "learning_rate": 1.9341017547767713e-05, "loss": 0.953, "step": 2242 }, { "epoch": 0.2856551570434755, "grad_norm": 7.1419441457085, "learning_rate": 1.934028102412851e-05, "loss": 0.8831, "step": 2243 }, { "epoch": 0.28578251110368214, "grad_norm": 6.071195678090924, "learning_rate": 1.9339544103164365e-05, "loss": 1.0695, "step": 2244 }, { "epoch": 0.2859098651638888, "grad_norm": 4.364692534884017, "learning_rate": 1.933880678490663e-05, "loss": 0.912, "step": 2245 }, { "epoch": 0.28603721922409536, "grad_norm": 3.524136290011522, "learning_rate": 1.9338069069386668e-05, "loss": 0.9127, "step": 2246 }, { "epoch": 0.286164573284302, "grad_norm": 5.592784606485487, "learning_rate": 1.9337330956635864e-05, "loss": 1.0343, "step": 2247 }, { "epoch": 0.28629192734450865, "grad_norm": 8.068927893535436, "learning_rate": 1.9336592446685615e-05, "loss": 0.9894, "step": 2248 }, { "epoch": 0.2864192814047153, "grad_norm": 6.193405877801608, "learning_rate": 1.933585353956733e-05, "loss": 1.024, "step": 2249 }, { "epoch": 0.28654663546492193, "grad_norm": 5.815811362501299, "learning_rate": 1.933511423531245e-05, "loss": 0.9553, "step": 2250 }, { "epoch": 0.28667398952512857, "grad_norm": 6.636141064285603, "learning_rate": 1.933437453395242e-05, "loss": 0.9942, "step": 2251 }, { "epoch": 0.28680134358533516, "grad_norm": 6.893829638521751, "learning_rate": 1.9333634435518707e-05, "loss": 1.0622, "step": 2252 }, { "epoch": 0.2869286976455418, "grad_norm": 5.52525547869584, "learning_rate": 1.9332893940042796e-05, "loss": 0.9181, "step": 2253 }, { "epoch": 0.28705605170574844, "grad_norm": 5.328029240169994, "learning_rate": 1.9332153047556183e-05, "loss": 1.0477, "step": 2254 }, { "epoch": 0.2871834057659551, "grad_norm": 5.400451902624768, "learning_rate": 1.9331411758090388e-05, "loss": 0.9254, "step": 2255 }, { "epoch": 0.2873107598261617, "grad_norm": 6.978582860717244, "learning_rate": 1.9330670071676937e-05, "loss": 0.9569, "step": 2256 }, { "epoch": 0.28743811388636836, "grad_norm": 6.290103878261186, "learning_rate": 1.932992798834739e-05, "loss": 0.9631, "step": 2257 }, { "epoch": 0.28756546794657495, "grad_norm": 7.476006338705659, "learning_rate": 1.932918550813331e-05, "loss": 0.9519, "step": 2258 }, { "epoch": 0.2876928220067816, "grad_norm": 3.51608151332901, "learning_rate": 1.9328442631066286e-05, "loss": 0.8884, "step": 2259 }, { "epoch": 0.28782017606698823, "grad_norm": 5.194167849995865, "learning_rate": 1.932769935717791e-05, "loss": 0.859, "step": 2260 }, { "epoch": 0.2879475301271949, "grad_norm": 4.670590023040995, "learning_rate": 1.9326955686499807e-05, "loss": 0.9809, "step": 2261 }, { "epoch": 0.2880748841874015, "grad_norm": 5.232320900598122, "learning_rate": 1.932621161906361e-05, "loss": 0.9844, "step": 2262 }, { "epoch": 0.28820223824760816, "grad_norm": 5.805677778710669, "learning_rate": 1.9325467154900974e-05, "loss": 1.0872, "step": 2263 }, { "epoch": 0.28832959230781474, "grad_norm": 4.786247085486716, "learning_rate": 1.932472229404356e-05, "loss": 0.9238, "step": 2264 }, { "epoch": 0.2884569463680214, "grad_norm": 8.303756360222849, "learning_rate": 1.9323977036523058e-05, "loss": 0.8937, "step": 2265 }, { "epoch": 0.288584300428228, "grad_norm": 4.441670781480172, "learning_rate": 1.9323231382371174e-05, "loss": 0.9205, "step": 2266 }, { "epoch": 0.28871165448843467, "grad_norm": 5.717460778839533, "learning_rate": 1.9322485331619622e-05, "loss": 1.1245, "step": 2267 }, { "epoch": 0.2888390085486413, "grad_norm": 4.285111071505802, "learning_rate": 1.932173888430014e-05, "loss": 1.1127, "step": 2268 }, { "epoch": 0.28896636260884795, "grad_norm": 4.218659350516257, "learning_rate": 1.9320992040444483e-05, "loss": 0.9626, "step": 2269 }, { "epoch": 0.28909371666905453, "grad_norm": 4.942385031521187, "learning_rate": 1.9320244800084417e-05, "loss": 0.9857, "step": 2270 }, { "epoch": 0.2892210707292612, "grad_norm": 7.124262185962554, "learning_rate": 1.9319497163251728e-05, "loss": 0.9753, "step": 2271 }, { "epoch": 0.2893484247894678, "grad_norm": 7.736084288712883, "learning_rate": 1.9318749129978225e-05, "loss": 1.0909, "step": 2272 }, { "epoch": 0.28947577884967446, "grad_norm": 3.916968692470361, "learning_rate": 1.9318000700295725e-05, "loss": 0.9116, "step": 2273 }, { "epoch": 0.2896031329098811, "grad_norm": 5.8782972457610105, "learning_rate": 1.9317251874236066e-05, "loss": 0.9768, "step": 2274 }, { "epoch": 0.28973048697008774, "grad_norm": 5.8013309924688565, "learning_rate": 1.9316502651831104e-05, "loss": 0.8514, "step": 2275 }, { "epoch": 0.2898578410302943, "grad_norm": 5.6966295014685615, "learning_rate": 1.9315753033112704e-05, "loss": 0.9823, "step": 2276 }, { "epoch": 0.28998519509050097, "grad_norm": 6.013574503413546, "learning_rate": 1.9315003018112765e-05, "loss": 0.8798, "step": 2277 }, { "epoch": 0.2901125491507076, "grad_norm": 6.031571928139165, "learning_rate": 1.931425260686318e-05, "loss": 0.918, "step": 2278 }, { "epoch": 0.29023990321091425, "grad_norm": 7.488048949839692, "learning_rate": 1.931350179939588e-05, "loss": 0.9346, "step": 2279 }, { "epoch": 0.2903672572711209, "grad_norm": 5.5816601575826965, "learning_rate": 1.9312750595742794e-05, "loss": 1.0537, "step": 2280 }, { "epoch": 0.2904946113313275, "grad_norm": 5.617529252523221, "learning_rate": 1.9311998995935883e-05, "loss": 0.9422, "step": 2281 }, { "epoch": 0.2906219653915341, "grad_norm": 4.349045414157491, "learning_rate": 1.931124700000712e-05, "loss": 0.9974, "step": 2282 }, { "epoch": 0.29074931945174076, "grad_norm": 5.170409579848455, "learning_rate": 1.9310494607988494e-05, "loss": 0.9717, "step": 2283 }, { "epoch": 0.2908766735119474, "grad_norm": 6.319737883335233, "learning_rate": 1.930974181991201e-05, "loss": 1.0048, "step": 2284 }, { "epoch": 0.29100402757215404, "grad_norm": 4.997398844272912, "learning_rate": 1.9308988635809688e-05, "loss": 0.9352, "step": 2285 }, { "epoch": 0.2911313816323607, "grad_norm": 5.5679229704472375, "learning_rate": 1.930823505571357e-05, "loss": 0.8817, "step": 2286 }, { "epoch": 0.29125873569256727, "grad_norm": 5.554595817742847, "learning_rate": 1.9307481079655716e-05, "loss": 0.9651, "step": 2287 }, { "epoch": 0.2913860897527739, "grad_norm": 5.693212972495875, "learning_rate": 1.9306726707668194e-05, "loss": 0.9804, "step": 2288 }, { "epoch": 0.29151344381298055, "grad_norm": 4.68197998548399, "learning_rate": 1.9305971939783094e-05, "loss": 0.9132, "step": 2289 }, { "epoch": 0.2916407978731872, "grad_norm": 4.7334408483696775, "learning_rate": 1.9305216776032528e-05, "loss": 1.0875, "step": 2290 }, { "epoch": 0.29176815193339384, "grad_norm": 5.340337902050867, "learning_rate": 1.9304461216448612e-05, "loss": 1.0256, "step": 2291 }, { "epoch": 0.2918955059936005, "grad_norm": 6.0611633715001965, "learning_rate": 1.9303705261063496e-05, "loss": 0.9575, "step": 2292 }, { "epoch": 0.29202286005380707, "grad_norm": 5.409268457166756, "learning_rate": 1.930294890990933e-05, "loss": 0.9956, "step": 2293 }, { "epoch": 0.2921502141140137, "grad_norm": 4.360307491203652, "learning_rate": 1.9302192163018292e-05, "loss": 1.1261, "step": 2294 }, { "epoch": 0.29227756817422035, "grad_norm": 4.543581894148383, "learning_rate": 1.9301435020422575e-05, "loss": 1.0707, "step": 2295 }, { "epoch": 0.292404922234427, "grad_norm": 5.019302493571458, "learning_rate": 1.930067748215438e-05, "loss": 0.9486, "step": 2296 }, { "epoch": 0.29253227629463363, "grad_norm": 5.1383732557070845, "learning_rate": 1.9299919548245938e-05, "loss": 0.9126, "step": 2297 }, { "epoch": 0.29265963035484027, "grad_norm": 5.3430632211270845, "learning_rate": 1.9299161218729487e-05, "loss": 0.9765, "step": 2298 }, { "epoch": 0.29278698441504686, "grad_norm": 6.809842492521337, "learning_rate": 1.929840249363729e-05, "loss": 0.8662, "step": 2299 }, { "epoch": 0.2929143384752535, "grad_norm": 5.315365007558204, "learning_rate": 1.9297643373001618e-05, "loss": 0.926, "step": 2300 }, { "epoch": 0.29304169253546014, "grad_norm": 4.722951067359047, "learning_rate": 1.9296883856854764e-05, "loss": 0.8986, "step": 2301 }, { "epoch": 0.2931690465956668, "grad_norm": 4.864921912010606, "learning_rate": 1.929612394522904e-05, "loss": 0.8732, "step": 2302 }, { "epoch": 0.2932964006558734, "grad_norm": 4.012889360687404, "learning_rate": 1.9295363638156764e-05, "loss": 0.9911, "step": 2303 }, { "epoch": 0.29342375471608007, "grad_norm": 5.365808123672366, "learning_rate": 1.929460293567029e-05, "loss": 0.983, "step": 2304 }, { "epoch": 0.29355110877628665, "grad_norm": 6.226146497434792, "learning_rate": 1.929384183780197e-05, "loss": 0.9277, "step": 2305 }, { "epoch": 0.2936784628364933, "grad_norm": 4.601227364374314, "learning_rate": 1.929308034458418e-05, "loss": 0.9162, "step": 2306 }, { "epoch": 0.29380581689669993, "grad_norm": 5.576915449547042, "learning_rate": 1.9292318456049313e-05, "loss": 0.8949, "step": 2307 }, { "epoch": 0.2939331709569066, "grad_norm": 7.015861524235359, "learning_rate": 1.9291556172229784e-05, "loss": 0.9657, "step": 2308 }, { "epoch": 0.2940605250171132, "grad_norm": 5.842503379001383, "learning_rate": 1.9290793493158014e-05, "loss": 0.9789, "step": 2309 }, { "epoch": 0.29418787907731986, "grad_norm": 5.560578432251514, "learning_rate": 1.929003041886645e-05, "loss": 0.8596, "step": 2310 }, { "epoch": 0.29431523313752644, "grad_norm": 3.9685965681079187, "learning_rate": 1.928926694938755e-05, "loss": 0.9635, "step": 2311 }, { "epoch": 0.2944425871977331, "grad_norm": 4.809208524622402, "learning_rate": 1.9288503084753793e-05, "loss": 1.089, "step": 2312 }, { "epoch": 0.2945699412579397, "grad_norm": 5.259111943865651, "learning_rate": 1.9287738824997672e-05, "loss": 0.9576, "step": 2313 }, { "epoch": 0.29469729531814637, "grad_norm": 5.025815694469445, "learning_rate": 1.9286974170151696e-05, "loss": 0.9096, "step": 2314 }, { "epoch": 0.294824649378353, "grad_norm": 5.537674443444391, "learning_rate": 1.92862091202484e-05, "loss": 0.933, "step": 2315 }, { "epoch": 0.29495200343855965, "grad_norm": 4.664941241680336, "learning_rate": 1.9285443675320315e-05, "loss": 1.041, "step": 2316 }, { "epoch": 0.29507935749876624, "grad_norm": 4.683779244332888, "learning_rate": 1.9284677835400013e-05, "loss": 0.9634, "step": 2317 }, { "epoch": 0.2952067115589729, "grad_norm": 5.570889436396025, "learning_rate": 1.928391160052007e-05, "loss": 1.0452, "step": 2318 }, { "epoch": 0.2953340656191795, "grad_norm": 5.923979131256169, "learning_rate": 1.9283144970713082e-05, "loss": 0.9457, "step": 2319 }, { "epoch": 0.29546141967938616, "grad_norm": 6.225263242249498, "learning_rate": 1.928237794601165e-05, "loss": 0.9188, "step": 2320 }, { "epoch": 0.2955887737395928, "grad_norm": 6.537319447812241, "learning_rate": 1.928161052644842e-05, "loss": 0.9413, "step": 2321 }, { "epoch": 0.29571612779979944, "grad_norm": 6.035480181941725, "learning_rate": 1.9280842712056023e-05, "loss": 0.8955, "step": 2322 }, { "epoch": 0.29584348186000603, "grad_norm": 4.071255214126804, "learning_rate": 1.9280074502867124e-05, "loss": 1.0152, "step": 2323 }, { "epoch": 0.29597083592021267, "grad_norm": 7.2375159808626455, "learning_rate": 1.9279305898914407e-05, "loss": 0.9824, "step": 2324 }, { "epoch": 0.2960981899804193, "grad_norm": 4.984540691431961, "learning_rate": 1.9278536900230564e-05, "loss": 1.006, "step": 2325 }, { "epoch": 0.29622554404062595, "grad_norm": 3.6894730284170656, "learning_rate": 1.9277767506848303e-05, "loss": 0.9564, "step": 2326 }, { "epoch": 0.2963528981008326, "grad_norm": 14.507268728029603, "learning_rate": 1.9276997718800362e-05, "loss": 1.0252, "step": 2327 }, { "epoch": 0.2964802521610392, "grad_norm": 4.1718439108158085, "learning_rate": 1.927622753611948e-05, "loss": 1.0036, "step": 2328 }, { "epoch": 0.2966076062212458, "grad_norm": 4.637887680424189, "learning_rate": 1.927545695883842e-05, "loss": 0.9146, "step": 2329 }, { "epoch": 0.29673496028145246, "grad_norm": 6.10570273314043, "learning_rate": 1.9274685986989966e-05, "loss": 0.938, "step": 2330 }, { "epoch": 0.2968623143416591, "grad_norm": 4.152182287915018, "learning_rate": 1.9273914620606912e-05, "loss": 0.9101, "step": 2331 }, { "epoch": 0.29698966840186575, "grad_norm": 4.488219630098113, "learning_rate": 1.9273142859722072e-05, "loss": 0.8482, "step": 2332 }, { "epoch": 0.2971170224620724, "grad_norm": 5.555829109238733, "learning_rate": 1.9272370704368272e-05, "loss": 0.8945, "step": 2333 }, { "epoch": 0.297244376522279, "grad_norm": 6.824063627404799, "learning_rate": 1.927159815457836e-05, "loss": 0.9777, "step": 2334 }, { "epoch": 0.2973717305824856, "grad_norm": 5.77562477068059, "learning_rate": 1.9270825210385203e-05, "loss": 0.9543, "step": 2335 }, { "epoch": 0.29749908464269226, "grad_norm": 5.323961527398075, "learning_rate": 1.927005187182168e-05, "loss": 1.116, "step": 2336 }, { "epoch": 0.2976264387028989, "grad_norm": 5.107802589479239, "learning_rate": 1.9269278138920686e-05, "loss": 0.9456, "step": 2337 }, { "epoch": 0.29775379276310554, "grad_norm": 5.3579104899882, "learning_rate": 1.9268504011715134e-05, "loss": 0.9943, "step": 2338 }, { "epoch": 0.2978811468233122, "grad_norm": 8.210342342101924, "learning_rate": 1.9267729490237958e-05, "loss": 0.9187, "step": 2339 }, { "epoch": 0.29800850088351877, "grad_norm": 5.124278692333119, "learning_rate": 1.92669545745221e-05, "loss": 0.8947, "step": 2340 }, { "epoch": 0.2981358549437254, "grad_norm": 4.264877533311538, "learning_rate": 1.9266179264600527e-05, "loss": 1.0179, "step": 2341 }, { "epoch": 0.29826320900393205, "grad_norm": 5.210763999875867, "learning_rate": 1.9265403560506223e-05, "loss": 0.9389, "step": 2342 }, { "epoch": 0.2983905630641387, "grad_norm": 4.735656967588853, "learning_rate": 1.9264627462272184e-05, "loss": 1.0618, "step": 2343 }, { "epoch": 0.29851791712434533, "grad_norm": 4.667642060024814, "learning_rate": 1.9263850969931418e-05, "loss": 1.0058, "step": 2344 }, { "epoch": 0.298645271184552, "grad_norm": 5.956955045619141, "learning_rate": 1.9263074083516965e-05, "loss": 1.0041, "step": 2345 }, { "epoch": 0.29877262524475856, "grad_norm": 5.452883652207492, "learning_rate": 1.9262296803061867e-05, "loss": 0.9371, "step": 2346 }, { "epoch": 0.2988999793049652, "grad_norm": 13.474852133312945, "learning_rate": 1.926151912859919e-05, "loss": 0.8465, "step": 2347 }, { "epoch": 0.29902733336517184, "grad_norm": 6.786660562948444, "learning_rate": 1.9260741060162015e-05, "loss": 0.951, "step": 2348 }, { "epoch": 0.2991546874253785, "grad_norm": 5.324857095582839, "learning_rate": 1.9259962597783444e-05, "loss": 0.9631, "step": 2349 }, { "epoch": 0.2992820414855851, "grad_norm": 4.990701803573071, "learning_rate": 1.9259183741496586e-05, "loss": 0.9879, "step": 2350 }, { "epoch": 0.29940939554579177, "grad_norm": 6.1714429486828815, "learning_rate": 1.925840449133458e-05, "loss": 0.9998, "step": 2351 }, { "epoch": 0.29953674960599835, "grad_norm": 6.79431132056817, "learning_rate": 1.9257624847330567e-05, "loss": 0.916, "step": 2352 }, { "epoch": 0.299664103666205, "grad_norm": 5.414971169406439, "learning_rate": 1.9256844809517714e-05, "loss": 0.9983, "step": 2353 }, { "epoch": 0.29979145772641164, "grad_norm": 6.706722974496608, "learning_rate": 1.9256064377929206e-05, "loss": 0.8914, "step": 2354 }, { "epoch": 0.2999188117866183, "grad_norm": 6.9339697851376, "learning_rate": 1.9255283552598242e-05, "loss": 1.022, "step": 2355 }, { "epoch": 0.3000461658468249, "grad_norm": 6.547697102135116, "learning_rate": 1.925450233355803e-05, "loss": 1.0301, "step": 2356 }, { "epoch": 0.30017351990703156, "grad_norm": 5.039317421785876, "learning_rate": 1.9253720720841812e-05, "loss": 1.0293, "step": 2357 }, { "epoch": 0.30030087396723815, "grad_norm": 5.0620798949195605, "learning_rate": 1.925293871448283e-05, "loss": 0.9184, "step": 2358 }, { "epoch": 0.3004282280274448, "grad_norm": 5.502368937018617, "learning_rate": 1.9252156314514353e-05, "loss": 0.9902, "step": 2359 }, { "epoch": 0.30055558208765143, "grad_norm": 4.959087261080682, "learning_rate": 1.925137352096966e-05, "loss": 1.0593, "step": 2360 }, { "epoch": 0.30068293614785807, "grad_norm": 6.54826511382798, "learning_rate": 1.9250590333882056e-05, "loss": 0.8994, "step": 2361 }, { "epoch": 0.3008102902080647, "grad_norm": 5.447075666450765, "learning_rate": 1.924980675328485e-05, "loss": 1.008, "step": 2362 }, { "epoch": 0.30093764426827135, "grad_norm": 5.663010408753319, "learning_rate": 1.924902277921138e-05, "loss": 0.975, "step": 2363 }, { "epoch": 0.30106499832847794, "grad_norm": 4.979336957799365, "learning_rate": 1.9248238411694994e-05, "loss": 0.9282, "step": 2364 }, { "epoch": 0.3011923523886846, "grad_norm": 5.259066609290298, "learning_rate": 1.9247453650769057e-05, "loss": 0.9378, "step": 2365 }, { "epoch": 0.3013197064488912, "grad_norm": 4.329109823510334, "learning_rate": 1.924666849646695e-05, "loss": 0.9466, "step": 2366 }, { "epoch": 0.30144706050909786, "grad_norm": 4.66364862851753, "learning_rate": 1.9245882948822078e-05, "loss": 0.9453, "step": 2367 }, { "epoch": 0.3015744145693045, "grad_norm": 6.379440257233033, "learning_rate": 1.9245097007867853e-05, "loss": 1.1695, "step": 2368 }, { "epoch": 0.30170176862951115, "grad_norm": 6.215123024641248, "learning_rate": 1.924431067363771e-05, "loss": 0.947, "step": 2369 }, { "epoch": 0.30182912268971773, "grad_norm": 5.3002267321186585, "learning_rate": 1.92435239461651e-05, "loss": 0.9107, "step": 2370 }, { "epoch": 0.3019564767499244, "grad_norm": 6.432751431938547, "learning_rate": 1.9242736825483484e-05, "loss": 0.9924, "step": 2371 }, { "epoch": 0.302083830810131, "grad_norm": 5.182284019375116, "learning_rate": 1.924194931162635e-05, "loss": 0.9225, "step": 2372 }, { "epoch": 0.30221118487033766, "grad_norm": 5.359529454088137, "learning_rate": 1.9241161404627196e-05, "loss": 0.9628, "step": 2373 }, { "epoch": 0.3023385389305443, "grad_norm": 6.754903919318802, "learning_rate": 1.9240373104519538e-05, "loss": 0.8703, "step": 2374 }, { "epoch": 0.3024658929907509, "grad_norm": 5.263676766869248, "learning_rate": 1.9239584411336913e-05, "loss": 0.9803, "step": 2375 }, { "epoch": 0.3025932470509575, "grad_norm": 4.376429950813778, "learning_rate": 1.9238795325112867e-05, "loss": 0.9298, "step": 2376 }, { "epoch": 0.30272060111116417, "grad_norm": 4.366577760990114, "learning_rate": 1.923800584588097e-05, "loss": 1.0008, "step": 2377 }, { "epoch": 0.3028479551713708, "grad_norm": 7.221939505235776, "learning_rate": 1.9237215973674805e-05, "loss": 0.9899, "step": 2378 }, { "epoch": 0.30297530923157745, "grad_norm": 4.065072052376886, "learning_rate": 1.9236425708527972e-05, "loss": 0.9246, "step": 2379 }, { "epoch": 0.3031026632917841, "grad_norm": 4.642776555630027, "learning_rate": 1.9235635050474085e-05, "loss": 0.8799, "step": 2380 }, { "epoch": 0.3032300173519907, "grad_norm": 4.508946109289785, "learning_rate": 1.923484399954678e-05, "loss": 0.961, "step": 2381 }, { "epoch": 0.3033573714121973, "grad_norm": 9.354483572867027, "learning_rate": 1.923405255577971e-05, "loss": 0.8652, "step": 2382 }, { "epoch": 0.30348472547240396, "grad_norm": 4.762792126764871, "learning_rate": 1.9233260719206543e-05, "loss": 0.9529, "step": 2383 }, { "epoch": 0.3036120795326106, "grad_norm": 4.767570323695481, "learning_rate": 1.923246848986095e-05, "loss": 0.8489, "step": 2384 }, { "epoch": 0.30373943359281724, "grad_norm": 4.387257000315391, "learning_rate": 1.9231675867776648e-05, "loss": 0.9604, "step": 2385 }, { "epoch": 0.3038667876530239, "grad_norm": 4.614867994542819, "learning_rate": 1.9230882852987348e-05, "loss": 0.8718, "step": 2386 }, { "epoch": 0.30399414171323047, "grad_norm": 5.108711596242636, "learning_rate": 1.9230089445526778e-05, "loss": 0.9946, "step": 2387 }, { "epoch": 0.3041214957734371, "grad_norm": 6.138589846930473, "learning_rate": 1.92292956454287e-05, "loss": 0.9435, "step": 2388 }, { "epoch": 0.30424884983364375, "grad_norm": 4.977825010639471, "learning_rate": 1.9228501452726872e-05, "loss": 0.9678, "step": 2389 }, { "epoch": 0.3043762038938504, "grad_norm": 5.135519814632156, "learning_rate": 1.922770686745508e-05, "loss": 1.0575, "step": 2390 }, { "epoch": 0.30450355795405704, "grad_norm": 4.4646519363598545, "learning_rate": 1.9226911889647128e-05, "loss": 0.9175, "step": 2391 }, { "epoch": 0.3046309120142637, "grad_norm": 5.685641142189147, "learning_rate": 1.922611651933683e-05, "loss": 0.9805, "step": 2392 }, { "epoch": 0.30475826607447026, "grad_norm": 4.941726806857919, "learning_rate": 1.9225320756558023e-05, "loss": 0.9506, "step": 2393 }, { "epoch": 0.3048856201346769, "grad_norm": 4.907095089190291, "learning_rate": 1.9224524601344557e-05, "loss": 0.9138, "step": 2394 }, { "epoch": 0.30501297419488355, "grad_norm": 5.240892230359184, "learning_rate": 1.92237280537303e-05, "loss": 0.879, "step": 2395 }, { "epoch": 0.3051403282550902, "grad_norm": 4.732739660257308, "learning_rate": 1.9222931113749132e-05, "loss": 1.0182, "step": 2396 }, { "epoch": 0.30526768231529683, "grad_norm": 5.937838896064395, "learning_rate": 1.922213378143496e-05, "loss": 0.9216, "step": 2397 }, { "epoch": 0.30539503637550347, "grad_norm": 5.143194142153951, "learning_rate": 1.9221336056821694e-05, "loss": 1.0552, "step": 2398 }, { "epoch": 0.30552239043571006, "grad_norm": 6.464203494541896, "learning_rate": 1.9220537939943278e-05, "loss": 0.9087, "step": 2399 }, { "epoch": 0.3056497444959167, "grad_norm": 5.613982027466058, "learning_rate": 1.9219739430833658e-05, "loss": 1.0086, "step": 2400 }, { "epoch": 0.30577709855612334, "grad_norm": 5.251196043831069, "learning_rate": 1.92189405295268e-05, "loss": 1.024, "step": 2401 }, { "epoch": 0.30590445261633, "grad_norm": 6.139204409370102, "learning_rate": 1.921814123605669e-05, "loss": 0.966, "step": 2402 }, { "epoch": 0.3060318066765366, "grad_norm": 5.717905117639342, "learning_rate": 1.921734155045733e-05, "loss": 0.9647, "step": 2403 }, { "epoch": 0.30615916073674326, "grad_norm": 6.365355432839313, "learning_rate": 1.9216541472762736e-05, "loss": 0.8977, "step": 2404 }, { "epoch": 0.30628651479694985, "grad_norm": 5.928194805532604, "learning_rate": 1.9215741003006942e-05, "loss": 1.0066, "step": 2405 }, { "epoch": 0.3064138688571565, "grad_norm": 4.891136264585176, "learning_rate": 1.9214940141224005e-05, "loss": 0.9647, "step": 2406 }, { "epoch": 0.30654122291736313, "grad_norm": 4.890592044804602, "learning_rate": 1.9214138887447983e-05, "loss": 1.044, "step": 2407 }, { "epoch": 0.3066685769775698, "grad_norm": 4.928363894403414, "learning_rate": 1.921333724171297e-05, "loss": 1.1022, "step": 2408 }, { "epoch": 0.3067959310377764, "grad_norm": 4.729688483982595, "learning_rate": 1.921253520405306e-05, "loss": 0.9419, "step": 2409 }, { "epoch": 0.30692328509798306, "grad_norm": 5.43861009512961, "learning_rate": 1.9211732774502372e-05, "loss": 0.8916, "step": 2410 }, { "epoch": 0.30705063915818964, "grad_norm": 5.629195111266619, "learning_rate": 1.9210929953095047e-05, "loss": 0.9924, "step": 2411 }, { "epoch": 0.3071779932183963, "grad_norm": 5.395834001028051, "learning_rate": 1.9210126739865226e-05, "loss": 0.9525, "step": 2412 }, { "epoch": 0.3073053472786029, "grad_norm": 5.483985995763793, "learning_rate": 1.920932313484708e-05, "loss": 0.9936, "step": 2413 }, { "epoch": 0.30743270133880957, "grad_norm": 5.470112483735302, "learning_rate": 1.9208519138074803e-05, "loss": 0.9494, "step": 2414 }, { "epoch": 0.3075600553990162, "grad_norm": 6.050927196734857, "learning_rate": 1.9207714749582583e-05, "loss": 0.9662, "step": 2415 }, { "epoch": 0.30768740945922285, "grad_norm": 4.577083117566504, "learning_rate": 1.9206909969404643e-05, "loss": 1.017, "step": 2416 }, { "epoch": 0.30781476351942944, "grad_norm": 4.207758848554405, "learning_rate": 1.920610479757522e-05, "loss": 0.918, "step": 2417 }, { "epoch": 0.3079421175796361, "grad_norm": 4.584607010933668, "learning_rate": 1.9205299234128558e-05, "loss": 0.8914, "step": 2418 }, { "epoch": 0.3080694716398427, "grad_norm": 5.8060430943607555, "learning_rate": 1.920449327909893e-05, "loss": 0.9561, "step": 2419 }, { "epoch": 0.30819682570004936, "grad_norm": 5.543343636844775, "learning_rate": 1.9203686932520624e-05, "loss": 0.9198, "step": 2420 }, { "epoch": 0.308324179760256, "grad_norm": 6.6106645484932205, "learning_rate": 1.9202880194427937e-05, "loss": 0.9407, "step": 2421 }, { "epoch": 0.3084515338204626, "grad_norm": 4.950706493167833, "learning_rate": 1.920207306485518e-05, "loss": 0.9575, "step": 2422 }, { "epoch": 0.30857888788066923, "grad_norm": 4.632950830565494, "learning_rate": 1.92012655438367e-05, "loss": 0.916, "step": 2423 }, { "epoch": 0.30870624194087587, "grad_norm": 4.756714341785273, "learning_rate": 1.9200457631406842e-05, "loss": 0.8721, "step": 2424 }, { "epoch": 0.3088335960010825, "grad_norm": 4.835345057864359, "learning_rate": 1.919964932759997e-05, "loss": 1.0384, "step": 2425 }, { "epoch": 0.30896095006128915, "grad_norm": 5.118701134379754, "learning_rate": 1.919884063245047e-05, "loss": 0.9432, "step": 2426 }, { "epoch": 0.3090883041214958, "grad_norm": 8.064852112693039, "learning_rate": 1.919803154599275e-05, "loss": 0.9799, "step": 2427 }, { "epoch": 0.3092156581817024, "grad_norm": 4.823078369769405, "learning_rate": 1.9197222068261223e-05, "loss": 0.9643, "step": 2428 }, { "epoch": 0.309343012241909, "grad_norm": 5.258096355694388, "learning_rate": 1.919641219929032e-05, "loss": 0.9309, "step": 2429 }, { "epoch": 0.30947036630211566, "grad_norm": 4.684749966425615, "learning_rate": 1.9195601939114498e-05, "loss": 0.9316, "step": 2430 }, { "epoch": 0.3095977203623223, "grad_norm": 5.937742529483368, "learning_rate": 1.919479128776822e-05, "loss": 1.0159, "step": 2431 }, { "epoch": 0.30972507442252895, "grad_norm": 6.326113383490015, "learning_rate": 1.9193980245285967e-05, "loss": 0.9461, "step": 2432 }, { "epoch": 0.3098524284827356, "grad_norm": 4.934632253192041, "learning_rate": 1.9193168811702248e-05, "loss": 1.0028, "step": 2433 }, { "epoch": 0.3099797825429422, "grad_norm": 5.761669094065687, "learning_rate": 1.9192356987051575e-05, "loss": 1.0219, "step": 2434 }, { "epoch": 0.3101071366031488, "grad_norm": 5.479709303447854, "learning_rate": 1.9191544771368485e-05, "loss": 1.0376, "step": 2435 }, { "epoch": 0.31023449066335546, "grad_norm": 5.329387615873844, "learning_rate": 1.9190732164687528e-05, "loss": 0.8723, "step": 2436 }, { "epoch": 0.3103618447235621, "grad_norm": 5.968839155400078, "learning_rate": 1.918991916704327e-05, "loss": 0.9442, "step": 2437 }, { "epoch": 0.31048919878376874, "grad_norm": 5.441022411645922, "learning_rate": 1.9189105778470295e-05, "loss": 0.9427, "step": 2438 }, { "epoch": 0.3106165528439754, "grad_norm": 5.181392153272718, "learning_rate": 1.9188291999003207e-05, "loss": 0.9488, "step": 2439 }, { "epoch": 0.31074390690418197, "grad_norm": 4.626442761532063, "learning_rate": 1.9187477828676618e-05, "loss": 1.0714, "step": 2440 }, { "epoch": 0.3108712609643886, "grad_norm": 8.557714754040557, "learning_rate": 1.9186663267525168e-05, "loss": 0.9898, "step": 2441 }, { "epoch": 0.31099861502459525, "grad_norm": 5.0695124907549936, "learning_rate": 1.91858483155835e-05, "loss": 0.9832, "step": 2442 }, { "epoch": 0.3111259690848019, "grad_norm": 3.884945320127538, "learning_rate": 1.918503297288629e-05, "loss": 0.9587, "step": 2443 }, { "epoch": 0.31125332314500853, "grad_norm": 4.215082679743105, "learning_rate": 1.9184217239468213e-05, "loss": 0.9267, "step": 2444 }, { "epoch": 0.3113806772052152, "grad_norm": 4.98022307145279, "learning_rate": 1.9183401115363973e-05, "loss": 1.0212, "step": 2445 }, { "epoch": 0.31150803126542176, "grad_norm": 5.014354319058492, "learning_rate": 1.918258460060829e-05, "loss": 1.0073, "step": 2446 }, { "epoch": 0.3116353853256284, "grad_norm": 5.57827613262524, "learning_rate": 1.9181767695235895e-05, "loss": 0.9561, "step": 2447 }, { "epoch": 0.31176273938583504, "grad_norm": 7.487119422372869, "learning_rate": 1.9180950399281538e-05, "loss": 1.0213, "step": 2448 }, { "epoch": 0.3118900934460417, "grad_norm": 4.719312217516074, "learning_rate": 1.9180132712779987e-05, "loss": 0.9697, "step": 2449 }, { "epoch": 0.3120174475062483, "grad_norm": 6.402163807340111, "learning_rate": 1.917931463576602e-05, "loss": 0.9637, "step": 2450 }, { "epoch": 0.31214480156645497, "grad_norm": 6.23163225078576, "learning_rate": 1.9178496168274447e-05, "loss": 0.9457, "step": 2451 }, { "epoch": 0.31227215562666155, "grad_norm": 4.837846468561104, "learning_rate": 1.9177677310340076e-05, "loss": 0.9633, "step": 2452 }, { "epoch": 0.3123995096868682, "grad_norm": 3.704980570424315, "learning_rate": 1.9176858061997744e-05, "loss": 0.9492, "step": 2453 }, { "epoch": 0.31252686374707483, "grad_norm": 4.167765103828833, "learning_rate": 1.91760384232823e-05, "loss": 0.9249, "step": 2454 }, { "epoch": 0.3126542178072815, "grad_norm": 4.387664339731823, "learning_rate": 1.9175218394228614e-05, "loss": 0.998, "step": 2455 }, { "epoch": 0.3127815718674881, "grad_norm": 5.220068302511543, "learning_rate": 1.9174397974871563e-05, "loss": 0.9495, "step": 2456 }, { "epoch": 0.31290892592769476, "grad_norm": 6.139088834073545, "learning_rate": 1.917357716524605e-05, "loss": 1.0069, "step": 2457 }, { "epoch": 0.31303627998790134, "grad_norm": 5.935157715422025, "learning_rate": 1.9172755965386995e-05, "loss": 0.9882, "step": 2458 }, { "epoch": 0.313163634048108, "grad_norm": 4.775640650407667, "learning_rate": 1.9171934375329323e-05, "loss": 0.9753, "step": 2459 }, { "epoch": 0.3132909881083146, "grad_norm": 4.736041389516449, "learning_rate": 1.9171112395107988e-05, "loss": 1.0053, "step": 2460 }, { "epoch": 0.31341834216852127, "grad_norm": 4.4744840526972265, "learning_rate": 1.9170290024757958e-05, "loss": 0.9477, "step": 2461 }, { "epoch": 0.3135456962287279, "grad_norm": 5.5772488971920975, "learning_rate": 1.916946726431421e-05, "loss": 0.8998, "step": 2462 }, { "epoch": 0.31367305028893455, "grad_norm": 4.561742850408378, "learning_rate": 1.9168644113811746e-05, "loss": 0.8928, "step": 2463 }, { "epoch": 0.31380040434914114, "grad_norm": 6.121922868324371, "learning_rate": 1.9167820573285584e-05, "loss": 0.9256, "step": 2464 }, { "epoch": 0.3139277584093478, "grad_norm": 4.3670794041994485, "learning_rate": 1.9166996642770756e-05, "loss": 0.9084, "step": 2465 }, { "epoch": 0.3140551124695544, "grad_norm": 5.627768449858232, "learning_rate": 1.916617232230231e-05, "loss": 0.9832, "step": 2466 }, { "epoch": 0.31418246652976106, "grad_norm": 5.9663298328876095, "learning_rate": 1.9165347611915313e-05, "loss": 0.9159, "step": 2467 }, { "epoch": 0.3143098205899677, "grad_norm": 4.094674741242858, "learning_rate": 1.9164522511644844e-05, "loss": 1.0285, "step": 2468 }, { "epoch": 0.3144371746501743, "grad_norm": 5.5011616260422525, "learning_rate": 1.9163697021526003e-05, "loss": 0.9296, "step": 2469 }, { "epoch": 0.31456452871038093, "grad_norm": 6.219244089099167, "learning_rate": 1.9162871141593907e-05, "loss": 0.9225, "step": 2470 }, { "epoch": 0.31469188277058757, "grad_norm": 4.89578939648115, "learning_rate": 1.916204487188369e-05, "loss": 0.9342, "step": 2471 }, { "epoch": 0.3148192368307942, "grad_norm": 6.590609557421749, "learning_rate": 1.916121821243049e-05, "loss": 1.1388, "step": 2472 }, { "epoch": 0.31494659089100085, "grad_norm": 4.640709873027008, "learning_rate": 1.9160391163269486e-05, "loss": 0.9063, "step": 2473 }, { "epoch": 0.3150739449512075, "grad_norm": 4.448341557788261, "learning_rate": 1.9159563724435852e-05, "loss": 0.9749, "step": 2474 }, { "epoch": 0.3152012990114141, "grad_norm": 5.651651610288117, "learning_rate": 1.915873589596479e-05, "loss": 0.9451, "step": 2475 }, { "epoch": 0.3153286530716207, "grad_norm": 4.459680704225649, "learning_rate": 1.915790767789151e-05, "loss": 0.8976, "step": 2476 }, { "epoch": 0.31545600713182737, "grad_norm": 6.162655668633184, "learning_rate": 1.9157079070251248e-05, "loss": 0.9383, "step": 2477 }, { "epoch": 0.315583361192034, "grad_norm": 3.9692455793330774, "learning_rate": 1.915625007307925e-05, "loss": 0.9391, "step": 2478 }, { "epoch": 0.31571071525224065, "grad_norm": 5.911581872217757, "learning_rate": 1.9155420686410778e-05, "loss": 0.9762, "step": 2479 }, { "epoch": 0.3158380693124473, "grad_norm": 6.186299217153207, "learning_rate": 1.9154590910281118e-05, "loss": 0.9695, "step": 2480 }, { "epoch": 0.3159654233726539, "grad_norm": 4.59218420214825, "learning_rate": 1.915376074472557e-05, "loss": 0.9113, "step": 2481 }, { "epoch": 0.3160927774328605, "grad_norm": 4.340886990664772, "learning_rate": 1.9152930189779436e-05, "loss": 0.9518, "step": 2482 }, { "epoch": 0.31622013149306716, "grad_norm": 5.216476219202749, "learning_rate": 1.915209924547806e-05, "loss": 1.0157, "step": 2483 }, { "epoch": 0.3163474855532738, "grad_norm": 5.807949130442401, "learning_rate": 1.9151267911856782e-05, "loss": 0.9676, "step": 2484 }, { "epoch": 0.31647483961348044, "grad_norm": 5.882329386495665, "learning_rate": 1.9150436188950974e-05, "loss": 0.9858, "step": 2485 }, { "epoch": 0.3166021936736871, "grad_norm": 5.26569418479679, "learning_rate": 1.9149604076796006e-05, "loss": 0.9749, "step": 2486 }, { "epoch": 0.31672954773389367, "grad_norm": 3.9748595062954863, "learning_rate": 1.9148771575427282e-05, "loss": 0.9334, "step": 2487 }, { "epoch": 0.3168569017941003, "grad_norm": 5.51708066064955, "learning_rate": 1.9147938684880213e-05, "loss": 0.9486, "step": 2488 }, { "epoch": 0.31698425585430695, "grad_norm": 5.569014088954492, "learning_rate": 1.914710540519023e-05, "loss": 1.0413, "step": 2489 }, { "epoch": 0.3171116099145136, "grad_norm": 5.5130009014890735, "learning_rate": 1.9146271736392776e-05, "loss": 0.9071, "step": 2490 }, { "epoch": 0.31723896397472023, "grad_norm": 5.3936465433570975, "learning_rate": 1.914543767852332e-05, "loss": 0.9902, "step": 2491 }, { "epoch": 0.3173663180349269, "grad_norm": 5.1624834702344895, "learning_rate": 1.9144603231617342e-05, "loss": 0.9899, "step": 2492 }, { "epoch": 0.31749367209513346, "grad_norm": 4.457596738817127, "learning_rate": 1.9143768395710337e-05, "loss": 1.0167, "step": 2493 }, { "epoch": 0.3176210261553401, "grad_norm": 5.343690035299519, "learning_rate": 1.9142933170837814e-05, "loss": 0.9384, "step": 2494 }, { "epoch": 0.31774838021554674, "grad_norm": 4.836499897881425, "learning_rate": 1.914209755703531e-05, "loss": 0.9149, "step": 2495 }, { "epoch": 0.3178757342757534, "grad_norm": 6.277963566164265, "learning_rate": 1.914126155433836e-05, "loss": 1.0924, "step": 2496 }, { "epoch": 0.31800308833596, "grad_norm": 5.999848045968084, "learning_rate": 1.914042516278254e-05, "loss": 1.0077, "step": 2497 }, { "epoch": 0.31813044239616667, "grad_norm": 4.715286701826297, "learning_rate": 1.913958838240342e-05, "loss": 0.9707, "step": 2498 }, { "epoch": 0.31825779645637325, "grad_norm": 4.238587933532592, "learning_rate": 1.9138751213236597e-05, "loss": 0.9586, "step": 2499 }, { "epoch": 0.3183851505165799, "grad_norm": 3.910804107988168, "learning_rate": 1.913791365531769e-05, "loss": 0.8785, "step": 2500 }, { "epoch": 0.31851250457678654, "grad_norm": 5.150282116912535, "learning_rate": 1.9137075708682314e-05, "loss": 0.9502, "step": 2501 }, { "epoch": 0.3186398586369932, "grad_norm": 4.7259337456280965, "learning_rate": 1.9136237373366126e-05, "loss": 0.8284, "step": 2502 }, { "epoch": 0.3187672126971998, "grad_norm": 5.133615462902073, "learning_rate": 1.9135398649404786e-05, "loss": 0.9548, "step": 2503 }, { "epoch": 0.31889456675740646, "grad_norm": 6.855057802408761, "learning_rate": 1.9134559536833974e-05, "loss": 0.9426, "step": 2504 }, { "epoch": 0.31902192081761305, "grad_norm": 5.916067854259379, "learning_rate": 1.9133720035689375e-05, "loss": 0.8877, "step": 2505 }, { "epoch": 0.3191492748778197, "grad_norm": 6.2865981681924215, "learning_rate": 1.9132880146006708e-05, "loss": 0.945, "step": 2506 }, { "epoch": 0.31927662893802633, "grad_norm": 7.521614463867874, "learning_rate": 1.91320398678217e-05, "loss": 0.974, "step": 2507 }, { "epoch": 0.31940398299823297, "grad_norm": 4.6909043220646325, "learning_rate": 1.91311992011701e-05, "loss": 0.9644, "step": 2508 }, { "epoch": 0.3195313370584396, "grad_norm": 6.141494961136581, "learning_rate": 1.913035814608766e-05, "loss": 1.0522, "step": 2509 }, { "epoch": 0.3196586911186462, "grad_norm": 6.446043992581564, "learning_rate": 1.9129516702610165e-05, "loss": 0.9892, "step": 2510 }, { "epoch": 0.31978604517885284, "grad_norm": 4.307617298196269, "learning_rate": 1.9128674870773405e-05, "loss": 0.9824, "step": 2511 }, { "epoch": 0.3199133992390595, "grad_norm": 5.064614690225627, "learning_rate": 1.912783265061319e-05, "loss": 0.9931, "step": 2512 }, { "epoch": 0.3200407532992661, "grad_norm": 7.245433026648741, "learning_rate": 1.9126990042165352e-05, "loss": 0.8897, "step": 2513 }, { "epoch": 0.32016810735947276, "grad_norm": 5.634376171279083, "learning_rate": 1.912614704546573e-05, "loss": 0.974, "step": 2514 }, { "epoch": 0.3202954614196794, "grad_norm": 5.004217467207679, "learning_rate": 1.9125303660550183e-05, "loss": 0.9849, "step": 2515 }, { "epoch": 0.320422815479886, "grad_norm": 5.843361255138745, "learning_rate": 1.912445988745459e-05, "loss": 0.9396, "step": 2516 }, { "epoch": 0.32055016954009263, "grad_norm": 4.880379471955353, "learning_rate": 1.912361572621485e-05, "loss": 0.908, "step": 2517 }, { "epoch": 0.3206775236002993, "grad_norm": 4.419729406436897, "learning_rate": 1.9122771176866863e-05, "loss": 0.9627, "step": 2518 }, { "epoch": 0.3208048776605059, "grad_norm": 4.9628250976464106, "learning_rate": 1.9121926239446556e-05, "loss": 0.9099, "step": 2519 }, { "epoch": 0.32093223172071256, "grad_norm": 5.005861470627244, "learning_rate": 1.912108091398988e-05, "loss": 0.8976, "step": 2520 }, { "epoch": 0.3210595857809192, "grad_norm": 3.9970422776854004, "learning_rate": 1.9120235200532786e-05, "loss": 1.0194, "step": 2521 }, { "epoch": 0.3211869398411258, "grad_norm": 4.848267251608308, "learning_rate": 1.9119389099111252e-05, "loss": 0.8731, "step": 2522 }, { "epoch": 0.3213142939013324, "grad_norm": 4.8074877510569385, "learning_rate": 1.9118542609761273e-05, "loss": 0.8728, "step": 2523 }, { "epoch": 0.32144164796153907, "grad_norm": 5.108540971445134, "learning_rate": 1.9117695732518858e-05, "loss": 1.0182, "step": 2524 }, { "epoch": 0.3215690020217457, "grad_norm": 4.334173390073328, "learning_rate": 1.9116848467420025e-05, "loss": 0.9577, "step": 2525 }, { "epoch": 0.32169635608195235, "grad_norm": 4.523671936390385, "learning_rate": 1.9116000814500822e-05, "loss": 0.9517, "step": 2526 }, { "epoch": 0.321823710142159, "grad_norm": 6.159973877194984, "learning_rate": 1.9115152773797305e-05, "loss": 0.9527, "step": 2527 }, { "epoch": 0.3219510642023656, "grad_norm": 5.086640621400267, "learning_rate": 1.911430434534555e-05, "loss": 0.9254, "step": 2528 }, { "epoch": 0.3220784182625722, "grad_norm": 6.418441262847928, "learning_rate": 1.9113455529181645e-05, "loss": 0.973, "step": 2529 }, { "epoch": 0.32220577232277886, "grad_norm": 5.620796667629333, "learning_rate": 1.9112606325341706e-05, "loss": 0.9685, "step": 2530 }, { "epoch": 0.3223331263829855, "grad_norm": 4.515420136870009, "learning_rate": 1.9111756733861846e-05, "loss": 0.8841, "step": 2531 }, { "epoch": 0.32246048044319214, "grad_norm": 7.642445103701167, "learning_rate": 1.911090675477821e-05, "loss": 0.903, "step": 2532 }, { "epoch": 0.3225878345033988, "grad_norm": 4.8016431141921965, "learning_rate": 1.911005638812696e-05, "loss": 0.9992, "step": 2533 }, { "epoch": 0.32271518856360537, "grad_norm": 6.233952481668526, "learning_rate": 1.910920563394427e-05, "loss": 0.9561, "step": 2534 }, { "epoch": 0.322842542623812, "grad_norm": 5.114767512820605, "learning_rate": 1.9108354492266315e-05, "loss": 1.0435, "step": 2535 }, { "epoch": 0.32296989668401865, "grad_norm": 5.177326015052335, "learning_rate": 1.9107502963129318e-05, "loss": 0.9967, "step": 2536 }, { "epoch": 0.3230972507442253, "grad_norm": 5.145400962219729, "learning_rate": 1.91066510465695e-05, "loss": 0.957, "step": 2537 }, { "epoch": 0.32322460480443194, "grad_norm": 4.601104039926772, "learning_rate": 1.9105798742623093e-05, "loss": 1.0109, "step": 2538 }, { "epoch": 0.3233519588646386, "grad_norm": 5.871656724609366, "learning_rate": 1.9104946051326358e-05, "loss": 0.9691, "step": 2539 }, { "epoch": 0.32347931292484516, "grad_norm": 6.369771571113368, "learning_rate": 1.9104092972715564e-05, "loss": 1.0704, "step": 2540 }, { "epoch": 0.3236066669850518, "grad_norm": 4.679042867289278, "learning_rate": 1.9103239506827006e-05, "loss": 1.0007, "step": 2541 }, { "epoch": 0.32373402104525845, "grad_norm": 4.55709560590165, "learning_rate": 1.9102385653696983e-05, "loss": 1.011, "step": 2542 }, { "epoch": 0.3238613751054651, "grad_norm": 6.381860071324872, "learning_rate": 1.9101531413361824e-05, "loss": 0.9249, "step": 2543 }, { "epoch": 0.32398872916567173, "grad_norm": 4.268028616096929, "learning_rate": 1.9100676785857862e-05, "loss": 1.0004, "step": 2544 }, { "epoch": 0.32411608322587837, "grad_norm": 7.018874743228634, "learning_rate": 1.9099821771221452e-05, "loss": 0.8935, "step": 2545 }, { "epoch": 0.32424343728608496, "grad_norm": 6.625079460387273, "learning_rate": 1.9098966369488967e-05, "loss": 0.9675, "step": 2546 }, { "epoch": 0.3243707913462916, "grad_norm": 4.9580211808281955, "learning_rate": 1.9098110580696793e-05, "loss": 0.8631, "step": 2547 }, { "epoch": 0.32449814540649824, "grad_norm": 5.2104224732600635, "learning_rate": 1.909725440488134e-05, "loss": 0.8774, "step": 2548 }, { "epoch": 0.3246254994667049, "grad_norm": 4.180198566885759, "learning_rate": 1.909639784207902e-05, "loss": 0.9031, "step": 2549 }, { "epoch": 0.3247528535269115, "grad_norm": 4.561188600795451, "learning_rate": 1.9095540892326282e-05, "loss": 1.0314, "step": 2550 }, { "epoch": 0.32488020758711816, "grad_norm": 4.615030213956613, "learning_rate": 1.9094683555659565e-05, "loss": 0.9225, "step": 2551 }, { "epoch": 0.32500756164732475, "grad_norm": 7.035829400662342, "learning_rate": 1.909382583211535e-05, "loss": 0.9754, "step": 2552 }, { "epoch": 0.3251349157075314, "grad_norm": 4.931897811981918, "learning_rate": 1.9092967721730118e-05, "loss": 0.9287, "step": 2553 }, { "epoch": 0.32526226976773803, "grad_norm": 7.005240522308326, "learning_rate": 1.9092109224540375e-05, "loss": 0.8866, "step": 2554 }, { "epoch": 0.3253896238279447, "grad_norm": 4.873086798433654, "learning_rate": 1.9091250340582642e-05, "loss": 0.8508, "step": 2555 }, { "epoch": 0.3255169778881513, "grad_norm": 4.482051872324332, "learning_rate": 1.909039106989345e-05, "loss": 0.9005, "step": 2556 }, { "epoch": 0.3256443319483579, "grad_norm": 5.874537348285096, "learning_rate": 1.9089531412509354e-05, "loss": 0.9172, "step": 2557 }, { "epoch": 0.32577168600856454, "grad_norm": 6.033456981649414, "learning_rate": 1.9088671368466928e-05, "loss": 0.9298, "step": 2558 }, { "epoch": 0.3258990400687712, "grad_norm": 5.4419141369445505, "learning_rate": 1.9087810937802747e-05, "loss": 1.0205, "step": 2559 }, { "epoch": 0.3260263941289778, "grad_norm": 8.320781511869045, "learning_rate": 1.9086950120553417e-05, "loss": 1.0127, "step": 2560 }, { "epoch": 0.32615374818918447, "grad_norm": 4.204840214400268, "learning_rate": 1.9086088916755557e-05, "loss": 0.9175, "step": 2561 }, { "epoch": 0.3262811022493911, "grad_norm": 6.03144132478678, "learning_rate": 1.9085227326445805e-05, "loss": 1.0701, "step": 2562 }, { "epoch": 0.3264084563095977, "grad_norm": 6.020943329366209, "learning_rate": 1.908436534966081e-05, "loss": 1.0239, "step": 2563 }, { "epoch": 0.32653581036980434, "grad_norm": 5.113384948098608, "learning_rate": 1.9083502986437233e-05, "loss": 0.9623, "step": 2564 }, { "epoch": 0.326663164430011, "grad_norm": 5.347702260233953, "learning_rate": 1.9082640236811766e-05, "loss": 1.073, "step": 2565 }, { "epoch": 0.3267905184902176, "grad_norm": 5.2285418903430685, "learning_rate": 1.908177710082111e-05, "loss": 1.0635, "step": 2566 }, { "epoch": 0.32691787255042426, "grad_norm": 6.742131203143439, "learning_rate": 1.9080913578501973e-05, "loss": 0.9097, "step": 2567 }, { "epoch": 0.3270452266106309, "grad_norm": 5.7230741901564715, "learning_rate": 1.9080049669891098e-05, "loss": 0.9343, "step": 2568 }, { "epoch": 0.3271725806708375, "grad_norm": 6.227705904367902, "learning_rate": 1.9079185375025227e-05, "loss": 0.9226, "step": 2569 }, { "epoch": 0.32729993473104413, "grad_norm": 6.388298973085969, "learning_rate": 1.9078320693941132e-05, "loss": 1.0023, "step": 2570 }, { "epoch": 0.32742728879125077, "grad_norm": 5.17499146391676, "learning_rate": 1.9077455626675593e-05, "loss": 0.8663, "step": 2571 }, { "epoch": 0.3275546428514574, "grad_norm": 5.608686739635169, "learning_rate": 1.9076590173265406e-05, "loss": 0.9189, "step": 2572 }, { "epoch": 0.32768199691166405, "grad_norm": 5.542332640286133, "learning_rate": 1.907572433374739e-05, "loss": 0.8867, "step": 2573 }, { "epoch": 0.3278093509718707, "grad_norm": 4.179462519499881, "learning_rate": 1.9074858108158377e-05, "loss": 0.9368, "step": 2574 }, { "epoch": 0.3279367050320773, "grad_norm": 4.949297367666824, "learning_rate": 1.9073991496535216e-05, "loss": 1.0417, "step": 2575 }, { "epoch": 0.3280640590922839, "grad_norm": 5.730045024701588, "learning_rate": 1.9073124498914774e-05, "loss": 1.0475, "step": 2576 }, { "epoch": 0.32819141315249056, "grad_norm": 4.9237299254512505, "learning_rate": 1.907225711533392e-05, "loss": 0.9087, "step": 2577 }, { "epoch": 0.3283187672126972, "grad_norm": 4.12799610800153, "learning_rate": 1.9071389345829564e-05, "loss": 1.0381, "step": 2578 }, { "epoch": 0.32844612127290385, "grad_norm": 4.682591627765226, "learning_rate": 1.9070521190438618e-05, "loss": 0.9864, "step": 2579 }, { "epoch": 0.3285734753331105, "grad_norm": 5.015963438980731, "learning_rate": 1.9069652649198004e-05, "loss": 0.9771, "step": 2580 }, { "epoch": 0.3287008293933171, "grad_norm": 4.566182361919494, "learning_rate": 1.906878372214468e-05, "loss": 0.8941, "step": 2581 }, { "epoch": 0.3288281834535237, "grad_norm": 4.615081461707189, "learning_rate": 1.9067914409315603e-05, "loss": 0.9783, "step": 2582 }, { "epoch": 0.32895553751373036, "grad_norm": 7.681792831173867, "learning_rate": 1.9067044710747754e-05, "loss": 1.0475, "step": 2583 }, { "epoch": 0.329082891573937, "grad_norm": 4.902106412332563, "learning_rate": 1.906617462647813e-05, "loss": 0.9546, "step": 2584 }, { "epoch": 0.32921024563414364, "grad_norm": 5.018471168762863, "learning_rate": 1.9065304156543736e-05, "loss": 0.9655, "step": 2585 }, { "epoch": 0.3293375996943503, "grad_norm": 4.94482728045137, "learning_rate": 1.906443330098161e-05, "loss": 0.9318, "step": 2586 }, { "epoch": 0.32946495375455687, "grad_norm": 3.998596183487229, "learning_rate": 1.9063562059828794e-05, "loss": 0.9281, "step": 2587 }, { "epoch": 0.3295923078147635, "grad_norm": 5.966219860091087, "learning_rate": 1.906269043312235e-05, "loss": 0.9016, "step": 2588 }, { "epoch": 0.32971966187497015, "grad_norm": 4.088112946728596, "learning_rate": 1.906181842089936e-05, "loss": 1.03, "step": 2589 }, { "epoch": 0.3298470159351768, "grad_norm": 5.354740018116761, "learning_rate": 1.906094602319691e-05, "loss": 0.9512, "step": 2590 }, { "epoch": 0.32997436999538343, "grad_norm": 7.607380505744384, "learning_rate": 1.9060073240052112e-05, "loss": 0.979, "step": 2591 }, { "epoch": 0.3301017240555901, "grad_norm": 6.009033079135326, "learning_rate": 1.90592000715021e-05, "loss": 1.0248, "step": 2592 }, { "epoch": 0.33022907811579666, "grad_norm": 4.692340843497277, "learning_rate": 1.9058326517584014e-05, "loss": 0.9131, "step": 2593 }, { "epoch": 0.3303564321760033, "grad_norm": 6.64127208474133, "learning_rate": 1.9057452578335008e-05, "loss": 0.9541, "step": 2594 }, { "epoch": 0.33048378623620994, "grad_norm": 5.532511594185314, "learning_rate": 1.905657825379227e-05, "loss": 1.0486, "step": 2595 }, { "epoch": 0.3306111402964166, "grad_norm": 5.1316187848026145, "learning_rate": 1.9055703543992985e-05, "loss": 0.9943, "step": 2596 }, { "epoch": 0.3307384943566232, "grad_norm": 4.459822907973297, "learning_rate": 1.9054828448974363e-05, "loss": 0.9625, "step": 2597 }, { "epoch": 0.33086584841682987, "grad_norm": 4.676960400478764, "learning_rate": 1.905395296877363e-05, "loss": 1.1066, "step": 2598 }, { "epoch": 0.33099320247703645, "grad_norm": 4.441141702763315, "learning_rate": 1.905307710342803e-05, "loss": 0.9407, "step": 2599 }, { "epoch": 0.3311205565372431, "grad_norm": 4.04179269390972, "learning_rate": 1.905220085297482e-05, "loss": 0.9111, "step": 2600 }, { "epoch": 0.33124791059744974, "grad_norm": 4.66613641632841, "learning_rate": 1.9051324217451275e-05, "loss": 1.0248, "step": 2601 }, { "epoch": 0.3313752646576564, "grad_norm": 4.244028721056945, "learning_rate": 1.9050447196894687e-05, "loss": 1.0137, "step": 2602 }, { "epoch": 0.331502618717863, "grad_norm": 5.621327706161037, "learning_rate": 1.904956979134236e-05, "loss": 0.8294, "step": 2603 }, { "epoch": 0.3316299727780696, "grad_norm": 4.255260577711842, "learning_rate": 1.9048692000831618e-05, "loss": 0.8888, "step": 2604 }, { "epoch": 0.33175732683827625, "grad_norm": 5.888650173571123, "learning_rate": 1.9047813825399803e-05, "loss": 1.0511, "step": 2605 }, { "epoch": 0.3318846808984829, "grad_norm": 4.434574506060834, "learning_rate": 1.904693526508427e-05, "loss": 0.9594, "step": 2606 }, { "epoch": 0.33201203495868953, "grad_norm": 6.103145301265519, "learning_rate": 1.9046056319922403e-05, "loss": 0.9428, "step": 2607 }, { "epoch": 0.33213938901889617, "grad_norm": 4.533274862435936, "learning_rate": 1.9045176989951573e-05, "loss": 0.9001, "step": 2608 }, { "epoch": 0.3322667430791028, "grad_norm": 5.754522121020006, "learning_rate": 1.9044297275209195e-05, "loss": 1.0338, "step": 2609 }, { "epoch": 0.3323940971393094, "grad_norm": 4.404140968692889, "learning_rate": 1.9043417175732693e-05, "loss": 0.9851, "step": 2610 }, { "epoch": 0.33252145119951604, "grad_norm": 5.627795732642542, "learning_rate": 1.9042536691559502e-05, "loss": 0.9912, "step": 2611 }, { "epoch": 0.3326488052597227, "grad_norm": 4.534720939621769, "learning_rate": 1.9041655822727077e-05, "loss": 0.9879, "step": 2612 }, { "epoch": 0.3327761593199293, "grad_norm": 4.715980373817354, "learning_rate": 1.9040774569272895e-05, "loss": 0.9485, "step": 2613 }, { "epoch": 0.33290351338013596, "grad_norm": 5.488127291546078, "learning_rate": 1.9039892931234434e-05, "loss": 0.8105, "step": 2614 }, { "epoch": 0.3330308674403426, "grad_norm": 5.192552978069045, "learning_rate": 1.9039010908649206e-05, "loss": 0.8612, "step": 2615 }, { "epoch": 0.3331582215005492, "grad_norm": 5.915126367220362, "learning_rate": 1.9038128501554723e-05, "loss": 0.9894, "step": 2616 }, { "epoch": 0.33328557556075583, "grad_norm": 4.774280130337918, "learning_rate": 1.903724570998853e-05, "loss": 0.9293, "step": 2617 }, { "epoch": 0.3334129296209625, "grad_norm": 6.764607200975981, "learning_rate": 1.9036362533988173e-05, "loss": 0.9333, "step": 2618 }, { "epoch": 0.3335402836811691, "grad_norm": 4.6822358136210385, "learning_rate": 1.9035478973591227e-05, "loss": 1.0333, "step": 2619 }, { "epoch": 0.33366763774137576, "grad_norm": 4.467354082949035, "learning_rate": 1.9034595028835277e-05, "loss": 0.9625, "step": 2620 }, { "epoch": 0.3337949918015824, "grad_norm": 4.880646199138695, "learning_rate": 1.903371069975792e-05, "loss": 0.9344, "step": 2621 }, { "epoch": 0.333922345861789, "grad_norm": 6.668296265114928, "learning_rate": 1.903282598639678e-05, "loss": 0.8734, "step": 2622 }, { "epoch": 0.3340496999219956, "grad_norm": 5.489550808177435, "learning_rate": 1.9031940888789485e-05, "loss": 0.9822, "step": 2623 }, { "epoch": 0.33417705398220227, "grad_norm": 7.8691774671823085, "learning_rate": 1.903105540697369e-05, "loss": 0.9388, "step": 2624 }, { "epoch": 0.3343044080424089, "grad_norm": 5.993557650251207, "learning_rate": 1.903016954098707e-05, "loss": 0.9222, "step": 2625 }, { "epoch": 0.33443176210261555, "grad_norm": 4.938623614363187, "learning_rate": 1.9029283290867297e-05, "loss": 0.9431, "step": 2626 }, { "epoch": 0.3345591161628222, "grad_norm": 6.286933718713447, "learning_rate": 1.9028396656652072e-05, "loss": 1.0292, "step": 2627 }, { "epoch": 0.3346864702230288, "grad_norm": 5.920288839768636, "learning_rate": 1.9027509638379122e-05, "loss": 0.9324, "step": 2628 }, { "epoch": 0.3348138242832354, "grad_norm": 5.797628759851785, "learning_rate": 1.9026622236086167e-05, "loss": 0.9915, "step": 2629 }, { "epoch": 0.33494117834344206, "grad_norm": 5.674226467836896, "learning_rate": 1.9025734449810964e-05, "loss": 0.9482, "step": 2630 }, { "epoch": 0.3350685324036487, "grad_norm": 5.022806985301813, "learning_rate": 1.9024846279591275e-05, "loss": 0.9048, "step": 2631 }, { "epoch": 0.33519588646385534, "grad_norm": 4.8296654475192105, "learning_rate": 1.9023957725464887e-05, "loss": 0.8706, "step": 2632 }, { "epoch": 0.335323240524062, "grad_norm": 5.932874303198964, "learning_rate": 1.9023068787469587e-05, "loss": 0.8607, "step": 2633 }, { "epoch": 0.33545059458426857, "grad_norm": 4.656322913911071, "learning_rate": 1.90221794656432e-05, "loss": 0.8791, "step": 2634 }, { "epoch": 0.3355779486444752, "grad_norm": 4.343813230205259, "learning_rate": 1.9021289760023555e-05, "loss": 0.9553, "step": 2635 }, { "epoch": 0.33570530270468185, "grad_norm": 4.52525870073676, "learning_rate": 1.9020399670648496e-05, "loss": 1.0068, "step": 2636 }, { "epoch": 0.3358326567648885, "grad_norm": 4.907744550553373, "learning_rate": 1.9019509197555884e-05, "loss": 1.1055, "step": 2637 }, { "epoch": 0.33596001082509513, "grad_norm": 4.978867504544343, "learning_rate": 1.901861834078361e-05, "loss": 0.9022, "step": 2638 }, { "epoch": 0.3360873648853018, "grad_norm": 6.919151441993888, "learning_rate": 1.9017727100369555e-05, "loss": 1.0947, "step": 2639 }, { "epoch": 0.33621471894550836, "grad_norm": 5.08193667043044, "learning_rate": 1.9016835476351644e-05, "loss": 1.0768, "step": 2640 }, { "epoch": 0.336342073005715, "grad_norm": 6.440004028413828, "learning_rate": 1.9015943468767796e-05, "loss": 0.9227, "step": 2641 }, { "epoch": 0.33646942706592164, "grad_norm": 6.011117172629888, "learning_rate": 1.9015051077655963e-05, "loss": 0.9747, "step": 2642 }, { "epoch": 0.3365967811261283, "grad_norm": 4.857490983877095, "learning_rate": 1.90141583030541e-05, "loss": 0.9739, "step": 2643 }, { "epoch": 0.3367241351863349, "grad_norm": 5.350463754394095, "learning_rate": 1.9013265145000193e-05, "loss": 1.0133, "step": 2644 }, { "epoch": 0.33685148924654157, "grad_norm": 5.4614798913627585, "learning_rate": 1.901237160353223e-05, "loss": 1.0212, "step": 2645 }, { "epoch": 0.33697884330674815, "grad_norm": 6.313471049035682, "learning_rate": 1.9011477678688218e-05, "loss": 0.9876, "step": 2646 }, { "epoch": 0.3371061973669548, "grad_norm": 5.968246997988933, "learning_rate": 1.9010583370506195e-05, "loss": 0.918, "step": 2647 }, { "epoch": 0.33723355142716144, "grad_norm": 5.19784984150692, "learning_rate": 1.900968867902419e-05, "loss": 0.9453, "step": 2648 }, { "epoch": 0.3373609054873681, "grad_norm": 4.1554674831674605, "learning_rate": 1.9008793604280275e-05, "loss": 1.0021, "step": 2649 }, { "epoch": 0.3374882595475747, "grad_norm": 6.418350923458467, "learning_rate": 1.9007898146312517e-05, "loss": 0.8429, "step": 2650 }, { "epoch": 0.3376156136077813, "grad_norm": 4.557100000619847, "learning_rate": 1.9007002305159013e-05, "loss": 0.9218, "step": 2651 }, { "epoch": 0.33774296766798795, "grad_norm": 4.072479432711527, "learning_rate": 1.9006106080857864e-05, "loss": 0.9863, "step": 2652 }, { "epoch": 0.3378703217281946, "grad_norm": 4.980052640399242, "learning_rate": 1.90052094734472e-05, "loss": 0.9362, "step": 2653 }, { "epoch": 0.33799767578840123, "grad_norm": 7.005265026973964, "learning_rate": 1.9004312482965162e-05, "loss": 0.8914, "step": 2654 }, { "epoch": 0.33812502984860787, "grad_norm": 4.716318881059867, "learning_rate": 1.90034151094499e-05, "loss": 0.9816, "step": 2655 }, { "epoch": 0.3382523839088145, "grad_norm": 4.613459649056408, "learning_rate": 1.90025173529396e-05, "loss": 1.0093, "step": 2656 }, { "epoch": 0.3383797379690211, "grad_norm": 3.941037239214319, "learning_rate": 1.9001619213472438e-05, "loss": 0.9049, "step": 2657 }, { "epoch": 0.33850709202922774, "grad_norm": 4.473675127876439, "learning_rate": 1.900072069108663e-05, "loss": 0.9904, "step": 2658 }, { "epoch": 0.3386344460894344, "grad_norm": 4.606911210903827, "learning_rate": 1.899982178582039e-05, "loss": 0.8868, "step": 2659 }, { "epoch": 0.338761800149641, "grad_norm": 9.17534762771174, "learning_rate": 1.8998922497711964e-05, "loss": 0.8799, "step": 2660 }, { "epoch": 0.33888915420984767, "grad_norm": 5.785251201955414, "learning_rate": 1.8998022826799604e-05, "loss": 0.9332, "step": 2661 }, { "epoch": 0.3390165082700543, "grad_norm": 5.088986468779313, "learning_rate": 1.8997122773121576e-05, "loss": 0.9803, "step": 2662 }, { "epoch": 0.3391438623302609, "grad_norm": 4.882395685334591, "learning_rate": 1.8996222336716172e-05, "loss": 0.9261, "step": 2663 }, { "epoch": 0.33927121639046753, "grad_norm": 5.851241099109004, "learning_rate": 1.89953215176217e-05, "loss": 1.0033, "step": 2664 }, { "epoch": 0.3393985704506742, "grad_norm": 5.246861927772764, "learning_rate": 1.899442031587647e-05, "loss": 1.1042, "step": 2665 }, { "epoch": 0.3395259245108808, "grad_norm": 5.934569589358397, "learning_rate": 1.8993518731518822e-05, "loss": 0.9844, "step": 2666 }, { "epoch": 0.33965327857108746, "grad_norm": 5.772102996668579, "learning_rate": 1.899261676458711e-05, "loss": 0.8874, "step": 2667 }, { "epoch": 0.3397806326312941, "grad_norm": 4.113700649692236, "learning_rate": 1.8991714415119705e-05, "loss": 1.0184, "step": 2668 }, { "epoch": 0.3399079866915007, "grad_norm": 5.297719162372073, "learning_rate": 1.8990811683154987e-05, "loss": 0.9489, "step": 2669 }, { "epoch": 0.3400353407517073, "grad_norm": 4.963074711880038, "learning_rate": 1.8989908568731356e-05, "loss": 0.9118, "step": 2670 }, { "epoch": 0.34016269481191397, "grad_norm": 5.010745327877354, "learning_rate": 1.8989005071887237e-05, "loss": 0.984, "step": 2671 }, { "epoch": 0.3402900488721206, "grad_norm": 5.997081682659999, "learning_rate": 1.8988101192661057e-05, "loss": 0.9609, "step": 2672 }, { "epoch": 0.34041740293232725, "grad_norm": 5.295408873076435, "learning_rate": 1.8987196931091266e-05, "loss": 0.928, "step": 2673 }, { "epoch": 0.3405447569925339, "grad_norm": 5.86005595001487, "learning_rate": 1.8986292287216336e-05, "loss": 0.8758, "step": 2674 }, { "epoch": 0.3406721110527405, "grad_norm": 5.696962303375719, "learning_rate": 1.8985387261074745e-05, "loss": 1.0028, "step": 2675 }, { "epoch": 0.3407994651129471, "grad_norm": 6.769568723298619, "learning_rate": 1.8984481852704994e-05, "loss": 0.887, "step": 2676 }, { "epoch": 0.34092681917315376, "grad_norm": 5.561314831606821, "learning_rate": 1.8983576062145594e-05, "loss": 0.9901, "step": 2677 }, { "epoch": 0.3410541732333604, "grad_norm": 5.756611547392125, "learning_rate": 1.898266988943508e-05, "loss": 0.9414, "step": 2678 }, { "epoch": 0.34118152729356704, "grad_norm": 4.89537544003037, "learning_rate": 1.8981763334611998e-05, "loss": 1.0601, "step": 2679 }, { "epoch": 0.3413088813537737, "grad_norm": 5.591903978628493, "learning_rate": 1.8980856397714914e-05, "loss": 0.9329, "step": 2680 }, { "epoch": 0.34143623541398027, "grad_norm": 5.680999730203331, "learning_rate": 1.8979949078782404e-05, "loss": 1.1186, "step": 2681 }, { "epoch": 0.3415635894741869, "grad_norm": 4.759601726379005, "learning_rate": 1.8979041377853068e-05, "loss": 0.9277, "step": 2682 }, { "epoch": 0.34169094353439355, "grad_norm": 6.369109779804948, "learning_rate": 1.8978133294965516e-05, "loss": 0.9856, "step": 2683 }, { "epoch": 0.3418182975946002, "grad_norm": 4.901043699009387, "learning_rate": 1.897722483015838e-05, "loss": 0.8883, "step": 2684 }, { "epoch": 0.34194565165480684, "grad_norm": 4.264040250849452, "learning_rate": 1.8976315983470305e-05, "loss": 0.9974, "step": 2685 }, { "epoch": 0.3420730057150135, "grad_norm": 6.210438757491041, "learning_rate": 1.897540675493995e-05, "loss": 0.901, "step": 2686 }, { "epoch": 0.34220035977522006, "grad_norm": 6.073060894971596, "learning_rate": 1.897449714460599e-05, "loss": 0.9285, "step": 2687 }, { "epoch": 0.3423277138354267, "grad_norm": 8.17078819852505, "learning_rate": 1.8973587152507125e-05, "loss": 0.9444, "step": 2688 }, { "epoch": 0.34245506789563335, "grad_norm": 5.885135430112184, "learning_rate": 1.897267677868206e-05, "loss": 0.9022, "step": 2689 }, { "epoch": 0.34258242195584, "grad_norm": 5.221357596404994, "learning_rate": 1.8971766023169523e-05, "loss": 0.945, "step": 2690 }, { "epoch": 0.34270977601604663, "grad_norm": 4.874377479909654, "learning_rate": 1.897085488600826e-05, "loss": 0.9542, "step": 2691 }, { "epoch": 0.3428371300762532, "grad_norm": 6.4016132228891145, "learning_rate": 1.8969943367237023e-05, "loss": 0.9382, "step": 2692 }, { "epoch": 0.34296448413645986, "grad_norm": 4.688116007700764, "learning_rate": 1.896903146689459e-05, "loss": 1.0233, "step": 2693 }, { "epoch": 0.3430918381966665, "grad_norm": 4.423170998379805, "learning_rate": 1.8968119185019752e-05, "loss": 0.8811, "step": 2694 }, { "epoch": 0.34321919225687314, "grad_norm": 4.925819197167471, "learning_rate": 1.896720652165132e-05, "loss": 1.0469, "step": 2695 }, { "epoch": 0.3433465463170798, "grad_norm": 5.241618050176448, "learning_rate": 1.8966293476828113e-05, "loss": 0.991, "step": 2696 }, { "epoch": 0.3434739003772864, "grad_norm": 5.70471002161057, "learning_rate": 1.8965380050588977e-05, "loss": 0.9894, "step": 2697 }, { "epoch": 0.343601254437493, "grad_norm": 5.8313267480965605, "learning_rate": 1.8964466242972758e-05, "loss": 0.9719, "step": 2698 }, { "epoch": 0.34372860849769965, "grad_norm": 5.131299868677054, "learning_rate": 1.8963552054018335e-05, "loss": 0.9245, "step": 2699 }, { "epoch": 0.3438559625579063, "grad_norm": 6.486207266545669, "learning_rate": 1.8962637483764597e-05, "loss": 0.9324, "step": 2700 }, { "epoch": 0.34398331661811293, "grad_norm": 4.898501747111568, "learning_rate": 1.8961722532250447e-05, "loss": 0.9144, "step": 2701 }, { "epoch": 0.3441106706783196, "grad_norm": 4.836383164288827, "learning_rate": 1.8960807199514805e-05, "loss": 0.9566, "step": 2702 }, { "epoch": 0.3442380247385262, "grad_norm": 4.156563424360613, "learning_rate": 1.8959891485596613e-05, "loss": 0.9324, "step": 2703 }, { "epoch": 0.3443653787987328, "grad_norm": 3.904501073324452, "learning_rate": 1.8958975390534817e-05, "loss": 0.9814, "step": 2704 }, { "epoch": 0.34449273285893944, "grad_norm": 6.786592268970057, "learning_rate": 1.8958058914368393e-05, "loss": 1.0467, "step": 2705 }, { "epoch": 0.3446200869191461, "grad_norm": 6.585865097281438, "learning_rate": 1.8957142057136323e-05, "loss": 0.9099, "step": 2706 }, { "epoch": 0.3447474409793527, "grad_norm": 5.4083581243024215, "learning_rate": 1.895622481887761e-05, "loss": 1.0338, "step": 2707 }, { "epoch": 0.34487479503955937, "grad_norm": 4.2346031437184335, "learning_rate": 1.8955307199631277e-05, "loss": 0.996, "step": 2708 }, { "epoch": 0.345002149099766, "grad_norm": 5.0763300613819275, "learning_rate": 1.8954389199436353e-05, "loss": 1.0593, "step": 2709 }, { "epoch": 0.3451295031599726, "grad_norm": 4.504865559086568, "learning_rate": 1.8953470818331885e-05, "loss": 1.0109, "step": 2710 }, { "epoch": 0.34525685722017924, "grad_norm": 4.411564212806035, "learning_rate": 1.895255205635695e-05, "loss": 0.9596, "step": 2711 }, { "epoch": 0.3453842112803859, "grad_norm": 5.169314208597407, "learning_rate": 1.8951632913550625e-05, "loss": 1.0357, "step": 2712 }, { "epoch": 0.3455115653405925, "grad_norm": 5.373941184150419, "learning_rate": 1.895071338995201e-05, "loss": 0.9423, "step": 2713 }, { "epoch": 0.34563891940079916, "grad_norm": 5.164452519311098, "learning_rate": 1.894979348560022e-05, "loss": 0.9505, "step": 2714 }, { "epoch": 0.3457662734610058, "grad_norm": 6.359659924322003, "learning_rate": 1.8948873200534386e-05, "loss": 0.9462, "step": 2715 }, { "epoch": 0.3458936275212124, "grad_norm": 4.959538203860984, "learning_rate": 1.8947952534793663e-05, "loss": 0.9285, "step": 2716 }, { "epoch": 0.34602098158141903, "grad_norm": 6.295713255987071, "learning_rate": 1.8947031488417203e-05, "loss": 1.0014, "step": 2717 }, { "epoch": 0.34614833564162567, "grad_norm": 6.542061504264993, "learning_rate": 1.8946110061444196e-05, "loss": 0.963, "step": 2718 }, { "epoch": 0.3462756897018323, "grad_norm": 4.913011302013352, "learning_rate": 1.8945188253913837e-05, "loss": 0.9696, "step": 2719 }, { "epoch": 0.34640304376203895, "grad_norm": 8.438456615331273, "learning_rate": 1.8944266065865334e-05, "loss": 1.0225, "step": 2720 }, { "epoch": 0.3465303978222456, "grad_norm": 4.89273639963552, "learning_rate": 1.894334349733792e-05, "loss": 0.9205, "step": 2721 }, { "epoch": 0.3466577518824522, "grad_norm": 5.831267872217722, "learning_rate": 1.8942420548370836e-05, "loss": 1.0323, "step": 2722 }, { "epoch": 0.3467851059426588, "grad_norm": 5.377832420498871, "learning_rate": 1.894149721900335e-05, "loss": 0.931, "step": 2723 }, { "epoch": 0.34691246000286546, "grad_norm": 5.2599340639224295, "learning_rate": 1.894057350927473e-05, "loss": 1.0013, "step": 2724 }, { "epoch": 0.3470398140630721, "grad_norm": 5.201803320194106, "learning_rate": 1.8939649419224278e-05, "loss": 0.9274, "step": 2725 }, { "epoch": 0.34716716812327875, "grad_norm": 5.285171948597805, "learning_rate": 1.89387249488913e-05, "loss": 0.9969, "step": 2726 }, { "epoch": 0.3472945221834854, "grad_norm": 4.584910497093419, "learning_rate": 1.893780009831512e-05, "loss": 0.9819, "step": 2727 }, { "epoch": 0.347421876243692, "grad_norm": 6.267111585257623, "learning_rate": 1.8936874867535085e-05, "loss": 0.9984, "step": 2728 }, { "epoch": 0.3475492303038986, "grad_norm": 6.362933538689009, "learning_rate": 1.8935949256590552e-05, "loss": 1.011, "step": 2729 }, { "epoch": 0.34767658436410526, "grad_norm": 4.268885445854585, "learning_rate": 1.8935023265520892e-05, "loss": 0.9676, "step": 2730 }, { "epoch": 0.3478039384243119, "grad_norm": 4.993840047050701, "learning_rate": 1.89340968943655e-05, "loss": 0.9638, "step": 2731 }, { "epoch": 0.34793129248451854, "grad_norm": 5.28595844279309, "learning_rate": 1.8933170143163782e-05, "loss": 0.8549, "step": 2732 }, { "epoch": 0.3480586465447252, "grad_norm": 5.5992479636812895, "learning_rate": 1.8932243011955154e-05, "loss": 1.0261, "step": 2733 }, { "epoch": 0.34818600060493177, "grad_norm": 7.832502469848672, "learning_rate": 1.8931315500779066e-05, "loss": 0.8554, "step": 2734 }, { "epoch": 0.3483133546651384, "grad_norm": 5.713022167289004, "learning_rate": 1.8930387609674964e-05, "loss": 1.0488, "step": 2735 }, { "epoch": 0.34844070872534505, "grad_norm": 6.649868623970285, "learning_rate": 1.8929459338682323e-05, "loss": 1.0333, "step": 2736 }, { "epoch": 0.3485680627855517, "grad_norm": 4.83392637275808, "learning_rate": 1.8928530687840635e-05, "loss": 0.93, "step": 2737 }, { "epoch": 0.34869541684575833, "grad_norm": 5.0341911485987385, "learning_rate": 1.8927601657189395e-05, "loss": 0.9729, "step": 2738 }, { "epoch": 0.3488227709059649, "grad_norm": 14.025295837872504, "learning_rate": 1.892667224676813e-05, "loss": 0.8949, "step": 2739 }, { "epoch": 0.34895012496617156, "grad_norm": 4.651908130253577, "learning_rate": 1.8925742456616375e-05, "loss": 0.9562, "step": 2740 }, { "epoch": 0.3490774790263782, "grad_norm": 5.638933007245704, "learning_rate": 1.892481228677368e-05, "loss": 0.9033, "step": 2741 }, { "epoch": 0.34920483308658484, "grad_norm": 4.345561795454672, "learning_rate": 1.8923881737279614e-05, "loss": 0.9103, "step": 2742 }, { "epoch": 0.3493321871467915, "grad_norm": 4.194559779206038, "learning_rate": 1.8922950808173764e-05, "loss": 0.9609, "step": 2743 }, { "epoch": 0.3494595412069981, "grad_norm": 4.283764622766022, "learning_rate": 1.8922019499495727e-05, "loss": 1.0447, "step": 2744 }, { "epoch": 0.3495868952672047, "grad_norm": 4.5529503364024375, "learning_rate": 1.8921087811285116e-05, "loss": 0.9335, "step": 2745 }, { "epoch": 0.34971424932741135, "grad_norm": 5.622674079666395, "learning_rate": 1.8920155743581577e-05, "loss": 0.9892, "step": 2746 }, { "epoch": 0.349841603387618, "grad_norm": 3.8671778823270557, "learning_rate": 1.8919223296424746e-05, "loss": 0.9959, "step": 2747 }, { "epoch": 0.34996895744782464, "grad_norm": 5.028897321810759, "learning_rate": 1.8918290469854296e-05, "loss": 1.0099, "step": 2748 }, { "epoch": 0.3500963115080313, "grad_norm": 5.036151654421074, "learning_rate": 1.8917357263909905e-05, "loss": 0.9845, "step": 2749 }, { "epoch": 0.3502236655682379, "grad_norm": 5.5350011832107615, "learning_rate": 1.891642367863127e-05, "loss": 0.8733, "step": 2750 }, { "epoch": 0.3503510196284445, "grad_norm": 6.505568246732227, "learning_rate": 1.891548971405811e-05, "loss": 0.9439, "step": 2751 }, { "epoch": 0.35047837368865115, "grad_norm": 4.4345428929660455, "learning_rate": 1.891455537023015e-05, "loss": 0.9105, "step": 2752 }, { "epoch": 0.3506057277488578, "grad_norm": 6.672348112890964, "learning_rate": 1.8913620647187137e-05, "loss": 0.9758, "step": 2753 }, { "epoch": 0.35073308180906443, "grad_norm": 4.447276775897307, "learning_rate": 1.891268554496883e-05, "loss": 0.8875, "step": 2754 }, { "epoch": 0.35086043586927107, "grad_norm": 5.0922723979023825, "learning_rate": 1.8911750063615014e-05, "loss": 0.972, "step": 2755 }, { "epoch": 0.3509877899294777, "grad_norm": 6.7767800220341385, "learning_rate": 1.8910814203165474e-05, "loss": 1.008, "step": 2756 }, { "epoch": 0.3511151439896843, "grad_norm": 5.824552448417409, "learning_rate": 1.890987796366003e-05, "loss": 0.7824, "step": 2757 }, { "epoch": 0.35124249804989094, "grad_norm": 5.880589849249619, "learning_rate": 1.8908941345138502e-05, "loss": 1.0618, "step": 2758 }, { "epoch": 0.3513698521100976, "grad_norm": 5.794106205233841, "learning_rate": 1.890800434764074e-05, "loss": 0.9095, "step": 2759 }, { "epoch": 0.3514972061703042, "grad_norm": 5.340078955648828, "learning_rate": 1.8907066971206593e-05, "loss": 0.9057, "step": 2760 }, { "epoch": 0.35162456023051086, "grad_norm": 7.968152790504351, "learning_rate": 1.8906129215875943e-05, "loss": 0.8624, "step": 2761 }, { "epoch": 0.3517519142907175, "grad_norm": 5.134287165089572, "learning_rate": 1.8905191081688676e-05, "loss": 1.0272, "step": 2762 }, { "epoch": 0.3518792683509241, "grad_norm": 3.8623274039312996, "learning_rate": 1.8904252568684704e-05, "loss": 1.0475, "step": 2763 }, { "epoch": 0.35200662241113073, "grad_norm": 4.881146395434887, "learning_rate": 1.890331367690395e-05, "loss": 0.9093, "step": 2764 }, { "epoch": 0.3521339764713374, "grad_norm": 4.924924456796405, "learning_rate": 1.890237440638635e-05, "loss": 1.0146, "step": 2765 }, { "epoch": 0.352261330531544, "grad_norm": 4.4388969264291225, "learning_rate": 1.890143475717186e-05, "loss": 0.9316, "step": 2766 }, { "epoch": 0.35238868459175066, "grad_norm": 4.454980510493503, "learning_rate": 1.8900494729300453e-05, "loss": 0.9239, "step": 2767 }, { "epoch": 0.3525160386519573, "grad_norm": 5.4913202634779905, "learning_rate": 1.889955432281212e-05, "loss": 0.9966, "step": 2768 }, { "epoch": 0.3526433927121639, "grad_norm": 7.048082878309639, "learning_rate": 1.8898613537746857e-05, "loss": 0.962, "step": 2769 }, { "epoch": 0.3527707467723705, "grad_norm": 4.641019759422344, "learning_rate": 1.8897672374144692e-05, "loss": 1.074, "step": 2770 }, { "epoch": 0.35289810083257717, "grad_norm": 5.094820998232256, "learning_rate": 1.8896730832045655e-05, "loss": 0.9095, "step": 2771 }, { "epoch": 0.3530254548927838, "grad_norm": 5.682244524132277, "learning_rate": 1.88957889114898e-05, "loss": 0.9396, "step": 2772 }, { "epoch": 0.35315280895299045, "grad_norm": 6.277327342270094, "learning_rate": 1.88948466125172e-05, "loss": 1.0474, "step": 2773 }, { "epoch": 0.3532801630131971, "grad_norm": 5.484288750212111, "learning_rate": 1.889390393516793e-05, "loss": 1.058, "step": 2774 }, { "epoch": 0.3534075170734037, "grad_norm": 4.96698043310045, "learning_rate": 1.8892960879482092e-05, "loss": 0.9238, "step": 2775 }, { "epoch": 0.3535348711336103, "grad_norm": 5.290010488460073, "learning_rate": 1.8892017445499812e-05, "loss": 0.8931, "step": 2776 }, { "epoch": 0.35366222519381696, "grad_norm": 5.977796798606464, "learning_rate": 1.8891073633261217e-05, "loss": 1.0182, "step": 2777 }, { "epoch": 0.3537895792540236, "grad_norm": 5.114462090302804, "learning_rate": 1.8890129442806453e-05, "loss": 0.866, "step": 2778 }, { "epoch": 0.35391693331423024, "grad_norm": 6.706455354843844, "learning_rate": 1.888918487417569e-05, "loss": 0.8921, "step": 2779 }, { "epoch": 0.3540442873744369, "grad_norm": 5.326861189694861, "learning_rate": 1.88882399274091e-05, "loss": 1.0039, "step": 2780 }, { "epoch": 0.35417164143464347, "grad_norm": 6.186171572101959, "learning_rate": 1.888729460254689e-05, "loss": 0.9196, "step": 2781 }, { "epoch": 0.3542989954948501, "grad_norm": 5.827985910182617, "learning_rate": 1.888634889962927e-05, "loss": 0.9968, "step": 2782 }, { "epoch": 0.35442634955505675, "grad_norm": 4.537162343642595, "learning_rate": 1.8885402818696466e-05, "loss": 1.0051, "step": 2783 }, { "epoch": 0.3545537036152634, "grad_norm": 3.9512938830875264, "learning_rate": 1.8884456359788725e-05, "loss": 0.9294, "step": 2784 }, { "epoch": 0.35468105767547004, "grad_norm": 5.094861430042333, "learning_rate": 1.8883509522946308e-05, "loss": 1.0091, "step": 2785 }, { "epoch": 0.3548084117356766, "grad_norm": 4.989120472548204, "learning_rate": 1.8882562308209493e-05, "loss": 0.9185, "step": 2786 }, { "epoch": 0.35493576579588326, "grad_norm": 5.310560882314186, "learning_rate": 1.8881614715618575e-05, "loss": 1.0017, "step": 2787 }, { "epoch": 0.3550631198560899, "grad_norm": 5.650708260493975, "learning_rate": 1.8880666745213862e-05, "loss": 0.8541, "step": 2788 }, { "epoch": 0.35519047391629655, "grad_norm": 5.569013232721796, "learning_rate": 1.887971839703568e-05, "loss": 0.9467, "step": 2789 }, { "epoch": 0.3553178279765032, "grad_norm": 5.8768132434956435, "learning_rate": 1.887876967112437e-05, "loss": 0.9919, "step": 2790 }, { "epoch": 0.35544518203670983, "grad_norm": 6.124315496070573, "learning_rate": 1.8877820567520286e-05, "loss": 1.0021, "step": 2791 }, { "epoch": 0.3555725360969164, "grad_norm": 3.9439859407340703, "learning_rate": 1.8876871086263807e-05, "loss": 1.1124, "step": 2792 }, { "epoch": 0.35569989015712306, "grad_norm": 4.965043130708425, "learning_rate": 1.8875921227395323e-05, "loss": 0.9407, "step": 2793 }, { "epoch": 0.3558272442173297, "grad_norm": 4.713357635895819, "learning_rate": 1.887497099095524e-05, "loss": 0.9837, "step": 2794 }, { "epoch": 0.35595459827753634, "grad_norm": 5.171273648892833, "learning_rate": 1.8874020376983974e-05, "loss": 1.0284, "step": 2795 }, { "epoch": 0.356081952337743, "grad_norm": 5.7194336862922475, "learning_rate": 1.887306938552197e-05, "loss": 0.9462, "step": 2796 }, { "epoch": 0.3562093063979496, "grad_norm": 4.630686794944254, "learning_rate": 1.8872118016609677e-05, "loss": 0.9091, "step": 2797 }, { "epoch": 0.3563366604581562, "grad_norm": 5.280188640056807, "learning_rate": 1.887116627028757e-05, "loss": 0.88, "step": 2798 }, { "epoch": 0.35646401451836285, "grad_norm": 6.115073119979459, "learning_rate": 1.8870214146596134e-05, "loss": 0.9775, "step": 2799 }, { "epoch": 0.3565913685785695, "grad_norm": 4.394690535307697, "learning_rate": 1.8869261645575867e-05, "loss": 0.9835, "step": 2800 }, { "epoch": 0.35671872263877613, "grad_norm": 5.050394159929925, "learning_rate": 1.8868308767267294e-05, "loss": 0.9251, "step": 2801 }, { "epoch": 0.3568460766989828, "grad_norm": 6.0567968225989, "learning_rate": 1.8867355511710942e-05, "loss": 0.9727, "step": 2802 }, { "epoch": 0.3569734307591894, "grad_norm": 4.986890965039478, "learning_rate": 1.8866401878947365e-05, "loss": 0.9626, "step": 2803 }, { "epoch": 0.357100784819396, "grad_norm": 6.266019204693198, "learning_rate": 1.8865447869017134e-05, "loss": 0.893, "step": 2804 }, { "epoch": 0.35722813887960264, "grad_norm": 5.652046116688697, "learning_rate": 1.8864493481960825e-05, "loss": 0.8861, "step": 2805 }, { "epoch": 0.3573554929398093, "grad_norm": 4.635686127906509, "learning_rate": 1.8863538717819037e-05, "loss": 1.0814, "step": 2806 }, { "epoch": 0.3574828470000159, "grad_norm": 6.127510023603019, "learning_rate": 1.8862583576632388e-05, "loss": 0.9553, "step": 2807 }, { "epoch": 0.35761020106022257, "grad_norm": 5.784405481226246, "learning_rate": 1.8861628058441505e-05, "loss": 0.9891, "step": 2808 }, { "epoch": 0.3577375551204292, "grad_norm": 6.475174077466651, "learning_rate": 1.8860672163287038e-05, "loss": 1.0264, "step": 2809 }, { "epoch": 0.3578649091806358, "grad_norm": 6.027637724218653, "learning_rate": 1.885971589120965e-05, "loss": 0.962, "step": 2810 }, { "epoch": 0.35799226324084243, "grad_norm": 5.6316722715782115, "learning_rate": 1.8858759242250014e-05, "loss": 0.9262, "step": 2811 }, { "epoch": 0.3581196173010491, "grad_norm": 6.223388475535787, "learning_rate": 1.885780221644883e-05, "loss": 1.0871, "step": 2812 }, { "epoch": 0.3582469713612557, "grad_norm": 4.580507459221563, "learning_rate": 1.8856844813846812e-05, "loss": 0.9104, "step": 2813 }, { "epoch": 0.35837432542146236, "grad_norm": 6.215065943152524, "learning_rate": 1.8855887034484678e-05, "loss": 0.9589, "step": 2814 }, { "epoch": 0.358501679481669, "grad_norm": 4.489169332558833, "learning_rate": 1.8854928878403173e-05, "loss": 0.9672, "step": 2815 }, { "epoch": 0.3586290335418756, "grad_norm": 9.765703124687503, "learning_rate": 1.885397034564306e-05, "loss": 0.9291, "step": 2816 }, { "epoch": 0.3587563876020822, "grad_norm": 5.523925932313763, "learning_rate": 1.8853011436245113e-05, "loss": 0.9819, "step": 2817 }, { "epoch": 0.35888374166228887, "grad_norm": 4.364545265098502, "learning_rate": 1.8852052150250123e-05, "loss": 1.0217, "step": 2818 }, { "epoch": 0.3590110957224955, "grad_norm": 5.659694571190229, "learning_rate": 1.8851092487698896e-05, "loss": 1.0377, "step": 2819 }, { "epoch": 0.35913844978270215, "grad_norm": 4.616616357375445, "learning_rate": 1.885013244863225e-05, "loss": 0.8517, "step": 2820 }, { "epoch": 0.3592658038429088, "grad_norm": 4.161922627864092, "learning_rate": 1.8849172033091032e-05, "loss": 0.9806, "step": 2821 }, { "epoch": 0.3593931579031154, "grad_norm": 4.753201459862814, "learning_rate": 1.884821124111609e-05, "loss": 0.9091, "step": 2822 }, { "epoch": 0.359520511963322, "grad_norm": 5.397430811867483, "learning_rate": 1.8847250072748307e-05, "loss": 0.9048, "step": 2823 }, { "epoch": 0.35964786602352866, "grad_norm": 6.30801564688668, "learning_rate": 1.8846288528028555e-05, "loss": 0.95, "step": 2824 }, { "epoch": 0.3597752200837353, "grad_norm": 4.702596742485637, "learning_rate": 1.8845326606997747e-05, "loss": 1.0221, "step": 2825 }, { "epoch": 0.35990257414394194, "grad_norm": 4.7643751162574395, "learning_rate": 1.8844364309696798e-05, "loss": 0.9242, "step": 2826 }, { "epoch": 0.3600299282041486, "grad_norm": 5.457328684516776, "learning_rate": 1.8843401636166642e-05, "loss": 0.8829, "step": 2827 }, { "epoch": 0.36015728226435517, "grad_norm": 4.333232193769126, "learning_rate": 1.884243858644823e-05, "loss": 0.9323, "step": 2828 }, { "epoch": 0.3602846363245618, "grad_norm": 5.100519841945628, "learning_rate": 1.8841475160582533e-05, "loss": 0.8748, "step": 2829 }, { "epoch": 0.36041199038476845, "grad_norm": 7.685310610250271, "learning_rate": 1.8840511358610535e-05, "loss": 0.9904, "step": 2830 }, { "epoch": 0.3605393444449751, "grad_norm": 5.435347207637993, "learning_rate": 1.8839547180573228e-05, "loss": 0.8986, "step": 2831 }, { "epoch": 0.36066669850518174, "grad_norm": 6.886759463063859, "learning_rate": 1.8838582626511633e-05, "loss": 0.9547, "step": 2832 }, { "epoch": 0.3607940525653883, "grad_norm": 6.528054363622752, "learning_rate": 1.883761769646678e-05, "loss": 0.9656, "step": 2833 }, { "epoch": 0.36092140662559496, "grad_norm": 6.002436143312232, "learning_rate": 1.883665239047971e-05, "loss": 0.8355, "step": 2834 }, { "epoch": 0.3610487606858016, "grad_norm": 5.445380627768312, "learning_rate": 1.8835686708591495e-05, "loss": 0.9088, "step": 2835 }, { "epoch": 0.36117611474600825, "grad_norm": 4.985524203517851, "learning_rate": 1.883472065084321e-05, "loss": 0.9615, "step": 2836 }, { "epoch": 0.3613034688062149, "grad_norm": 6.571285797131935, "learning_rate": 1.883375421727595e-05, "loss": 0.9333, "step": 2837 }, { "epoch": 0.36143082286642153, "grad_norm": 5.187613612390101, "learning_rate": 1.8832787407930825e-05, "loss": 0.9698, "step": 2838 }, { "epoch": 0.3615581769266281, "grad_norm": 5.2872010058554, "learning_rate": 1.8831820222848964e-05, "loss": 0.9871, "step": 2839 }, { "epoch": 0.36168553098683476, "grad_norm": 6.226508586654924, "learning_rate": 1.8830852662071507e-05, "loss": 0.8644, "step": 2840 }, { "epoch": 0.3618128850470414, "grad_norm": 4.706382459825624, "learning_rate": 1.8829884725639618e-05, "loss": 0.9077, "step": 2841 }, { "epoch": 0.36194023910724804, "grad_norm": 5.18586502967027, "learning_rate": 1.8828916413594468e-05, "loss": 1.0207, "step": 2842 }, { "epoch": 0.3620675931674547, "grad_norm": 4.583717445692769, "learning_rate": 1.8827947725977246e-05, "loss": 0.9448, "step": 2843 }, { "epoch": 0.3621949472276613, "grad_norm": 4.9352187669184175, "learning_rate": 1.8826978662829164e-05, "loss": 0.8308, "step": 2844 }, { "epoch": 0.3623223012878679, "grad_norm": 5.02420099842825, "learning_rate": 1.882600922419144e-05, "loss": 0.94, "step": 2845 }, { "epoch": 0.36244965534807455, "grad_norm": 5.964051639346117, "learning_rate": 1.882503941010532e-05, "loss": 0.9682, "step": 2846 }, { "epoch": 0.3625770094082812, "grad_norm": 4.8076039960526415, "learning_rate": 1.882406922061205e-05, "loss": 1.0093, "step": 2847 }, { "epoch": 0.36270436346848783, "grad_norm": 4.278485373121275, "learning_rate": 1.8823098655752905e-05, "loss": 0.9995, "step": 2848 }, { "epoch": 0.3628317175286945, "grad_norm": 5.549804257901839, "learning_rate": 1.8822127715569173e-05, "loss": 0.8992, "step": 2849 }, { "epoch": 0.3629590715889011, "grad_norm": 5.601135493008106, "learning_rate": 1.8821156400102154e-05, "loss": 0.9144, "step": 2850 }, { "epoch": 0.3630864256491077, "grad_norm": 5.521493529186345, "learning_rate": 1.882018470939317e-05, "loss": 1.0028, "step": 2851 }, { "epoch": 0.36321377970931434, "grad_norm": 7.3509696372760445, "learning_rate": 1.881921264348355e-05, "loss": 0.8766, "step": 2852 }, { "epoch": 0.363341133769521, "grad_norm": 4.903421351932734, "learning_rate": 1.8818240202414652e-05, "loss": 0.8999, "step": 2853 }, { "epoch": 0.3634684878297276, "grad_norm": 5.830781142182182, "learning_rate": 1.8817267386227837e-05, "loss": 0.9452, "step": 2854 }, { "epoch": 0.36359584188993427, "grad_norm": 3.809066351887756, "learning_rate": 1.881629419496449e-05, "loss": 0.9475, "step": 2855 }, { "epoch": 0.3637231959501409, "grad_norm": 5.997554280590782, "learning_rate": 1.8815320628666004e-05, "loss": 0.9631, "step": 2856 }, { "epoch": 0.3638505500103475, "grad_norm": 4.95970049476808, "learning_rate": 1.8814346687373802e-05, "loss": 0.881, "step": 2857 }, { "epoch": 0.36397790407055414, "grad_norm": 4.8326043642539425, "learning_rate": 1.881337237112931e-05, "loss": 0.9825, "step": 2858 }, { "epoch": 0.3641052581307608, "grad_norm": 4.277864328695065, "learning_rate": 1.8812397679973975e-05, "loss": 0.8529, "step": 2859 }, { "epoch": 0.3642326121909674, "grad_norm": 6.70503403304393, "learning_rate": 1.881142261394926e-05, "loss": 1.0027, "step": 2860 }, { "epoch": 0.36435996625117406, "grad_norm": 5.621615430046328, "learning_rate": 1.8810447173096638e-05, "loss": 1.0146, "step": 2861 }, { "epoch": 0.3644873203113807, "grad_norm": 4.616510590119612, "learning_rate": 1.8809471357457608e-05, "loss": 0.8621, "step": 2862 }, { "epoch": 0.3646146743715873, "grad_norm": 10.847084785879144, "learning_rate": 1.8808495167073682e-05, "loss": 0.9343, "step": 2863 }, { "epoch": 0.36474202843179393, "grad_norm": 5.326881599211821, "learning_rate": 1.880751860198638e-05, "loss": 0.8416, "step": 2864 }, { "epoch": 0.36486938249200057, "grad_norm": 4.3347511661835325, "learning_rate": 1.880654166223725e-05, "loss": 1.0326, "step": 2865 }, { "epoch": 0.3649967365522072, "grad_norm": 6.163102571999616, "learning_rate": 1.880556434786785e-05, "loss": 0.9138, "step": 2866 }, { "epoch": 0.36512409061241385, "grad_norm": 4.515382542420357, "learning_rate": 1.8804586658919745e-05, "loss": 0.9831, "step": 2867 }, { "epoch": 0.3652514446726205, "grad_norm": 4.771437005687402, "learning_rate": 1.8803608595434535e-05, "loss": 1.0142, "step": 2868 }, { "epoch": 0.3653787987328271, "grad_norm": 5.06314403063705, "learning_rate": 1.8802630157453817e-05, "loss": 1.0286, "step": 2869 }, { "epoch": 0.3655061527930337, "grad_norm": 5.476848600263655, "learning_rate": 1.880165134501922e-05, "loss": 1.001, "step": 2870 }, { "epoch": 0.36563350685324036, "grad_norm": 8.881051163189163, "learning_rate": 1.880067215817238e-05, "loss": 0.9062, "step": 2871 }, { "epoch": 0.365760860913447, "grad_norm": 5.901585588922599, "learning_rate": 1.8799692596954947e-05, "loss": 1.0077, "step": 2872 }, { "epoch": 0.36588821497365365, "grad_norm": 5.325226742210798, "learning_rate": 1.8798712661408594e-05, "loss": 0.8875, "step": 2873 }, { "epoch": 0.36601556903386023, "grad_norm": 6.34117818480796, "learning_rate": 1.8797732351575003e-05, "loss": 0.8601, "step": 2874 }, { "epoch": 0.3661429230940669, "grad_norm": 4.53344294658609, "learning_rate": 1.8796751667495883e-05, "loss": 0.9057, "step": 2875 }, { "epoch": 0.3662702771542735, "grad_norm": 4.768531188344464, "learning_rate": 1.8795770609212938e-05, "loss": 0.8625, "step": 2876 }, { "epoch": 0.36639763121448016, "grad_norm": 4.747389628255796, "learning_rate": 1.8794789176767914e-05, "loss": 0.9804, "step": 2877 }, { "epoch": 0.3665249852746868, "grad_norm": 5.493460669075612, "learning_rate": 1.879380737020255e-05, "loss": 0.9467, "step": 2878 }, { "epoch": 0.36665233933489344, "grad_norm": 5.375984456596157, "learning_rate": 1.879282518955862e-05, "loss": 0.919, "step": 2879 }, { "epoch": 0.3667796933951, "grad_norm": 5.036383243331077, "learning_rate": 1.87918426348779e-05, "loss": 0.944, "step": 2880 }, { "epoch": 0.36690704745530667, "grad_norm": 4.541817385537888, "learning_rate": 1.8790859706202184e-05, "loss": 0.9079, "step": 2881 }, { "epoch": 0.3670344015155133, "grad_norm": 4.8996757633602215, "learning_rate": 1.8789876403573292e-05, "loss": 0.9794, "step": 2882 }, { "epoch": 0.36716175557571995, "grad_norm": 5.426557733878018, "learning_rate": 1.8788892727033045e-05, "loss": 0.9072, "step": 2883 }, { "epoch": 0.3672891096359266, "grad_norm": 5.309730076942729, "learning_rate": 1.878790867662329e-05, "loss": 0.8818, "step": 2884 }, { "epoch": 0.36741646369613323, "grad_norm": 5.627211411548679, "learning_rate": 1.878692425238589e-05, "loss": 1.0839, "step": 2885 }, { "epoch": 0.3675438177563398, "grad_norm": 4.098214779618415, "learning_rate": 1.878593945436272e-05, "loss": 1.0607, "step": 2886 }, { "epoch": 0.36767117181654646, "grad_norm": 5.927298364833824, "learning_rate": 1.878495428259567e-05, "loss": 0.9909, "step": 2887 }, { "epoch": 0.3677985258767531, "grad_norm": 5.044985768928531, "learning_rate": 1.878396873712665e-05, "loss": 0.8956, "step": 2888 }, { "epoch": 0.36792587993695974, "grad_norm": 5.414321783113167, "learning_rate": 1.8782982817997583e-05, "loss": 0.9774, "step": 2889 }, { "epoch": 0.3680532339971664, "grad_norm": 5.497408342972157, "learning_rate": 1.8781996525250412e-05, "loss": 0.8826, "step": 2890 }, { "epoch": 0.368180588057373, "grad_norm": 6.85484355455839, "learning_rate": 1.8781009858927092e-05, "loss": 0.9922, "step": 2891 }, { "epoch": 0.3683079421175796, "grad_norm": 5.163619999242976, "learning_rate": 1.8780022819069588e-05, "loss": 1.066, "step": 2892 }, { "epoch": 0.36843529617778625, "grad_norm": 5.7523705944640495, "learning_rate": 1.8779035405719898e-05, "loss": 0.8428, "step": 2893 }, { "epoch": 0.3685626502379929, "grad_norm": 4.2790098246376695, "learning_rate": 1.8778047618920016e-05, "loss": 0.9314, "step": 2894 }, { "epoch": 0.36869000429819954, "grad_norm": 4.8378876681082, "learning_rate": 1.8777059458711968e-05, "loss": 0.9065, "step": 2895 }, { "epoch": 0.3688173583584062, "grad_norm": 6.07481878818835, "learning_rate": 1.8776070925137783e-05, "loss": 0.931, "step": 2896 }, { "epoch": 0.3689447124186128, "grad_norm": 5.234078014545973, "learning_rate": 1.877508201823952e-05, "loss": 0.9456, "step": 2897 }, { "epoch": 0.3690720664788194, "grad_norm": 5.189624397343033, "learning_rate": 1.877409273805924e-05, "loss": 0.9515, "step": 2898 }, { "epoch": 0.36919942053902605, "grad_norm": 6.355906897542976, "learning_rate": 1.8773103084639025e-05, "loss": 0.9949, "step": 2899 }, { "epoch": 0.3693267745992327, "grad_norm": 4.902473113644263, "learning_rate": 1.877211305802098e-05, "loss": 1.0517, "step": 2900 }, { "epoch": 0.36945412865943933, "grad_norm": 4.829157549125659, "learning_rate": 1.8771122658247214e-05, "loss": 0.9751, "step": 2901 }, { "epoch": 0.36958148271964597, "grad_norm": 8.24946268817277, "learning_rate": 1.877013188535986e-05, "loss": 1.0766, "step": 2902 }, { "epoch": 0.3697088367798526, "grad_norm": 4.363174587438457, "learning_rate": 1.8769140739401063e-05, "loss": 1.0098, "step": 2903 }, { "epoch": 0.3698361908400592, "grad_norm": 5.874919160327303, "learning_rate": 1.876814922041299e-05, "loss": 0.918, "step": 2904 }, { "epoch": 0.36996354490026584, "grad_norm": 4.5290126143819105, "learning_rate": 1.876715732843781e-05, "loss": 0.958, "step": 2905 }, { "epoch": 0.3700908989604725, "grad_norm": 4.968679367619038, "learning_rate": 1.8766165063517724e-05, "loss": 0.9514, "step": 2906 }, { "epoch": 0.3702182530206791, "grad_norm": 5.14503804392044, "learning_rate": 1.876517242569494e-05, "loss": 0.9125, "step": 2907 }, { "epoch": 0.37034560708088576, "grad_norm": 3.7898141017203133, "learning_rate": 1.876417941501168e-05, "loss": 0.9783, "step": 2908 }, { "epoch": 0.3704729611410924, "grad_norm": 6.721181598050644, "learning_rate": 1.8763186031510193e-05, "loss": 0.8898, "step": 2909 }, { "epoch": 0.370600315201299, "grad_norm": 6.287204937239724, "learning_rate": 1.8762192275232734e-05, "loss": 1.0121, "step": 2910 }, { "epoch": 0.37072766926150563, "grad_norm": 5.115626625615388, "learning_rate": 1.876119814622157e-05, "loss": 0.9795, "step": 2911 }, { "epoch": 0.3708550233217123, "grad_norm": 7.097123877168896, "learning_rate": 1.8760203644519e-05, "loss": 0.8829, "step": 2912 }, { "epoch": 0.3709823773819189, "grad_norm": 5.536951434861661, "learning_rate": 1.875920877016732e-05, "loss": 0.9093, "step": 2913 }, { "epoch": 0.37110973144212556, "grad_norm": 5.580248174856846, "learning_rate": 1.8758213523208855e-05, "loss": 0.9081, "step": 2914 }, { "epoch": 0.3712370855023322, "grad_norm": 5.1395988019822045, "learning_rate": 1.8757217903685943e-05, "loss": 0.9594, "step": 2915 }, { "epoch": 0.3713644395625388, "grad_norm": 4.782044587773427, "learning_rate": 1.8756221911640933e-05, "loss": 0.9754, "step": 2916 }, { "epoch": 0.3714917936227454, "grad_norm": 4.188082754095407, "learning_rate": 1.8755225547116197e-05, "loss": 1.0031, "step": 2917 }, { "epoch": 0.37161914768295207, "grad_norm": 4.259759356368611, "learning_rate": 1.8754228810154113e-05, "loss": 0.9268, "step": 2918 }, { "epoch": 0.3717465017431587, "grad_norm": 4.335068624232682, "learning_rate": 1.875323170079709e-05, "loss": 0.994, "step": 2919 }, { "epoch": 0.37187385580336535, "grad_norm": 4.326560718980989, "learning_rate": 1.8752234219087538e-05, "loss": 1.0216, "step": 2920 }, { "epoch": 0.37200120986357194, "grad_norm": 3.6456288016940523, "learning_rate": 1.875123636506789e-05, "loss": 0.873, "step": 2921 }, { "epoch": 0.3721285639237786, "grad_norm": 5.813288153259911, "learning_rate": 1.8750238138780595e-05, "loss": 0.8672, "step": 2922 }, { "epoch": 0.3722559179839852, "grad_norm": 4.531144344808089, "learning_rate": 1.8749239540268114e-05, "loss": 0.9351, "step": 2923 }, { "epoch": 0.37238327204419186, "grad_norm": 5.423830410795791, "learning_rate": 1.8748240569572927e-05, "loss": 0.984, "step": 2924 }, { "epoch": 0.3725106261043985, "grad_norm": 3.9568997782098756, "learning_rate": 1.8747241226737528e-05, "loss": 0.9587, "step": 2925 }, { "epoch": 0.37263798016460514, "grad_norm": 5.003290238234017, "learning_rate": 1.874624151180443e-05, "loss": 0.8993, "step": 2926 }, { "epoch": 0.37276533422481173, "grad_norm": 4.479493358438097, "learning_rate": 1.8745241424816165e-05, "loss": 0.9783, "step": 2927 }, { "epoch": 0.37289268828501837, "grad_norm": 5.317329455511681, "learning_rate": 1.8744240965815263e-05, "loss": 0.9897, "step": 2928 }, { "epoch": 0.373020042345225, "grad_norm": 6.075269327355154, "learning_rate": 1.874324013484429e-05, "loss": 0.9693, "step": 2929 }, { "epoch": 0.37314739640543165, "grad_norm": 4.84987995746355, "learning_rate": 1.874223893194582e-05, "loss": 0.9458, "step": 2930 }, { "epoch": 0.3732747504656383, "grad_norm": 7.881177144693244, "learning_rate": 1.8741237357162444e-05, "loss": 0.9884, "step": 2931 }, { "epoch": 0.37340210452584494, "grad_norm": 5.143477463020291, "learning_rate": 1.8740235410536764e-05, "loss": 0.9889, "step": 2932 }, { "epoch": 0.3735294585860515, "grad_norm": 5.309750372699807, "learning_rate": 1.8739233092111403e-05, "loss": 1.0212, "step": 2933 }, { "epoch": 0.37365681264625816, "grad_norm": 4.6413452411540135, "learning_rate": 1.8738230401929e-05, "loss": 0.9554, "step": 2934 }, { "epoch": 0.3737841667064648, "grad_norm": 6.55501049096194, "learning_rate": 1.873722734003221e-05, "loss": 0.949, "step": 2935 }, { "epoch": 0.37391152076667145, "grad_norm": 5.060015410147259, "learning_rate": 1.8736223906463698e-05, "loss": 1.0589, "step": 2936 }, { "epoch": 0.3740388748268781, "grad_norm": 4.799787302073041, "learning_rate": 1.873522010126615e-05, "loss": 0.9171, "step": 2937 }, { "epoch": 0.37416622888708473, "grad_norm": 5.91859977555694, "learning_rate": 1.8734215924482267e-05, "loss": 0.9276, "step": 2938 }, { "epoch": 0.3742935829472913, "grad_norm": 5.470136194294513, "learning_rate": 1.8733211376154768e-05, "loss": 1.0553, "step": 2939 }, { "epoch": 0.37442093700749796, "grad_norm": 5.044424495480191, "learning_rate": 1.873220645632638e-05, "loss": 0.9919, "step": 2940 }, { "epoch": 0.3745482910677046, "grad_norm": 6.213680482152051, "learning_rate": 1.8731201165039857e-05, "loss": 1.0122, "step": 2941 }, { "epoch": 0.37467564512791124, "grad_norm": 11.961822178051968, "learning_rate": 1.8730195502337957e-05, "loss": 0.8941, "step": 2942 }, { "epoch": 0.3748029991881179, "grad_norm": 5.794310956380277, "learning_rate": 1.8729189468263466e-05, "loss": 0.8491, "step": 2943 }, { "epoch": 0.3749303532483245, "grad_norm": 4.093354417164666, "learning_rate": 1.8728183062859177e-05, "loss": 0.8843, "step": 2944 }, { "epoch": 0.3750577073085311, "grad_norm": 4.318776304217335, "learning_rate": 1.87271762861679e-05, "loss": 0.9786, "step": 2945 }, { "epoch": 0.37518506136873775, "grad_norm": 8.44956006592161, "learning_rate": 1.8726169138232462e-05, "loss": 0.9483, "step": 2946 }, { "epoch": 0.3753124154289444, "grad_norm": 7.52215558912678, "learning_rate": 1.8725161619095712e-05, "loss": 0.9934, "step": 2947 }, { "epoch": 0.37543976948915103, "grad_norm": 5.148168901258842, "learning_rate": 1.8724153728800498e-05, "loss": 0.9824, "step": 2948 }, { "epoch": 0.3755671235493577, "grad_norm": 3.858196619191893, "learning_rate": 1.8723145467389704e-05, "loss": 0.9739, "step": 2949 }, { "epoch": 0.3756944776095643, "grad_norm": 4.703575543593098, "learning_rate": 1.8722136834906214e-05, "loss": 0.992, "step": 2950 }, { "epoch": 0.3758218316697709, "grad_norm": 4.931227937559566, "learning_rate": 1.8721127831392934e-05, "loss": 1.0178, "step": 2951 }, { "epoch": 0.37594918572997754, "grad_norm": 5.22698580591476, "learning_rate": 1.8720118456892794e-05, "loss": 0.9031, "step": 2952 }, { "epoch": 0.3760765397901842, "grad_norm": 4.457166905390661, "learning_rate": 1.8719108711448726e-05, "loss": 0.8567, "step": 2953 }, { "epoch": 0.3762038938503908, "grad_norm": 5.675850812009204, "learning_rate": 1.871809859510368e-05, "loss": 0.8417, "step": 2954 }, { "epoch": 0.37633124791059747, "grad_norm": 5.440112483881218, "learning_rate": 1.871708810790063e-05, "loss": 0.9951, "step": 2955 }, { "epoch": 0.3764586019708041, "grad_norm": 4.812434406576628, "learning_rate": 1.8716077249882562e-05, "loss": 1.1804, "step": 2956 }, { "epoch": 0.3765859560310107, "grad_norm": 4.588519589576335, "learning_rate": 1.8715066021092472e-05, "loss": 0.9776, "step": 2957 }, { "epoch": 0.37671331009121733, "grad_norm": 6.234042433661907, "learning_rate": 1.871405442157338e-05, "loss": 0.9574, "step": 2958 }, { "epoch": 0.376840664151424, "grad_norm": 5.478136127676056, "learning_rate": 1.8713042451368316e-05, "loss": 0.9432, "step": 2959 }, { "epoch": 0.3769680182116306, "grad_norm": 4.596592652903195, "learning_rate": 1.871203011052033e-05, "loss": 0.8471, "step": 2960 }, { "epoch": 0.37709537227183726, "grad_norm": 5.130085352732287, "learning_rate": 1.8711017399072483e-05, "loss": 0.9602, "step": 2961 }, { "epoch": 0.3772227263320439, "grad_norm": 6.330598474929621, "learning_rate": 1.871000431706786e-05, "loss": 0.9979, "step": 2962 }, { "epoch": 0.3773500803922505, "grad_norm": 4.985541993332837, "learning_rate": 1.8708990864549554e-05, "loss": 0.9053, "step": 2963 }, { "epoch": 0.37747743445245713, "grad_norm": 5.5643994806448065, "learning_rate": 1.8707977041560673e-05, "loss": 0.9183, "step": 2964 }, { "epoch": 0.37760478851266377, "grad_norm": 4.941998713864846, "learning_rate": 1.8706962848144344e-05, "loss": 0.9147, "step": 2965 }, { "epoch": 0.3777321425728704, "grad_norm": 5.367565247237299, "learning_rate": 1.8705948284343717e-05, "loss": 0.9704, "step": 2966 }, { "epoch": 0.37785949663307705, "grad_norm": 5.837937999034112, "learning_rate": 1.870493335020194e-05, "loss": 0.9981, "step": 2967 }, { "epoch": 0.37798685069328364, "grad_norm": 5.5650409509843515, "learning_rate": 1.8703918045762197e-05, "loss": 0.962, "step": 2968 }, { "epoch": 0.3781142047534903, "grad_norm": 4.083283534362642, "learning_rate": 1.870290237106767e-05, "loss": 0.8919, "step": 2969 }, { "epoch": 0.3782415588136969, "grad_norm": 7.052446619891286, "learning_rate": 1.8701886326161573e-05, "loss": 0.9557, "step": 2970 }, { "epoch": 0.37836891287390356, "grad_norm": 5.1470224754268195, "learning_rate": 1.8700869911087115e-05, "loss": 1.0099, "step": 2971 }, { "epoch": 0.3784962669341102, "grad_norm": 4.18996982900459, "learning_rate": 1.8699853125887543e-05, "loss": 0.9132, "step": 2972 }, { "epoch": 0.37862362099431685, "grad_norm": 4.2967169715258695, "learning_rate": 1.869883597060611e-05, "loss": 1.079, "step": 2973 }, { "epoch": 0.37875097505452343, "grad_norm": 3.859880669088366, "learning_rate": 1.869781844528608e-05, "loss": 0.8787, "step": 2974 }, { "epoch": 0.3788783291147301, "grad_norm": 6.838057149054142, "learning_rate": 1.869680054997074e-05, "loss": 0.976, "step": 2975 }, { "epoch": 0.3790056831749367, "grad_norm": 6.795412524503117, "learning_rate": 1.8695782284703385e-05, "loss": 0.9242, "step": 2976 }, { "epoch": 0.37913303723514336, "grad_norm": 4.302521148399142, "learning_rate": 1.8694763649527342e-05, "loss": 0.8323, "step": 2977 }, { "epoch": 0.37926039129535, "grad_norm": 4.946712157219363, "learning_rate": 1.869374464448593e-05, "loss": 0.8619, "step": 2978 }, { "epoch": 0.37938774535555664, "grad_norm": 5.101994307792595, "learning_rate": 1.8692725269622506e-05, "loss": 0.8406, "step": 2979 }, { "epoch": 0.3795150994157632, "grad_norm": 4.929714970580156, "learning_rate": 1.869170552498043e-05, "loss": 0.8599, "step": 2980 }, { "epoch": 0.37964245347596987, "grad_norm": 5.433815420251437, "learning_rate": 1.8690685410603076e-05, "loss": 0.9244, "step": 2981 }, { "epoch": 0.3797698075361765, "grad_norm": 4.222726487838026, "learning_rate": 1.8689664926533843e-05, "loss": 1.0021, "step": 2982 }, { "epoch": 0.37989716159638315, "grad_norm": 4.1108968839302715, "learning_rate": 1.8688644072816142e-05, "loss": 0.9778, "step": 2983 }, { "epoch": 0.3800245156565898, "grad_norm": 6.159965207385604, "learning_rate": 1.8687622849493396e-05, "loss": 0.9661, "step": 2984 }, { "epoch": 0.38015186971679643, "grad_norm": 5.043443771198623, "learning_rate": 1.8686601256609053e-05, "loss": 0.9545, "step": 2985 }, { "epoch": 0.380279223777003, "grad_norm": 5.296875540134098, "learning_rate": 1.8685579294206562e-05, "loss": 1.015, "step": 2986 }, { "epoch": 0.38040657783720966, "grad_norm": 5.112632902698365, "learning_rate": 1.86845569623294e-05, "loss": 0.9965, "step": 2987 }, { "epoch": 0.3805339318974163, "grad_norm": 5.487123849268124, "learning_rate": 1.8683534261021058e-05, "loss": 0.8834, "step": 2988 }, { "epoch": 0.38066128595762294, "grad_norm": 4.658285016313718, "learning_rate": 1.8682511190325033e-05, "loss": 0.8875, "step": 2989 }, { "epoch": 0.3807886400178296, "grad_norm": 5.052669257830788, "learning_rate": 1.8681487750284856e-05, "loss": 0.8752, "step": 2990 }, { "epoch": 0.3809159940780362, "grad_norm": 4.518436810415389, "learning_rate": 1.8680463940944053e-05, "loss": 0.8877, "step": 2991 }, { "epoch": 0.3810433481382428, "grad_norm": 6.382911410551706, "learning_rate": 1.8679439762346186e-05, "loss": 1.0217, "step": 2992 }, { "epoch": 0.38117070219844945, "grad_norm": 5.073542666600857, "learning_rate": 1.8678415214534812e-05, "loss": 0.9584, "step": 2993 }, { "epoch": 0.3812980562586561, "grad_norm": 6.137935447395686, "learning_rate": 1.867739029755352e-05, "loss": 0.9819, "step": 2994 }, { "epoch": 0.38142541031886273, "grad_norm": 4.377314799871918, "learning_rate": 1.8676365011445906e-05, "loss": 0.8445, "step": 2995 }, { "epoch": 0.3815527643790694, "grad_norm": 5.081324487869811, "learning_rate": 1.8675339356255587e-05, "loss": 0.9623, "step": 2996 }, { "epoch": 0.381680118439276, "grad_norm": 4.548822674025287, "learning_rate": 1.8674313332026193e-05, "loss": 0.9122, "step": 2997 }, { "epoch": 0.3818074724994826, "grad_norm": 3.8621039375767707, "learning_rate": 1.867328693880137e-05, "loss": 0.9427, "step": 2998 }, { "epoch": 0.38193482655968924, "grad_norm": 4.252743676866492, "learning_rate": 1.8672260176624775e-05, "loss": 0.8997, "step": 2999 }, { "epoch": 0.3820621806198959, "grad_norm": 6.139036327373263, "learning_rate": 1.867123304554009e-05, "loss": 0.9459, "step": 3000 } ], "logging_steps": 1.0, "max_steps": 15704, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.472829136709563e+22, "train_batch_size": 2, "trial_name": null, "trial_params": null }