{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2547081204132639, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012735406020663196, "grad_norm": 7.271933690036313, "learning_rate": 4.237288135593221e-08, "loss": 1.8539, "step": 1 }, { "epoch": 0.0002547081204132639, "grad_norm": 9.658086805732816, "learning_rate": 8.474576271186442e-08, "loss": 2.0669, "step": 2 }, { "epoch": 0.0003820621806198959, "grad_norm": 11.484017585849884, "learning_rate": 1.2711864406779662e-07, "loss": 2.2215, "step": 3 }, { "epoch": 0.0005094162408265278, "grad_norm": 5.540318356122022, "learning_rate": 1.6949152542372883e-07, "loss": 1.8601, "step": 4 }, { "epoch": 0.0006367703010331598, "grad_norm": 8.72124267822763, "learning_rate": 2.1186440677966102e-07, "loss": 2.036, "step": 5 }, { "epoch": 0.0007641243612397918, "grad_norm": 5.653927826902831, "learning_rate": 2.5423728813559323e-07, "loss": 1.6908, "step": 6 }, { "epoch": 0.0008914784214464238, "grad_norm": 6.524121350290057, "learning_rate": 2.966101694915255e-07, "loss": 1.8968, "step": 7 }, { "epoch": 0.0010188324816530557, "grad_norm": 6.459540108765332, "learning_rate": 3.3898305084745766e-07, "loss": 1.7855, "step": 8 }, { "epoch": 0.0011461865418596878, "grad_norm": 9.04352851601738, "learning_rate": 3.813559322033899e-07, "loss": 2.0275, "step": 9 }, { "epoch": 0.0012735406020663196, "grad_norm": 5.189147687544124, "learning_rate": 4.2372881355932204e-07, "loss": 1.7905, "step": 10 }, { "epoch": 0.0014008946622729515, "grad_norm": 7.8253156604412375, "learning_rate": 4.661016949152543e-07, "loss": 1.9746, "step": 11 }, { "epoch": 0.0015282487224795836, "grad_norm": 8.244971390044737, "learning_rate": 5.084745762711865e-07, "loss": 1.9675, "step": 12 }, { "epoch": 0.0016556027826862155, "grad_norm": 9.407911793872701, "learning_rate": 5.508474576271187e-07, "loss": 1.8633, "step": 13 }, { "epoch": 0.0017829568428928476, "grad_norm": 7.269078375368235, "learning_rate": 5.93220338983051e-07, "loss": 1.9822, "step": 14 }, { "epoch": 0.0019103109030994794, "grad_norm": 6.836825277174009, "learning_rate": 6.355932203389831e-07, "loss": 1.8575, "step": 15 }, { "epoch": 0.0020376649633061113, "grad_norm": 7.472620005302849, "learning_rate": 6.779661016949153e-07, "loss": 1.7522, "step": 16 }, { "epoch": 0.002165019023512743, "grad_norm": 5.337881334605893, "learning_rate": 7.203389830508476e-07, "loss": 1.623, "step": 17 }, { "epoch": 0.0022923730837193755, "grad_norm": 9.373863863448666, "learning_rate": 7.627118644067798e-07, "loss": 1.9805, "step": 18 }, { "epoch": 0.0024197271439260074, "grad_norm": 7.718845629389268, "learning_rate": 8.050847457627118e-07, "loss": 1.807, "step": 19 }, { "epoch": 0.0025470812041326393, "grad_norm": 7.130785818146366, "learning_rate": 8.474576271186441e-07, "loss": 1.824, "step": 20 }, { "epoch": 0.002674435264339271, "grad_norm": 6.681287500260171, "learning_rate": 8.898305084745763e-07, "loss": 1.8496, "step": 21 }, { "epoch": 0.002801789324545903, "grad_norm": 6.872143516134369, "learning_rate": 9.322033898305086e-07, "loss": 1.8806, "step": 22 }, { "epoch": 0.0029291433847525353, "grad_norm": 4.705998046939443, "learning_rate": 9.745762711864408e-07, "loss": 1.634, "step": 23 }, { "epoch": 0.003056497444959167, "grad_norm": 11.506522277892683, "learning_rate": 1.016949152542373e-06, "loss": 2.1845, "step": 24 }, { "epoch": 0.003183851505165799, "grad_norm": 11.312083874209405, "learning_rate": 1.059322033898305e-06, "loss": 1.8857, "step": 25 }, { "epoch": 0.003311205565372431, "grad_norm": 6.4627660818311226, "learning_rate": 1.1016949152542374e-06, "loss": 1.8913, "step": 26 }, { "epoch": 0.003438559625579063, "grad_norm": 7.32229167356663, "learning_rate": 1.1440677966101696e-06, "loss": 1.9071, "step": 27 }, { "epoch": 0.003565913685785695, "grad_norm": 5.958980854124657, "learning_rate": 1.186440677966102e-06, "loss": 1.7951, "step": 28 }, { "epoch": 0.003693267745992327, "grad_norm": 7.968036694484309, "learning_rate": 1.228813559322034e-06, "loss": 1.9683, "step": 29 }, { "epoch": 0.003820621806198959, "grad_norm": 6.087713606766322, "learning_rate": 1.2711864406779662e-06, "loss": 1.7463, "step": 30 }, { "epoch": 0.003947975866405591, "grad_norm": 7.15380657743577, "learning_rate": 1.3135593220338985e-06, "loss": 1.8375, "step": 31 }, { "epoch": 0.004075329926612223, "grad_norm": 6.99281487234341, "learning_rate": 1.3559322033898307e-06, "loss": 2.0248, "step": 32 }, { "epoch": 0.0042026839868188545, "grad_norm": 6.702985570920233, "learning_rate": 1.3983050847457628e-06, "loss": 1.9517, "step": 33 }, { "epoch": 0.004330038047025486, "grad_norm": 7.265232789313097, "learning_rate": 1.4406779661016951e-06, "loss": 1.9653, "step": 34 }, { "epoch": 0.004457392107232118, "grad_norm": 11.115946654222988, "learning_rate": 1.4830508474576273e-06, "loss": 2.2556, "step": 35 }, { "epoch": 0.004584746167438751, "grad_norm": 7.299990917879491, "learning_rate": 1.5254237288135596e-06, "loss": 1.8916, "step": 36 }, { "epoch": 0.004712100227645383, "grad_norm": 8.757682697242801, "learning_rate": 1.5677966101694915e-06, "loss": 1.9587, "step": 37 }, { "epoch": 0.004839454287852015, "grad_norm": 6.31538702786833, "learning_rate": 1.6101694915254237e-06, "loss": 1.6702, "step": 38 }, { "epoch": 0.004966808348058647, "grad_norm": 5.678276211998329, "learning_rate": 1.652542372881356e-06, "loss": 1.6698, "step": 39 }, { "epoch": 0.0050941624082652785, "grad_norm": 6.5502751882271255, "learning_rate": 1.6949152542372882e-06, "loss": 1.6604, "step": 40 }, { "epoch": 0.00522151646847191, "grad_norm": 7.378033030484872, "learning_rate": 1.7372881355932205e-06, "loss": 1.8324, "step": 41 }, { "epoch": 0.005348870528678542, "grad_norm": 15.420303713013, "learning_rate": 1.7796610169491526e-06, "loss": 2.0715, "step": 42 }, { "epoch": 0.005476224588885174, "grad_norm": 5.720408048146975, "learning_rate": 1.8220338983050848e-06, "loss": 1.723, "step": 43 }, { "epoch": 0.005603578649091806, "grad_norm": 6.303637032995317, "learning_rate": 1.8644067796610171e-06, "loss": 1.7073, "step": 44 }, { "epoch": 0.005730932709298438, "grad_norm": 6.278397704866944, "learning_rate": 1.9067796610169493e-06, "loss": 1.6899, "step": 45 }, { "epoch": 0.005858286769505071, "grad_norm": 9.400638363853234, "learning_rate": 1.9491525423728816e-06, "loss": 2.0536, "step": 46 }, { "epoch": 0.0059856408297117025, "grad_norm": 11.772339058952785, "learning_rate": 1.9915254237288137e-06, "loss": 1.9004, "step": 47 }, { "epoch": 0.006112994889918334, "grad_norm": 6.097740079901016, "learning_rate": 2.033898305084746e-06, "loss": 1.7205, "step": 48 }, { "epoch": 0.006240348950124966, "grad_norm": 7.00755338498105, "learning_rate": 2.076271186440678e-06, "loss": 1.8041, "step": 49 }, { "epoch": 0.006367703010331598, "grad_norm": 6.9732164515856, "learning_rate": 2.11864406779661e-06, "loss": 1.7358, "step": 50 }, { "epoch": 0.00649505707053823, "grad_norm": 6.583437987191887, "learning_rate": 2.1610169491525427e-06, "loss": 1.7373, "step": 51 }, { "epoch": 0.006622411130744862, "grad_norm": 8.700856431922421, "learning_rate": 2.203389830508475e-06, "loss": 1.8176, "step": 52 }, { "epoch": 0.006749765190951494, "grad_norm": 6.11722055087655, "learning_rate": 2.245762711864407e-06, "loss": 1.6658, "step": 53 }, { "epoch": 0.006877119251158126, "grad_norm": 6.397543745767114, "learning_rate": 2.288135593220339e-06, "loss": 1.761, "step": 54 }, { "epoch": 0.0070044733113647575, "grad_norm": 9.611483508510796, "learning_rate": 2.3305084745762712e-06, "loss": 1.9658, "step": 55 }, { "epoch": 0.00713182737157139, "grad_norm": 9.349587138643392, "learning_rate": 2.372881355932204e-06, "loss": 1.7335, "step": 56 }, { "epoch": 0.007259181431778022, "grad_norm": 6.522174278890342, "learning_rate": 2.415254237288136e-06, "loss": 1.6166, "step": 57 }, { "epoch": 0.007386535491984654, "grad_norm": 7.750941557755308, "learning_rate": 2.457627118644068e-06, "loss": 1.7544, "step": 58 }, { "epoch": 0.007513889552191286, "grad_norm": 6.912297289671559, "learning_rate": 2.5e-06, "loss": 1.7011, "step": 59 }, { "epoch": 0.007641243612397918, "grad_norm": 7.984840812457877, "learning_rate": 2.5423728813559323e-06, "loss": 1.6993, "step": 60 }, { "epoch": 0.00776859767260455, "grad_norm": 7.121099642585329, "learning_rate": 2.5847457627118645e-06, "loss": 1.6092, "step": 61 }, { "epoch": 0.007895951732811182, "grad_norm": 13.071363736303791, "learning_rate": 2.627118644067797e-06, "loss": 1.9803, "step": 62 }, { "epoch": 0.008023305793017814, "grad_norm": 7.054327363108295, "learning_rate": 2.669491525423729e-06, "loss": 1.5912, "step": 63 }, { "epoch": 0.008150659853224445, "grad_norm": 8.19110074835651, "learning_rate": 2.7118644067796613e-06, "loss": 1.6463, "step": 64 }, { "epoch": 0.008278013913431078, "grad_norm": 7.584386759380691, "learning_rate": 2.7542372881355934e-06, "loss": 1.6676, "step": 65 }, { "epoch": 0.008405367973637709, "grad_norm": 13.138200788018468, "learning_rate": 2.7966101694915256e-06, "loss": 1.8309, "step": 66 }, { "epoch": 0.008532722033844342, "grad_norm": 9.75600502737869, "learning_rate": 2.838983050847458e-06, "loss": 1.8151, "step": 67 }, { "epoch": 0.008660076094050973, "grad_norm": 9.03322022802435, "learning_rate": 2.8813559322033903e-06, "loss": 1.7743, "step": 68 }, { "epoch": 0.008787430154257606, "grad_norm": 7.31942395137626, "learning_rate": 2.9237288135593224e-06, "loss": 1.4732, "step": 69 }, { "epoch": 0.008914784214464237, "grad_norm": 7.29859110156687, "learning_rate": 2.9661016949152545e-06, "loss": 1.5354, "step": 70 }, { "epoch": 0.00904213827467087, "grad_norm": 7.57838373430734, "learning_rate": 3.0084745762711862e-06, "loss": 1.6728, "step": 71 }, { "epoch": 0.009169492334877502, "grad_norm": 8.627342016718561, "learning_rate": 3.0508474576271192e-06, "loss": 1.5038, "step": 72 }, { "epoch": 0.009296846395084133, "grad_norm": 11.103801289476134, "learning_rate": 3.0932203389830514e-06, "loss": 1.6997, "step": 73 }, { "epoch": 0.009424200455290766, "grad_norm": 7.695175185164474, "learning_rate": 3.135593220338983e-06, "loss": 1.6034, "step": 74 }, { "epoch": 0.009551554515497397, "grad_norm": 7.476937502840767, "learning_rate": 3.1779661016949152e-06, "loss": 1.7081, "step": 75 }, { "epoch": 0.00967890857570403, "grad_norm": 8.36338889468324, "learning_rate": 3.2203389830508473e-06, "loss": 1.7367, "step": 76 }, { "epoch": 0.00980626263591066, "grad_norm": 7.377402480026078, "learning_rate": 3.26271186440678e-06, "loss": 1.5538, "step": 77 }, { "epoch": 0.009933616696117293, "grad_norm": 7.682228824336897, "learning_rate": 3.305084745762712e-06, "loss": 1.6121, "step": 78 }, { "epoch": 0.010060970756323924, "grad_norm": 9.349692811866577, "learning_rate": 3.347457627118644e-06, "loss": 1.6372, "step": 79 }, { "epoch": 0.010188324816530557, "grad_norm": 7.284498284211861, "learning_rate": 3.3898305084745763e-06, "loss": 1.6672, "step": 80 }, { "epoch": 0.01031567887673719, "grad_norm": 7.870893588490932, "learning_rate": 3.4322033898305084e-06, "loss": 1.581, "step": 81 }, { "epoch": 0.01044303293694382, "grad_norm": 7.532864852561931, "learning_rate": 3.474576271186441e-06, "loss": 1.5485, "step": 82 }, { "epoch": 0.010570386997150454, "grad_norm": 7.315126387572679, "learning_rate": 3.516949152542373e-06, "loss": 1.462, "step": 83 }, { "epoch": 0.010697741057357085, "grad_norm": 9.982699879040739, "learning_rate": 3.5593220338983053e-06, "loss": 1.6831, "step": 84 }, { "epoch": 0.010825095117563717, "grad_norm": 6.745164905565429, "learning_rate": 3.6016949152542374e-06, "loss": 1.4625, "step": 85 }, { "epoch": 0.010952449177770348, "grad_norm": 7.396636518107798, "learning_rate": 3.6440677966101695e-06, "loss": 1.4898, "step": 86 }, { "epoch": 0.011079803237976981, "grad_norm": 6.530327106595857, "learning_rate": 3.686440677966102e-06, "loss": 1.4851, "step": 87 }, { "epoch": 0.011207157298183612, "grad_norm": 7.280031290511072, "learning_rate": 3.7288135593220342e-06, "loss": 1.5776, "step": 88 }, { "epoch": 0.011334511358390245, "grad_norm": 7.258807653030983, "learning_rate": 3.7711864406779664e-06, "loss": 1.493, "step": 89 }, { "epoch": 0.011461865418596876, "grad_norm": 6.780060975917414, "learning_rate": 3.8135593220338985e-06, "loss": 1.497, "step": 90 }, { "epoch": 0.011589219478803509, "grad_norm": 5.299820792569274, "learning_rate": 3.8559322033898315e-06, "loss": 1.3391, "step": 91 }, { "epoch": 0.011716573539010141, "grad_norm": 11.906512079833732, "learning_rate": 3.898305084745763e-06, "loss": 1.7044, "step": 92 }, { "epoch": 0.011843927599216772, "grad_norm": 13.750145928735572, "learning_rate": 3.940677966101695e-06, "loss": 1.3373, "step": 93 }, { "epoch": 0.011971281659423405, "grad_norm": 6.2094515960377015, "learning_rate": 3.9830508474576275e-06, "loss": 1.4218, "step": 94 }, { "epoch": 0.012098635719630036, "grad_norm": 8.702043175020558, "learning_rate": 4.025423728813559e-06, "loss": 1.5958, "step": 95 }, { "epoch": 0.012225989779836669, "grad_norm": 10.530891389712178, "learning_rate": 4.067796610169492e-06, "loss": 1.3869, "step": 96 }, { "epoch": 0.0123533438400433, "grad_norm": 8.382484394062372, "learning_rate": 4.110169491525424e-06, "loss": 1.4319, "step": 97 }, { "epoch": 0.012480697900249933, "grad_norm": 8.65764730585386, "learning_rate": 4.152542372881356e-06, "loss": 1.4986, "step": 98 }, { "epoch": 0.012608051960456564, "grad_norm": 6.733364941110484, "learning_rate": 4.1949152542372886e-06, "loss": 1.3344, "step": 99 }, { "epoch": 0.012735406020663196, "grad_norm": 7.635913980134383, "learning_rate": 4.23728813559322e-06, "loss": 1.4161, "step": 100 }, { "epoch": 0.012862760080869829, "grad_norm": 10.236577619610213, "learning_rate": 4.279661016949153e-06, "loss": 1.5962, "step": 101 }, { "epoch": 0.01299011414107646, "grad_norm": 5.526567621635844, "learning_rate": 4.322033898305085e-06, "loss": 1.2936, "step": 102 }, { "epoch": 0.013117468201283093, "grad_norm": 7.496201634189791, "learning_rate": 4.364406779661017e-06, "loss": 1.4379, "step": 103 }, { "epoch": 0.013244822261489724, "grad_norm": 6.365117809706692, "learning_rate": 4.40677966101695e-06, "loss": 1.3969, "step": 104 }, { "epoch": 0.013372176321696357, "grad_norm": 4.970383669437514, "learning_rate": 4.449152542372881e-06, "loss": 1.4318, "step": 105 }, { "epoch": 0.013499530381902988, "grad_norm": 6.540428319781601, "learning_rate": 4.491525423728814e-06, "loss": 1.4254, "step": 106 }, { "epoch": 0.01362688444210962, "grad_norm": 5.759607332938477, "learning_rate": 4.5338983050847465e-06, "loss": 1.4332, "step": 107 }, { "epoch": 0.013754238502316251, "grad_norm": 8.31020834173882, "learning_rate": 4.576271186440678e-06, "loss": 1.4719, "step": 108 }, { "epoch": 0.013881592562522884, "grad_norm": 7.340003043857536, "learning_rate": 4.618644067796611e-06, "loss": 1.3971, "step": 109 }, { "epoch": 0.014008946622729515, "grad_norm": 6.511784361547076, "learning_rate": 4.6610169491525425e-06, "loss": 1.4415, "step": 110 }, { "epoch": 0.014136300682936148, "grad_norm": 6.731671633748867, "learning_rate": 4.703389830508475e-06, "loss": 1.4395, "step": 111 }, { "epoch": 0.01426365474314278, "grad_norm": 6.905671660774626, "learning_rate": 4.745762711864408e-06, "loss": 1.365, "step": 112 }, { "epoch": 0.014391008803349412, "grad_norm": 6.430425515815662, "learning_rate": 4.788135593220339e-06, "loss": 1.3832, "step": 113 }, { "epoch": 0.014518362863556044, "grad_norm": 7.435499419092716, "learning_rate": 4.830508474576272e-06, "loss": 1.4609, "step": 114 }, { "epoch": 0.014645716923762675, "grad_norm": 5.4419059003708625, "learning_rate": 4.872881355932204e-06, "loss": 1.3687, "step": 115 }, { "epoch": 0.014773070983969308, "grad_norm": 5.480813981464255, "learning_rate": 4.915254237288136e-06, "loss": 1.3259, "step": 116 }, { "epoch": 0.014900425044175939, "grad_norm": 19.487504134260305, "learning_rate": 4.957627118644069e-06, "loss": 1.2768, "step": 117 }, { "epoch": 0.015027779104382572, "grad_norm": 6.2148980714352895, "learning_rate": 5e-06, "loss": 1.3555, "step": 118 }, { "epoch": 0.015155133164589203, "grad_norm": 5.838187604837424, "learning_rate": 5.042372881355932e-06, "loss": 1.3501, "step": 119 }, { "epoch": 0.015282487224795836, "grad_norm": 5.3091089533384, "learning_rate": 5.084745762711865e-06, "loss": 1.3454, "step": 120 }, { "epoch": 0.015409841285002468, "grad_norm": 7.12967595373887, "learning_rate": 5.127118644067796e-06, "loss": 1.2718, "step": 121 }, { "epoch": 0.0155371953452091, "grad_norm": 5.081241156275937, "learning_rate": 5.169491525423729e-06, "loss": 1.306, "step": 122 }, { "epoch": 0.015664549405415732, "grad_norm": 5.1678133686722605, "learning_rate": 5.211864406779662e-06, "loss": 1.2662, "step": 123 }, { "epoch": 0.015791903465622363, "grad_norm": 6.609829212931382, "learning_rate": 5.254237288135594e-06, "loss": 1.4107, "step": 124 }, { "epoch": 0.015919257525828994, "grad_norm": 5.390686388633695, "learning_rate": 5.296610169491526e-06, "loss": 1.3657, "step": 125 }, { "epoch": 0.01604661158603563, "grad_norm": 4.716304726524517, "learning_rate": 5.338983050847458e-06, "loss": 1.285, "step": 126 }, { "epoch": 0.01617396564624226, "grad_norm": 7.442946547443793, "learning_rate": 5.38135593220339e-06, "loss": 1.5355, "step": 127 }, { "epoch": 0.01630131970644889, "grad_norm": 6.08763747167752, "learning_rate": 5.423728813559323e-06, "loss": 1.225, "step": 128 }, { "epoch": 0.01642867376665552, "grad_norm": 7.440177884180067, "learning_rate": 5.466101694915254e-06, "loss": 1.4228, "step": 129 }, { "epoch": 0.016556027826862156, "grad_norm": 7.716718027865346, "learning_rate": 5.508474576271187e-06, "loss": 1.4903, "step": 130 }, { "epoch": 0.016683381887068787, "grad_norm": 4.940675318037667, "learning_rate": 5.550847457627119e-06, "loss": 1.3298, "step": 131 }, { "epoch": 0.016810735947275418, "grad_norm": 6.172436497895839, "learning_rate": 5.593220338983051e-06, "loss": 1.334, "step": 132 }, { "epoch": 0.016938090007482053, "grad_norm": 4.577141665939358, "learning_rate": 5.635593220338984e-06, "loss": 1.3172, "step": 133 }, { "epoch": 0.017065444067688684, "grad_norm": 4.928191476396523, "learning_rate": 5.677966101694916e-06, "loss": 1.2845, "step": 134 }, { "epoch": 0.017192798127895315, "grad_norm": 5.175516465809592, "learning_rate": 5.720338983050848e-06, "loss": 1.3261, "step": 135 }, { "epoch": 0.017320152188101946, "grad_norm": 5.868259215222263, "learning_rate": 5.7627118644067805e-06, "loss": 1.3385, "step": 136 }, { "epoch": 0.01744750624830858, "grad_norm": 6.6858498507159405, "learning_rate": 5.805084745762712e-06, "loss": 1.3098, "step": 137 }, { "epoch": 0.01757486030851521, "grad_norm": 5.492215282646657, "learning_rate": 5.847457627118645e-06, "loss": 1.2635, "step": 138 }, { "epoch": 0.017702214368721842, "grad_norm": 10.420122310923675, "learning_rate": 5.8898305084745765e-06, "loss": 1.3717, "step": 139 }, { "epoch": 0.017829568428928473, "grad_norm": 5.144524576447836, "learning_rate": 5.932203389830509e-06, "loss": 1.2734, "step": 140 }, { "epoch": 0.017956922489135108, "grad_norm": 6.264504044638567, "learning_rate": 5.974576271186441e-06, "loss": 1.318, "step": 141 }, { "epoch": 0.01808427654934174, "grad_norm": 5.740016397060673, "learning_rate": 6.0169491525423725e-06, "loss": 1.3783, "step": 142 }, { "epoch": 0.01821163060954837, "grad_norm": 6.251052157530912, "learning_rate": 6.059322033898306e-06, "loss": 1.3595, "step": 143 }, { "epoch": 0.018338984669755004, "grad_norm": 5.523655392073338, "learning_rate": 6.1016949152542385e-06, "loss": 1.1841, "step": 144 }, { "epoch": 0.018466338729961635, "grad_norm": 5.937324601644495, "learning_rate": 6.14406779661017e-06, "loss": 1.328, "step": 145 }, { "epoch": 0.018593692790168266, "grad_norm": 5.449422459229983, "learning_rate": 6.186440677966103e-06, "loss": 1.3431, "step": 146 }, { "epoch": 0.018721046850374897, "grad_norm": 7.690879017131419, "learning_rate": 6.2288135593220344e-06, "loss": 1.3203, "step": 147 }, { "epoch": 0.01884840091058153, "grad_norm": 5.27418216675118, "learning_rate": 6.271186440677966e-06, "loss": 1.2765, "step": 148 }, { "epoch": 0.018975754970788163, "grad_norm": 6.17835986202785, "learning_rate": 6.313559322033899e-06, "loss": 1.2689, "step": 149 }, { "epoch": 0.019103109030994794, "grad_norm": 5.636045335076978, "learning_rate": 6.3559322033898304e-06, "loss": 1.3782, "step": 150 }, { "epoch": 0.019230463091201428, "grad_norm": 5.108084699461727, "learning_rate": 6.398305084745763e-06, "loss": 1.3202, "step": 151 }, { "epoch": 0.01935781715140806, "grad_norm": 6.525588214420939, "learning_rate": 6.440677966101695e-06, "loss": 1.3457, "step": 152 }, { "epoch": 0.01948517121161469, "grad_norm": 5.606250966219782, "learning_rate": 6.483050847457628e-06, "loss": 1.3161, "step": 153 }, { "epoch": 0.01961252527182132, "grad_norm": 5.588206693263131, "learning_rate": 6.52542372881356e-06, "loss": 1.3518, "step": 154 }, { "epoch": 0.019739879332027956, "grad_norm": 5.148154081610033, "learning_rate": 6.567796610169492e-06, "loss": 1.2552, "step": 155 }, { "epoch": 0.019867233392234587, "grad_norm": 7.284755141437708, "learning_rate": 6.610169491525424e-06, "loss": 1.2592, "step": 156 }, { "epoch": 0.019994587452441218, "grad_norm": 4.314617134869305, "learning_rate": 6.652542372881357e-06, "loss": 1.1922, "step": 157 }, { "epoch": 0.02012194151264785, "grad_norm": 5.301650153890583, "learning_rate": 6.694915254237288e-06, "loss": 1.2398, "step": 158 }, { "epoch": 0.020249295572854483, "grad_norm": 4.878119228831268, "learning_rate": 6.737288135593221e-06, "loss": 1.3827, "step": 159 }, { "epoch": 0.020376649633061114, "grad_norm": 6.819622271416848, "learning_rate": 6.779661016949153e-06, "loss": 1.3324, "step": 160 }, { "epoch": 0.020504003693267745, "grad_norm": 4.742557919660855, "learning_rate": 6.822033898305085e-06, "loss": 1.3472, "step": 161 }, { "epoch": 0.02063135775347438, "grad_norm": 5.689883895788964, "learning_rate": 6.864406779661017e-06, "loss": 1.1704, "step": 162 }, { "epoch": 0.02075871181368101, "grad_norm": 6.417111302943501, "learning_rate": 6.90677966101695e-06, "loss": 1.3367, "step": 163 }, { "epoch": 0.02088606587388764, "grad_norm": 6.942220645458798, "learning_rate": 6.949152542372882e-06, "loss": 1.2618, "step": 164 }, { "epoch": 0.021013419934094273, "grad_norm": 5.252453185448875, "learning_rate": 6.9915254237288146e-06, "loss": 1.2426, "step": 165 }, { "epoch": 0.021140773994300907, "grad_norm": 5.9624264758553185, "learning_rate": 7.033898305084746e-06, "loss": 1.514, "step": 166 }, { "epoch": 0.021268128054507538, "grad_norm": 7.006920527037309, "learning_rate": 7.076271186440679e-06, "loss": 1.2284, "step": 167 }, { "epoch": 0.02139548211471417, "grad_norm": 6.09704000229216, "learning_rate": 7.1186440677966106e-06, "loss": 1.3126, "step": 168 }, { "epoch": 0.0215228361749208, "grad_norm": 8.247528168259292, "learning_rate": 7.161016949152543e-06, "loss": 1.4338, "step": 169 }, { "epoch": 0.021650190235127435, "grad_norm": 6.171994905876386, "learning_rate": 7.203389830508475e-06, "loss": 1.2461, "step": 170 }, { "epoch": 0.021777544295334066, "grad_norm": 5.131775076203093, "learning_rate": 7.2457627118644065e-06, "loss": 1.2865, "step": 171 }, { "epoch": 0.021904898355540697, "grad_norm": 6.94815098159131, "learning_rate": 7.288135593220339e-06, "loss": 1.1561, "step": 172 }, { "epoch": 0.02203225241574733, "grad_norm": 6.040933218097003, "learning_rate": 7.3305084745762725e-06, "loss": 1.2328, "step": 173 }, { "epoch": 0.022159606475953962, "grad_norm": 5.685342232662009, "learning_rate": 7.372881355932204e-06, "loss": 1.2557, "step": 174 }, { "epoch": 0.022286960536160593, "grad_norm": 6.001107431572396, "learning_rate": 7.415254237288137e-06, "loss": 1.2557, "step": 175 }, { "epoch": 0.022414314596367224, "grad_norm": 6.392164571865962, "learning_rate": 7.4576271186440685e-06, "loss": 1.2892, "step": 176 }, { "epoch": 0.02254166865657386, "grad_norm": 6.654915116110552, "learning_rate": 7.500000000000001e-06, "loss": 1.2959, "step": 177 }, { "epoch": 0.02266902271678049, "grad_norm": 7.458014397932309, "learning_rate": 7.542372881355933e-06, "loss": 1.3309, "step": 178 }, { "epoch": 0.02279637677698712, "grad_norm": 7.500898180267626, "learning_rate": 7.5847457627118645e-06, "loss": 1.2727, "step": 179 }, { "epoch": 0.02292373083719375, "grad_norm": 5.733225361598391, "learning_rate": 7.627118644067797e-06, "loss": 1.3032, "step": 180 }, { "epoch": 0.023051084897400386, "grad_norm": 8.154695676999397, "learning_rate": 7.66949152542373e-06, "loss": 1.2128, "step": 181 }, { "epoch": 0.023178438957607017, "grad_norm": 6.985545902969932, "learning_rate": 7.711864406779663e-06, "loss": 1.3754, "step": 182 }, { "epoch": 0.023305793017813648, "grad_norm": 5.1080014310280655, "learning_rate": 7.754237288135595e-06, "loss": 1.2624, "step": 183 }, { "epoch": 0.023433147078020283, "grad_norm": 5.456029954386173, "learning_rate": 7.796610169491526e-06, "loss": 1.3119, "step": 184 }, { "epoch": 0.023560501138226914, "grad_norm": 5.870259096838373, "learning_rate": 7.838983050847458e-06, "loss": 1.3431, "step": 185 }, { "epoch": 0.023687855198433545, "grad_norm": 4.844651464438866, "learning_rate": 7.88135593220339e-06, "loss": 1.1957, "step": 186 }, { "epoch": 0.023815209258640176, "grad_norm": 4.881515257364843, "learning_rate": 7.923728813559323e-06, "loss": 1.2141, "step": 187 }, { "epoch": 0.02394256331884681, "grad_norm": 5.3909046929818265, "learning_rate": 7.966101694915255e-06, "loss": 1.2117, "step": 188 }, { "epoch": 0.02406991737905344, "grad_norm": 5.084112680862707, "learning_rate": 8.008474576271187e-06, "loss": 1.2686, "step": 189 }, { "epoch": 0.024197271439260072, "grad_norm": 6.606274321627709, "learning_rate": 8.050847457627118e-06, "loss": 1.269, "step": 190 }, { "epoch": 0.024324625499466707, "grad_norm": 4.587065315960808, "learning_rate": 8.093220338983052e-06, "loss": 1.1426, "step": 191 }, { "epoch": 0.024451979559673338, "grad_norm": 5.427227269739341, "learning_rate": 8.135593220338983e-06, "loss": 1.2607, "step": 192 }, { "epoch": 0.02457933361987997, "grad_norm": 7.6170325004261885, "learning_rate": 8.177966101694917e-06, "loss": 1.3308, "step": 193 }, { "epoch": 0.0247066876800866, "grad_norm": 6.3427616345396345, "learning_rate": 8.220338983050849e-06, "loss": 1.213, "step": 194 }, { "epoch": 0.024834041740293234, "grad_norm": 7.609799602341245, "learning_rate": 8.26271186440678e-06, "loss": 1.2924, "step": 195 }, { "epoch": 0.024961395800499865, "grad_norm": 4.2632262922802315, "learning_rate": 8.305084745762712e-06, "loss": 1.145, "step": 196 }, { "epoch": 0.025088749860706496, "grad_norm": 5.907637175889623, "learning_rate": 8.347457627118645e-06, "loss": 1.1883, "step": 197 }, { "epoch": 0.025216103920913127, "grad_norm": 5.60309063911008, "learning_rate": 8.389830508474577e-06, "loss": 1.2593, "step": 198 }, { "epoch": 0.02534345798111976, "grad_norm": 5.196493969572722, "learning_rate": 8.432203389830509e-06, "loss": 1.0949, "step": 199 }, { "epoch": 0.025470812041326393, "grad_norm": 4.282351386970627, "learning_rate": 8.47457627118644e-06, "loss": 1.2098, "step": 200 }, { "epoch": 0.025598166101533024, "grad_norm": 7.771190590780939, "learning_rate": 8.516949152542372e-06, "loss": 1.3432, "step": 201 }, { "epoch": 0.025725520161739658, "grad_norm": 6.684773397321345, "learning_rate": 8.559322033898306e-06, "loss": 1.3336, "step": 202 }, { "epoch": 0.02585287422194629, "grad_norm": 7.465165686739227, "learning_rate": 8.601694915254239e-06, "loss": 1.1817, "step": 203 }, { "epoch": 0.02598022828215292, "grad_norm": 7.086114075804347, "learning_rate": 8.64406779661017e-06, "loss": 1.239, "step": 204 }, { "epoch": 0.02610758234235955, "grad_norm": 5.267852583691561, "learning_rate": 8.686440677966103e-06, "loss": 1.2166, "step": 205 }, { "epoch": 0.026234936402566186, "grad_norm": 6.204765199065724, "learning_rate": 8.728813559322034e-06, "loss": 1.1705, "step": 206 }, { "epoch": 0.026362290462772817, "grad_norm": 5.8650519374099, "learning_rate": 8.771186440677966e-06, "loss": 1.1771, "step": 207 }, { "epoch": 0.026489644522979448, "grad_norm": 4.473193540326503, "learning_rate": 8.8135593220339e-06, "loss": 1.2766, "step": 208 }, { "epoch": 0.02661699858318608, "grad_norm": 6.466444701222969, "learning_rate": 8.855932203389831e-06, "loss": 1.1482, "step": 209 }, { "epoch": 0.026744352643392713, "grad_norm": 5.215909996040192, "learning_rate": 8.898305084745763e-06, "loss": 1.3104, "step": 210 }, { "epoch": 0.026871706703599344, "grad_norm": 4.8024398880969015, "learning_rate": 8.940677966101694e-06, "loss": 1.0803, "step": 211 }, { "epoch": 0.026999060763805975, "grad_norm": 6.601632130272579, "learning_rate": 8.983050847457628e-06, "loss": 1.3942, "step": 212 }, { "epoch": 0.02712641482401261, "grad_norm": 5.329161065013925, "learning_rate": 9.02542372881356e-06, "loss": 1.2741, "step": 213 }, { "epoch": 0.02725376888421924, "grad_norm": 5.287158798138966, "learning_rate": 9.067796610169493e-06, "loss": 1.2516, "step": 214 }, { "epoch": 0.02738112294442587, "grad_norm": 4.441068262343084, "learning_rate": 9.110169491525425e-06, "loss": 1.1225, "step": 215 }, { "epoch": 0.027508477004632503, "grad_norm": 5.781878133106383, "learning_rate": 9.152542372881356e-06, "loss": 1.3163, "step": 216 }, { "epoch": 0.027635831064839137, "grad_norm": 6.6835361034553715, "learning_rate": 9.194915254237288e-06, "loss": 1.2013, "step": 217 }, { "epoch": 0.027763185125045768, "grad_norm": 6.766142298984512, "learning_rate": 9.237288135593222e-06, "loss": 1.2307, "step": 218 }, { "epoch": 0.0278905391852524, "grad_norm": 5.126568996026534, "learning_rate": 9.279661016949153e-06, "loss": 1.204, "step": 219 }, { "epoch": 0.02801789324545903, "grad_norm": 5.481113953245, "learning_rate": 9.322033898305085e-06, "loss": 1.2129, "step": 220 }, { "epoch": 0.028145247305665665, "grad_norm": 9.019736795243459, "learning_rate": 9.364406779661017e-06, "loss": 1.3009, "step": 221 }, { "epoch": 0.028272601365872296, "grad_norm": 6.11214577364218, "learning_rate": 9.40677966101695e-06, "loss": 1.1636, "step": 222 }, { "epoch": 0.028399955426078927, "grad_norm": 6.208039688573386, "learning_rate": 9.449152542372882e-06, "loss": 1.2205, "step": 223 }, { "epoch": 0.02852730948628556, "grad_norm": 6.883913954414822, "learning_rate": 9.491525423728815e-06, "loss": 1.2852, "step": 224 }, { "epoch": 0.028654663546492192, "grad_norm": 7.770250504823821, "learning_rate": 9.533898305084747e-06, "loss": 1.2271, "step": 225 }, { "epoch": 0.028782017606698823, "grad_norm": 6.430792119192894, "learning_rate": 9.576271186440679e-06, "loss": 1.2906, "step": 226 }, { "epoch": 0.028909371666905454, "grad_norm": 7.010031460461512, "learning_rate": 9.61864406779661e-06, "loss": 1.2835, "step": 227 }, { "epoch": 0.02903672572711209, "grad_norm": 7.645972753876864, "learning_rate": 9.661016949152544e-06, "loss": 1.2884, "step": 228 }, { "epoch": 0.02916407978731872, "grad_norm": 6.254690622657227, "learning_rate": 9.703389830508475e-06, "loss": 1.2195, "step": 229 }, { "epoch": 0.02929143384752535, "grad_norm": 5.283445725714637, "learning_rate": 9.745762711864407e-06, "loss": 1.2283, "step": 230 }, { "epoch": 0.02941878790773198, "grad_norm": 5.300161776682745, "learning_rate": 9.788135593220339e-06, "loss": 1.2168, "step": 231 }, { "epoch": 0.029546141967938616, "grad_norm": 4.599103442185643, "learning_rate": 9.830508474576272e-06, "loss": 1.1986, "step": 232 }, { "epoch": 0.029673496028145247, "grad_norm": 4.231142055650315, "learning_rate": 9.872881355932204e-06, "loss": 1.1642, "step": 233 }, { "epoch": 0.029800850088351878, "grad_norm": 5.711332252730025, "learning_rate": 9.915254237288137e-06, "loss": 1.1048, "step": 234 }, { "epoch": 0.029928204148558513, "grad_norm": 4.869391712213411, "learning_rate": 9.957627118644069e-06, "loss": 1.0453, "step": 235 }, { "epoch": 0.030055558208765144, "grad_norm": 5.5126782723662675, "learning_rate": 1e-05, "loss": 1.2603, "step": 236 }, { "epoch": 0.030182912268971775, "grad_norm": 5.210563223006909, "learning_rate": 1.0042372881355933e-05, "loss": 1.2728, "step": 237 }, { "epoch": 0.030310266329178406, "grad_norm": 4.462683768455516, "learning_rate": 1.0084745762711864e-05, "loss": 1.173, "step": 238 }, { "epoch": 0.03043762038938504, "grad_norm": 6.872645859376026, "learning_rate": 1.0127118644067798e-05, "loss": 1.2844, "step": 239 }, { "epoch": 0.03056497444959167, "grad_norm": 6.347134293444063, "learning_rate": 1.016949152542373e-05, "loss": 1.1263, "step": 240 }, { "epoch": 0.030692328509798302, "grad_norm": 4.948369483974166, "learning_rate": 1.0211864406779661e-05, "loss": 1.1743, "step": 241 }, { "epoch": 0.030819682570004937, "grad_norm": 5.091288527678461, "learning_rate": 1.0254237288135593e-05, "loss": 1.1484, "step": 242 }, { "epoch": 0.030947036630211568, "grad_norm": 6.830186475942115, "learning_rate": 1.0296610169491526e-05, "loss": 1.2874, "step": 243 }, { "epoch": 0.0310743906904182, "grad_norm": 4.416479046753212, "learning_rate": 1.0338983050847458e-05, "loss": 1.1908, "step": 244 }, { "epoch": 0.03120174475062483, "grad_norm": 6.347450116845334, "learning_rate": 1.038135593220339e-05, "loss": 1.3404, "step": 245 }, { "epoch": 0.031329098810831464, "grad_norm": 5.174297399324874, "learning_rate": 1.0423728813559325e-05, "loss": 1.3433, "step": 246 }, { "epoch": 0.03145645287103809, "grad_norm": 6.830511797695472, "learning_rate": 1.0466101694915256e-05, "loss": 1.2365, "step": 247 }, { "epoch": 0.031583806931244726, "grad_norm": 4.788501700444122, "learning_rate": 1.0508474576271188e-05, "loss": 1.186, "step": 248 }, { "epoch": 0.03171116099145136, "grad_norm": 4.786448725323033, "learning_rate": 1.055084745762712e-05, "loss": 1.2575, "step": 249 }, { "epoch": 0.03183851505165799, "grad_norm": 5.365572446233603, "learning_rate": 1.0593220338983052e-05, "loss": 1.1592, "step": 250 }, { "epoch": 0.03196586911186462, "grad_norm": 5.338549308461248, "learning_rate": 1.0635593220338985e-05, "loss": 1.1911, "step": 251 }, { "epoch": 0.03209322317207126, "grad_norm": 6.401895373542065, "learning_rate": 1.0677966101694917e-05, "loss": 1.2061, "step": 252 }, { "epoch": 0.032220577232277885, "grad_norm": 7.118990857049467, "learning_rate": 1.0720338983050848e-05, "loss": 1.1411, "step": 253 }, { "epoch": 0.03234793129248452, "grad_norm": 5.334239505238133, "learning_rate": 1.076271186440678e-05, "loss": 1.1481, "step": 254 }, { "epoch": 0.032475285352691154, "grad_norm": 6.033395336247018, "learning_rate": 1.0805084745762714e-05, "loss": 1.304, "step": 255 }, { "epoch": 0.03260263941289778, "grad_norm": 5.935809085397795, "learning_rate": 1.0847457627118645e-05, "loss": 1.2317, "step": 256 }, { "epoch": 0.032729993473104416, "grad_norm": 6.56257120729597, "learning_rate": 1.0889830508474577e-05, "loss": 1.0834, "step": 257 }, { "epoch": 0.03285734753331104, "grad_norm": 5.52426654901273, "learning_rate": 1.0932203389830509e-05, "loss": 1.2457, "step": 258 }, { "epoch": 0.03298470159351768, "grad_norm": 5.3082240230657876, "learning_rate": 1.0974576271186442e-05, "loss": 1.109, "step": 259 }, { "epoch": 0.03311205565372431, "grad_norm": 7.27848219758666, "learning_rate": 1.1016949152542374e-05, "loss": 1.1494, "step": 260 }, { "epoch": 0.03323940971393094, "grad_norm": 6.186081232346015, "learning_rate": 1.1059322033898305e-05, "loss": 1.1951, "step": 261 }, { "epoch": 0.033366763774137574, "grad_norm": 6.09704876157918, "learning_rate": 1.1101694915254237e-05, "loss": 1.2232, "step": 262 }, { "epoch": 0.03349411783434421, "grad_norm": 5.6139682675869, "learning_rate": 1.1144067796610169e-05, "loss": 1.1192, "step": 263 }, { "epoch": 0.033621471894550836, "grad_norm": 5.013522744422119, "learning_rate": 1.1186440677966102e-05, "loss": 1.1021, "step": 264 }, { "epoch": 0.03374882595475747, "grad_norm": 5.189883867849798, "learning_rate": 1.1228813559322034e-05, "loss": 1.0947, "step": 265 }, { "epoch": 0.033876180014964105, "grad_norm": 9.693514624172337, "learning_rate": 1.1271186440677967e-05, "loss": 1.2365, "step": 266 }, { "epoch": 0.03400353407517073, "grad_norm": 4.465752509974851, "learning_rate": 1.13135593220339e-05, "loss": 1.2413, "step": 267 }, { "epoch": 0.03413088813537737, "grad_norm": 5.235194811033555, "learning_rate": 1.1355932203389833e-05, "loss": 1.161, "step": 268 }, { "epoch": 0.034258242195583995, "grad_norm": 6.032168619829575, "learning_rate": 1.1398305084745764e-05, "loss": 1.1826, "step": 269 }, { "epoch": 0.03438559625579063, "grad_norm": 6.792682345529315, "learning_rate": 1.1440677966101696e-05, "loss": 1.1542, "step": 270 }, { "epoch": 0.034512950315997264, "grad_norm": 6.934400261192159, "learning_rate": 1.148305084745763e-05, "loss": 1.2888, "step": 271 }, { "epoch": 0.03464030437620389, "grad_norm": 5.887182388323694, "learning_rate": 1.1525423728813561e-05, "loss": 1.2312, "step": 272 }, { "epoch": 0.034767658436410526, "grad_norm": 6.01624388050281, "learning_rate": 1.1567796610169493e-05, "loss": 1.2219, "step": 273 }, { "epoch": 0.03489501249661716, "grad_norm": 7.370642457700537, "learning_rate": 1.1610169491525424e-05, "loss": 1.2457, "step": 274 }, { "epoch": 0.03502236655682379, "grad_norm": 6.054900449884194, "learning_rate": 1.1652542372881356e-05, "loss": 1.1599, "step": 275 }, { "epoch": 0.03514972061703042, "grad_norm": 7.087314997551028, "learning_rate": 1.169491525423729e-05, "loss": 1.2703, "step": 276 }, { "epoch": 0.03527707467723706, "grad_norm": 6.126673567100073, "learning_rate": 1.1737288135593221e-05, "loss": 1.1172, "step": 277 }, { "epoch": 0.035404428737443684, "grad_norm": 6.316984189121061, "learning_rate": 1.1779661016949153e-05, "loss": 1.2458, "step": 278 }, { "epoch": 0.03553178279765032, "grad_norm": 7.160140799731012, "learning_rate": 1.1822033898305085e-05, "loss": 1.1956, "step": 279 }, { "epoch": 0.035659136857856946, "grad_norm": 6.894648823802736, "learning_rate": 1.1864406779661018e-05, "loss": 1.1181, "step": 280 }, { "epoch": 0.03578649091806358, "grad_norm": 6.664617446028264, "learning_rate": 1.190677966101695e-05, "loss": 1.1402, "step": 281 }, { "epoch": 0.035913844978270215, "grad_norm": 6.941991778114701, "learning_rate": 1.1949152542372882e-05, "loss": 1.1371, "step": 282 }, { "epoch": 0.03604119903847684, "grad_norm": 4.18847827017481, "learning_rate": 1.1991525423728813e-05, "loss": 1.1745, "step": 283 }, { "epoch": 0.03616855309868348, "grad_norm": 5.438227637703072, "learning_rate": 1.2033898305084745e-05, "loss": 1.2066, "step": 284 }, { "epoch": 0.03629590715889011, "grad_norm": 5.776477720439709, "learning_rate": 1.2076271186440678e-05, "loss": 1.0364, "step": 285 }, { "epoch": 0.03642326121909674, "grad_norm": 4.473684720727284, "learning_rate": 1.2118644067796612e-05, "loss": 1.1551, "step": 286 }, { "epoch": 0.036550615279303374, "grad_norm": 5.67647515168569, "learning_rate": 1.2161016949152544e-05, "loss": 1.2159, "step": 287 }, { "epoch": 0.03667796933951001, "grad_norm": 5.7037797943882, "learning_rate": 1.2203389830508477e-05, "loss": 1.2122, "step": 288 }, { "epoch": 0.036805323399716636, "grad_norm": 3.9661231771896124, "learning_rate": 1.2245762711864409e-05, "loss": 1.0775, "step": 289 }, { "epoch": 0.03693267745992327, "grad_norm": 6.025963240112854, "learning_rate": 1.228813559322034e-05, "loss": 1.2435, "step": 290 }, { "epoch": 0.0370600315201299, "grad_norm": 6.998972953655834, "learning_rate": 1.2330508474576272e-05, "loss": 1.2353, "step": 291 }, { "epoch": 0.03718738558033653, "grad_norm": 5.012810128147621, "learning_rate": 1.2372881355932205e-05, "loss": 1.1848, "step": 292 }, { "epoch": 0.03731473964054317, "grad_norm": 5.8944234970891864, "learning_rate": 1.2415254237288137e-05, "loss": 1.1496, "step": 293 }, { "epoch": 0.037442093700749794, "grad_norm": 6.154631581068166, "learning_rate": 1.2457627118644069e-05, "loss": 1.2476, "step": 294 }, { "epoch": 0.03756944776095643, "grad_norm": 8.244605872992144, "learning_rate": 1.25e-05, "loss": 1.2131, "step": 295 }, { "epoch": 0.03769680182116306, "grad_norm": 5.418418288060157, "learning_rate": 1.2542372881355932e-05, "loss": 1.1227, "step": 296 }, { "epoch": 0.03782415588136969, "grad_norm": 7.34352247819092, "learning_rate": 1.2584745762711866e-05, "loss": 1.0944, "step": 297 }, { "epoch": 0.037951509941576325, "grad_norm": 5.990661665555994, "learning_rate": 1.2627118644067797e-05, "loss": 1.1886, "step": 298 }, { "epoch": 0.03807886400178296, "grad_norm": 5.1388355607481895, "learning_rate": 1.2669491525423729e-05, "loss": 1.1942, "step": 299 }, { "epoch": 0.03820621806198959, "grad_norm": 5.576874237453807, "learning_rate": 1.2711864406779661e-05, "loss": 1.2666, "step": 300 }, { "epoch": 0.03833357212219622, "grad_norm": 6.33907171920546, "learning_rate": 1.2754237288135594e-05, "loss": 1.1965, "step": 301 }, { "epoch": 0.038460926182402856, "grad_norm": 6.752063859556231, "learning_rate": 1.2796610169491526e-05, "loss": 1.1443, "step": 302 }, { "epoch": 0.038588280242609484, "grad_norm": 5.783912499236838, "learning_rate": 1.2838983050847458e-05, "loss": 1.2141, "step": 303 }, { "epoch": 0.03871563430281612, "grad_norm": 9.393457985080918, "learning_rate": 1.288135593220339e-05, "loss": 1.227, "step": 304 }, { "epoch": 0.038842988363022746, "grad_norm": 3.728983414621008, "learning_rate": 1.2923728813559324e-05, "loss": 1.1179, "step": 305 }, { "epoch": 0.03897034242322938, "grad_norm": 6.896312623976467, "learning_rate": 1.2966101694915256e-05, "loss": 1.1986, "step": 306 }, { "epoch": 0.039097696483436015, "grad_norm": 5.28379985943732, "learning_rate": 1.3008474576271188e-05, "loss": 1.0703, "step": 307 }, { "epoch": 0.03922505054364264, "grad_norm": 7.615355599028728, "learning_rate": 1.305084745762712e-05, "loss": 1.2172, "step": 308 }, { "epoch": 0.03935240460384928, "grad_norm": 5.221403988975462, "learning_rate": 1.3093220338983053e-05, "loss": 1.0893, "step": 309 }, { "epoch": 0.03947975866405591, "grad_norm": 4.33216729737341, "learning_rate": 1.3135593220338985e-05, "loss": 1.0594, "step": 310 }, { "epoch": 0.03960711272426254, "grad_norm": 5.546533020653022, "learning_rate": 1.3177966101694916e-05, "loss": 1.0849, "step": 311 }, { "epoch": 0.03973446678446917, "grad_norm": 6.177659040601369, "learning_rate": 1.3220338983050848e-05, "loss": 1.1686, "step": 312 }, { "epoch": 0.03986182084467581, "grad_norm": 6.419279516083969, "learning_rate": 1.3262711864406782e-05, "loss": 1.1274, "step": 313 }, { "epoch": 0.039989174904882435, "grad_norm": 5.585204517851212, "learning_rate": 1.3305084745762713e-05, "loss": 1.1679, "step": 314 }, { "epoch": 0.04011652896508907, "grad_norm": 4.364550727716378, "learning_rate": 1.3347457627118645e-05, "loss": 1.1287, "step": 315 }, { "epoch": 0.0402438830252957, "grad_norm": 7.1135740518403, "learning_rate": 1.3389830508474577e-05, "loss": 1.1679, "step": 316 }, { "epoch": 0.04037123708550233, "grad_norm": 8.475439603763972, "learning_rate": 1.343220338983051e-05, "loss": 1.1221, "step": 317 }, { "epoch": 0.040498591145708966, "grad_norm": 6.599847803383956, "learning_rate": 1.3474576271186442e-05, "loss": 1.2375, "step": 318 }, { "epoch": 0.040625945205915594, "grad_norm": 5.760783160268134, "learning_rate": 1.3516949152542374e-05, "loss": 1.1498, "step": 319 }, { "epoch": 0.04075329926612223, "grad_norm": 5.623168816789539, "learning_rate": 1.3559322033898305e-05, "loss": 1.0415, "step": 320 }, { "epoch": 0.04088065332632886, "grad_norm": 5.145493633882771, "learning_rate": 1.3601694915254237e-05, "loss": 1.1116, "step": 321 }, { "epoch": 0.04100800738653549, "grad_norm": 6.643310865385602, "learning_rate": 1.364406779661017e-05, "loss": 1.3068, "step": 322 }, { "epoch": 0.041135361446742125, "grad_norm": 5.551218314953344, "learning_rate": 1.3686440677966102e-05, "loss": 1.2183, "step": 323 }, { "epoch": 0.04126271550694876, "grad_norm": 6.227722749099259, "learning_rate": 1.3728813559322034e-05, "loss": 1.281, "step": 324 }, { "epoch": 0.04139006956715539, "grad_norm": 7.083253598231905, "learning_rate": 1.3771186440677969e-05, "loss": 1.0694, "step": 325 }, { "epoch": 0.04151742362736202, "grad_norm": 5.469741818543489, "learning_rate": 1.38135593220339e-05, "loss": 1.2052, "step": 326 }, { "epoch": 0.04164477768756865, "grad_norm": 5.973156326146339, "learning_rate": 1.3855932203389832e-05, "loss": 1.2121, "step": 327 }, { "epoch": 0.04177213174777528, "grad_norm": 5.59625972090016, "learning_rate": 1.3898305084745764e-05, "loss": 1.1219, "step": 328 }, { "epoch": 0.04189948580798192, "grad_norm": 7.162602299730285, "learning_rate": 1.3940677966101697e-05, "loss": 1.1341, "step": 329 }, { "epoch": 0.042026839868188545, "grad_norm": 5.91873415790762, "learning_rate": 1.3983050847457629e-05, "loss": 1.1412, "step": 330 }, { "epoch": 0.04215419392839518, "grad_norm": 4.9801198560393685, "learning_rate": 1.4025423728813561e-05, "loss": 1.0811, "step": 331 }, { "epoch": 0.042281547988601814, "grad_norm": 7.04494488794065, "learning_rate": 1.4067796610169493e-05, "loss": 1.1825, "step": 332 }, { "epoch": 0.04240890204880844, "grad_norm": 4.976812191722208, "learning_rate": 1.4110169491525424e-05, "loss": 1.0741, "step": 333 }, { "epoch": 0.042536256109015076, "grad_norm": 5.810975233685298, "learning_rate": 1.4152542372881358e-05, "loss": 1.222, "step": 334 }, { "epoch": 0.04266361016922171, "grad_norm": 6.413104005442228, "learning_rate": 1.419491525423729e-05, "loss": 1.1863, "step": 335 }, { "epoch": 0.04279096422942834, "grad_norm": 5.016552230357741, "learning_rate": 1.4237288135593221e-05, "loss": 1.1668, "step": 336 }, { "epoch": 0.04291831828963497, "grad_norm": 3.332210844506039, "learning_rate": 1.4279661016949153e-05, "loss": 1.1309, "step": 337 }, { "epoch": 0.0430456723498416, "grad_norm": 5.386934736650668, "learning_rate": 1.4322033898305086e-05, "loss": 1.1814, "step": 338 }, { "epoch": 0.043173026410048235, "grad_norm": 6.338924282523835, "learning_rate": 1.4364406779661018e-05, "loss": 1.2045, "step": 339 }, { "epoch": 0.04330038047025487, "grad_norm": 7.035777079943078, "learning_rate": 1.440677966101695e-05, "loss": 1.1769, "step": 340 }, { "epoch": 0.0434277345304615, "grad_norm": 7.6157803688276555, "learning_rate": 1.4449152542372881e-05, "loss": 1.1015, "step": 341 }, { "epoch": 0.04355508859066813, "grad_norm": 5.101772800568265, "learning_rate": 1.4491525423728813e-05, "loss": 1.153, "step": 342 }, { "epoch": 0.043682442650874766, "grad_norm": 6.4044347656892215, "learning_rate": 1.4533898305084746e-05, "loss": 1.0963, "step": 343 }, { "epoch": 0.04380979671108139, "grad_norm": 7.158946233171024, "learning_rate": 1.4576271186440678e-05, "loss": 1.1785, "step": 344 }, { "epoch": 0.04393715077128803, "grad_norm": 5.614692399116299, "learning_rate": 1.4618644067796612e-05, "loss": 1.1204, "step": 345 }, { "epoch": 0.04406450483149466, "grad_norm": 7.491907204007989, "learning_rate": 1.4661016949152545e-05, "loss": 1.0839, "step": 346 }, { "epoch": 0.04419185889170129, "grad_norm": 6.626849797952939, "learning_rate": 1.4703389830508477e-05, "loss": 1.2041, "step": 347 }, { "epoch": 0.044319212951907924, "grad_norm": 6.108738078212482, "learning_rate": 1.4745762711864408e-05, "loss": 1.1178, "step": 348 }, { "epoch": 0.04444656701211455, "grad_norm": 5.6485756082916945, "learning_rate": 1.478813559322034e-05, "loss": 1.0452, "step": 349 }, { "epoch": 0.044573921072321186, "grad_norm": 6.370962322851192, "learning_rate": 1.4830508474576274e-05, "loss": 1.17, "step": 350 }, { "epoch": 0.04470127513252782, "grad_norm": 5.2103942862478325, "learning_rate": 1.4872881355932205e-05, "loss": 1.1943, "step": 351 }, { "epoch": 0.04482862919273445, "grad_norm": 5.756424673207367, "learning_rate": 1.4915254237288137e-05, "loss": 1.0939, "step": 352 }, { "epoch": 0.04495598325294108, "grad_norm": 5.8667551178477675, "learning_rate": 1.4957627118644069e-05, "loss": 1.1213, "step": 353 }, { "epoch": 0.04508333731314772, "grad_norm": 5.172829346899861, "learning_rate": 1.5000000000000002e-05, "loss": 1.079, "step": 354 }, { "epoch": 0.045210691373354345, "grad_norm": 4.573603040529301, "learning_rate": 1.5042372881355934e-05, "loss": 1.0208, "step": 355 }, { "epoch": 0.04533804543356098, "grad_norm": 4.8197230839623035, "learning_rate": 1.5084745762711865e-05, "loss": 1.1836, "step": 356 }, { "epoch": 0.045465399493767614, "grad_norm": 7.777097030397322, "learning_rate": 1.5127118644067797e-05, "loss": 1.1588, "step": 357 }, { "epoch": 0.04559275355397424, "grad_norm": 5.278754553281395, "learning_rate": 1.5169491525423729e-05, "loss": 1.0581, "step": 358 }, { "epoch": 0.045720107614180876, "grad_norm": 4.858938117795735, "learning_rate": 1.5211864406779662e-05, "loss": 1.0556, "step": 359 }, { "epoch": 0.0458474616743875, "grad_norm": 5.661545103286113, "learning_rate": 1.5254237288135594e-05, "loss": 1.1903, "step": 360 }, { "epoch": 0.04597481573459414, "grad_norm": 7.069885982905191, "learning_rate": 1.5296610169491526e-05, "loss": 1.1951, "step": 361 }, { "epoch": 0.04610216979480077, "grad_norm": 4.610622017492172, "learning_rate": 1.533898305084746e-05, "loss": 1.0968, "step": 362 }, { "epoch": 0.0462295238550074, "grad_norm": 5.0935084747906005, "learning_rate": 1.538135593220339e-05, "loss": 1.1064, "step": 363 }, { "epoch": 0.046356877915214034, "grad_norm": 4.494028155508151, "learning_rate": 1.5423728813559326e-05, "loss": 1.0162, "step": 364 }, { "epoch": 0.04648423197542067, "grad_norm": 5.347282268412717, "learning_rate": 1.5466101694915256e-05, "loss": 1.0498, "step": 365 }, { "epoch": 0.046611586035627296, "grad_norm": 6.447015340870528, "learning_rate": 1.550847457627119e-05, "loss": 1.0885, "step": 366 }, { "epoch": 0.04673894009583393, "grad_norm": 5.665803544376279, "learning_rate": 1.555084745762712e-05, "loss": 1.0579, "step": 367 }, { "epoch": 0.046866294156040565, "grad_norm": 6.497470363477434, "learning_rate": 1.5593220338983053e-05, "loss": 1.147, "step": 368 }, { "epoch": 0.04699364821624719, "grad_norm": 6.44022082264147, "learning_rate": 1.5635593220338986e-05, "loss": 1.1009, "step": 369 }, { "epoch": 0.04712100227645383, "grad_norm": 5.137514048462012, "learning_rate": 1.5677966101694916e-05, "loss": 1.0439, "step": 370 }, { "epoch": 0.047248356336660455, "grad_norm": 5.702533680817622, "learning_rate": 1.572033898305085e-05, "loss": 1.0592, "step": 371 }, { "epoch": 0.04737571039686709, "grad_norm": 5.280215190201436, "learning_rate": 1.576271186440678e-05, "loss": 1.1372, "step": 372 }, { "epoch": 0.047503064457073724, "grad_norm": 5.3070710210883725, "learning_rate": 1.5805084745762713e-05, "loss": 1.1866, "step": 373 }, { "epoch": 0.04763041851728035, "grad_norm": 5.790683946551362, "learning_rate": 1.5847457627118646e-05, "loss": 1.122, "step": 374 }, { "epoch": 0.047757772577486986, "grad_norm": 4.4342977225389655, "learning_rate": 1.5889830508474576e-05, "loss": 1.0376, "step": 375 }, { "epoch": 0.04788512663769362, "grad_norm": 7.361533850601036, "learning_rate": 1.593220338983051e-05, "loss": 1.1915, "step": 376 }, { "epoch": 0.04801248069790025, "grad_norm": 6.792856435960524, "learning_rate": 1.5974576271186443e-05, "loss": 1.13, "step": 377 }, { "epoch": 0.04813983475810688, "grad_norm": 5.3009787897284015, "learning_rate": 1.6016949152542373e-05, "loss": 1.1738, "step": 378 }, { "epoch": 0.04826718881831352, "grad_norm": 7.999690049842849, "learning_rate": 1.6059322033898307e-05, "loss": 1.1297, "step": 379 }, { "epoch": 0.048394542878520144, "grad_norm": 5.551331527032272, "learning_rate": 1.6101694915254237e-05, "loss": 1.2117, "step": 380 }, { "epoch": 0.04852189693872678, "grad_norm": 4.79399733321872, "learning_rate": 1.614406779661017e-05, "loss": 1.1036, "step": 381 }, { "epoch": 0.04864925099893341, "grad_norm": 5.685210049924944, "learning_rate": 1.6186440677966104e-05, "loss": 1.2222, "step": 382 }, { "epoch": 0.04877660505914004, "grad_norm": 8.528954948278576, "learning_rate": 1.6228813559322034e-05, "loss": 1.1457, "step": 383 }, { "epoch": 0.048903959119346675, "grad_norm": 5.27660350563353, "learning_rate": 1.6271186440677967e-05, "loss": 0.9738, "step": 384 }, { "epoch": 0.0490313131795533, "grad_norm": 8.813854789754753, "learning_rate": 1.63135593220339e-05, "loss": 1.1817, "step": 385 }, { "epoch": 0.04915866723975994, "grad_norm": 4.481348378738923, "learning_rate": 1.6355932203389834e-05, "loss": 1.0746, "step": 386 }, { "epoch": 0.04928602129996657, "grad_norm": 5.467818000940252, "learning_rate": 1.6398305084745764e-05, "loss": 1.087, "step": 387 }, { "epoch": 0.0494133753601732, "grad_norm": 6.6069025415732945, "learning_rate": 1.6440677966101697e-05, "loss": 1.1628, "step": 388 }, { "epoch": 0.049540729420379834, "grad_norm": 5.1796771893751075, "learning_rate": 1.648305084745763e-05, "loss": 1.1182, "step": 389 }, { "epoch": 0.04966808348058647, "grad_norm": 4.147969192537622, "learning_rate": 1.652542372881356e-05, "loss": 1.1918, "step": 390 }, { "epoch": 0.049795437540793096, "grad_norm": 7.153712193297355, "learning_rate": 1.6567796610169494e-05, "loss": 1.1153, "step": 391 }, { "epoch": 0.04992279160099973, "grad_norm": 6.5428372771306, "learning_rate": 1.6610169491525424e-05, "loss": 1.2467, "step": 392 }, { "epoch": 0.050050145661206365, "grad_norm": 6.720477498664704, "learning_rate": 1.6652542372881357e-05, "loss": 1.1426, "step": 393 }, { "epoch": 0.05017749972141299, "grad_norm": 5.490675564806118, "learning_rate": 1.669491525423729e-05, "loss": 1.1069, "step": 394 }, { "epoch": 0.05030485378161963, "grad_norm": 4.996180028819127, "learning_rate": 1.673728813559322e-05, "loss": 1.1453, "step": 395 }, { "epoch": 0.050432207841826254, "grad_norm": 6.543552037899987, "learning_rate": 1.6779661016949154e-05, "loss": 1.1048, "step": 396 }, { "epoch": 0.05055956190203289, "grad_norm": 6.797227170743476, "learning_rate": 1.6822033898305084e-05, "loss": 1.2534, "step": 397 }, { "epoch": 0.05068691596223952, "grad_norm": 6.369776960986534, "learning_rate": 1.6864406779661018e-05, "loss": 1.175, "step": 398 }, { "epoch": 0.05081427002244615, "grad_norm": 4.493815941253846, "learning_rate": 1.690677966101695e-05, "loss": 1.1982, "step": 399 }, { "epoch": 0.050941624082652785, "grad_norm": 4.799971882419884, "learning_rate": 1.694915254237288e-05, "loss": 1.1094, "step": 400 }, { "epoch": 0.05106897814285942, "grad_norm": 5.5972935199009175, "learning_rate": 1.6991525423728815e-05, "loss": 1.1708, "step": 401 }, { "epoch": 0.05119633220306605, "grad_norm": 5.317337885054691, "learning_rate": 1.7033898305084745e-05, "loss": 1.0627, "step": 402 }, { "epoch": 0.05132368626327268, "grad_norm": 5.190269190692385, "learning_rate": 1.7076271186440678e-05, "loss": 1.086, "step": 403 }, { "epoch": 0.051451040323479316, "grad_norm": 7.00350346766054, "learning_rate": 1.711864406779661e-05, "loss": 1.1158, "step": 404 }, { "epoch": 0.051578394383685944, "grad_norm": 5.506921747563612, "learning_rate": 1.7161016949152545e-05, "loss": 1.1417, "step": 405 }, { "epoch": 0.05170574844389258, "grad_norm": 5.838009549432716, "learning_rate": 1.7203389830508478e-05, "loss": 1.2006, "step": 406 }, { "epoch": 0.051833102504099206, "grad_norm": 6.087071911683631, "learning_rate": 1.7245762711864408e-05, "loss": 1.2103, "step": 407 }, { "epoch": 0.05196045656430584, "grad_norm": 6.958343810418135, "learning_rate": 1.728813559322034e-05, "loss": 1.1675, "step": 408 }, { "epoch": 0.052087810624512475, "grad_norm": 6.131831193883537, "learning_rate": 1.733050847457627e-05, "loss": 1.1751, "step": 409 }, { "epoch": 0.0522151646847191, "grad_norm": 6.429994225862797, "learning_rate": 1.7372881355932205e-05, "loss": 1.0883, "step": 410 }, { "epoch": 0.05234251874492574, "grad_norm": 5.681284096324905, "learning_rate": 1.741525423728814e-05, "loss": 1.072, "step": 411 }, { "epoch": 0.05246987280513237, "grad_norm": 5.152898635657918, "learning_rate": 1.745762711864407e-05, "loss": 1.1444, "step": 412 }, { "epoch": 0.052597226865339, "grad_norm": 8.122164657024648, "learning_rate": 1.7500000000000002e-05, "loss": 1.117, "step": 413 }, { "epoch": 0.05272458092554563, "grad_norm": 7.388012285107703, "learning_rate": 1.7542372881355932e-05, "loss": 1.1604, "step": 414 }, { "epoch": 0.05285193498575227, "grad_norm": 4.142600455576895, "learning_rate": 1.7584745762711865e-05, "loss": 1.0567, "step": 415 }, { "epoch": 0.052979289045958895, "grad_norm": 5.341916132123986, "learning_rate": 1.76271186440678e-05, "loss": 1.1664, "step": 416 }, { "epoch": 0.05310664310616553, "grad_norm": 4.098930284467975, "learning_rate": 1.766949152542373e-05, "loss": 1.0655, "step": 417 }, { "epoch": 0.05323399716637216, "grad_norm": 5.283165939679673, "learning_rate": 1.7711864406779662e-05, "loss": 1.0878, "step": 418 }, { "epoch": 0.05336135122657879, "grad_norm": 5.72671571034816, "learning_rate": 1.7754237288135596e-05, "loss": 1.1076, "step": 419 }, { "epoch": 0.053488705286785426, "grad_norm": 4.181638800501704, "learning_rate": 1.7796610169491526e-05, "loss": 1.0477, "step": 420 }, { "epoch": 0.053616059346992054, "grad_norm": 6.037518500068706, "learning_rate": 1.783898305084746e-05, "loss": 1.0744, "step": 421 }, { "epoch": 0.05374341340719869, "grad_norm": 5.908386812743721, "learning_rate": 1.788135593220339e-05, "loss": 1.2553, "step": 422 }, { "epoch": 0.05387076746740532, "grad_norm": 5.477098120428443, "learning_rate": 1.7923728813559326e-05, "loss": 1.1578, "step": 423 }, { "epoch": 0.05399812152761195, "grad_norm": 5.269725074907827, "learning_rate": 1.7966101694915256e-05, "loss": 1.0921, "step": 424 }, { "epoch": 0.054125475587818585, "grad_norm": 5.99473658806239, "learning_rate": 1.800847457627119e-05, "loss": 1.1249, "step": 425 }, { "epoch": 0.05425282964802522, "grad_norm": 5.212141226810042, "learning_rate": 1.805084745762712e-05, "loss": 1.0942, "step": 426 }, { "epoch": 0.05438018370823185, "grad_norm": 4.914321965340521, "learning_rate": 1.8093220338983053e-05, "loss": 1.0524, "step": 427 }, { "epoch": 0.05450753776843848, "grad_norm": 4.934122789640228, "learning_rate": 1.8135593220338986e-05, "loss": 1.0692, "step": 428 }, { "epoch": 0.05463489182864511, "grad_norm": 5.974731962823762, "learning_rate": 1.8177966101694916e-05, "loss": 1.0516, "step": 429 }, { "epoch": 0.05476224588885174, "grad_norm": 5.18332734102514, "learning_rate": 1.822033898305085e-05, "loss": 1.1025, "step": 430 }, { "epoch": 0.05488959994905838, "grad_norm": 4.61394209816082, "learning_rate": 1.8262711864406783e-05, "loss": 1.091, "step": 431 }, { "epoch": 0.055016954009265005, "grad_norm": 5.894652591010023, "learning_rate": 1.8305084745762713e-05, "loss": 1.1953, "step": 432 }, { "epoch": 0.05514430806947164, "grad_norm": 5.86790849951594, "learning_rate": 1.8347457627118646e-05, "loss": 1.0643, "step": 433 }, { "epoch": 0.055271662129678274, "grad_norm": 5.628648380088236, "learning_rate": 1.8389830508474576e-05, "loss": 1.119, "step": 434 }, { "epoch": 0.0553990161898849, "grad_norm": 4.991344207554816, "learning_rate": 1.843220338983051e-05, "loss": 1.0047, "step": 435 }, { "epoch": 0.055526370250091536, "grad_norm": 6.211890650133721, "learning_rate": 1.8474576271186443e-05, "loss": 1.0869, "step": 436 }, { "epoch": 0.05565372431029817, "grad_norm": 6.316762561456026, "learning_rate": 1.8516949152542373e-05, "loss": 1.0198, "step": 437 }, { "epoch": 0.0557810783705048, "grad_norm": 4.2256016229559235, "learning_rate": 1.8559322033898307e-05, "loss": 1.0982, "step": 438 }, { "epoch": 0.05590843243071143, "grad_norm": 7.262435245717664, "learning_rate": 1.8601694915254237e-05, "loss": 1.0517, "step": 439 }, { "epoch": 0.05603578649091806, "grad_norm": 5.585281184126876, "learning_rate": 1.864406779661017e-05, "loss": 1.2128, "step": 440 }, { "epoch": 0.056163140551124695, "grad_norm": 4.767034803637561, "learning_rate": 1.8686440677966103e-05, "loss": 1.1603, "step": 441 }, { "epoch": 0.05629049461133133, "grad_norm": 6.468924552294705, "learning_rate": 1.8728813559322033e-05, "loss": 1.2014, "step": 442 }, { "epoch": 0.05641784867153796, "grad_norm": 9.31974357153982, "learning_rate": 1.877118644067797e-05, "loss": 1.0906, "step": 443 }, { "epoch": 0.05654520273174459, "grad_norm": 6.012844007647524, "learning_rate": 1.88135593220339e-05, "loss": 1.1387, "step": 444 }, { "epoch": 0.056672556791951226, "grad_norm": 4.083626146262829, "learning_rate": 1.8855932203389834e-05, "loss": 1.1401, "step": 445 }, { "epoch": 0.05679991085215785, "grad_norm": 3.9173329104567634, "learning_rate": 1.8898305084745764e-05, "loss": 1.0576, "step": 446 }, { "epoch": 0.05692726491236449, "grad_norm": 5.714794354280716, "learning_rate": 1.8940677966101697e-05, "loss": 0.975, "step": 447 }, { "epoch": 0.05705461897257112, "grad_norm": 7.531880079322617, "learning_rate": 1.898305084745763e-05, "loss": 1.1463, "step": 448 }, { "epoch": 0.05718197303277775, "grad_norm": 4.742736684054098, "learning_rate": 1.902542372881356e-05, "loss": 0.9861, "step": 449 }, { "epoch": 0.057309327092984384, "grad_norm": 4.8112771598542885, "learning_rate": 1.9067796610169494e-05, "loss": 1.1611, "step": 450 }, { "epoch": 0.05743668115319101, "grad_norm": 6.124219883654539, "learning_rate": 1.9110169491525424e-05, "loss": 1.1772, "step": 451 }, { "epoch": 0.057564035213397646, "grad_norm": 6.490103891067207, "learning_rate": 1.9152542372881357e-05, "loss": 1.1467, "step": 452 }, { "epoch": 0.05769138927360428, "grad_norm": 4.884413799929866, "learning_rate": 1.919491525423729e-05, "loss": 1.0313, "step": 453 }, { "epoch": 0.05781874333381091, "grad_norm": 17.31884120667972, "learning_rate": 1.923728813559322e-05, "loss": 1.0885, "step": 454 }, { "epoch": 0.05794609739401754, "grad_norm": 6.044814909695786, "learning_rate": 1.9279661016949154e-05, "loss": 1.0058, "step": 455 }, { "epoch": 0.05807345145422418, "grad_norm": 6.115362876923583, "learning_rate": 1.9322033898305087e-05, "loss": 1.1625, "step": 456 }, { "epoch": 0.058200805514430805, "grad_norm": 7.167009685498636, "learning_rate": 1.9364406779661017e-05, "loss": 1.2125, "step": 457 }, { "epoch": 0.05832815957463744, "grad_norm": 8.321018070875693, "learning_rate": 1.940677966101695e-05, "loss": 1.2028, "step": 458 }, { "epoch": 0.058455513634844074, "grad_norm": 5.883357171000457, "learning_rate": 1.944915254237288e-05, "loss": 1.0797, "step": 459 }, { "epoch": 0.0585828676950507, "grad_norm": 6.548593372992014, "learning_rate": 1.9491525423728814e-05, "loss": 1.2657, "step": 460 }, { "epoch": 0.058710221755257336, "grad_norm": 6.503332017630883, "learning_rate": 1.9533898305084748e-05, "loss": 1.1852, "step": 461 }, { "epoch": 0.05883757581546396, "grad_norm": 5.406696984660371, "learning_rate": 1.9576271186440678e-05, "loss": 1.0883, "step": 462 }, { "epoch": 0.0589649298756706, "grad_norm": 6.705726669953014, "learning_rate": 1.961864406779661e-05, "loss": 1.0506, "step": 463 }, { "epoch": 0.05909228393587723, "grad_norm": 4.629241235900978, "learning_rate": 1.9661016949152545e-05, "loss": 1.0315, "step": 464 }, { "epoch": 0.05921963799608386, "grad_norm": 5.078040676884035, "learning_rate": 1.9703389830508478e-05, "loss": 1.1109, "step": 465 }, { "epoch": 0.059346992056290494, "grad_norm": 7.199572698841122, "learning_rate": 1.9745762711864408e-05, "loss": 1.0686, "step": 466 }, { "epoch": 0.05947434611649713, "grad_norm": 6.04655483295243, "learning_rate": 1.978813559322034e-05, "loss": 1.2287, "step": 467 }, { "epoch": 0.059601700176703756, "grad_norm": 4.20413933182076, "learning_rate": 1.9830508474576275e-05, "loss": 1.1357, "step": 468 }, { "epoch": 0.05972905423691039, "grad_norm": 5.884048636001412, "learning_rate": 1.9872881355932205e-05, "loss": 1.0665, "step": 469 }, { "epoch": 0.059856408297117025, "grad_norm": 4.659764544387501, "learning_rate": 1.9915254237288138e-05, "loss": 1.0815, "step": 470 }, { "epoch": 0.05998376235732365, "grad_norm": 5.519786788305731, "learning_rate": 1.9957627118644068e-05, "loss": 1.0704, "step": 471 }, { "epoch": 0.06011111641753029, "grad_norm": 8.08682390597387, "learning_rate": 2e-05, "loss": 1.0464, "step": 472 }, { "epoch": 0.06023847047773692, "grad_norm": 5.6181193336206166, "learning_rate": 1.9999999787305685e-05, "loss": 1.0311, "step": 473 }, { "epoch": 0.06036582453794355, "grad_norm": 4.519360327229284, "learning_rate": 1.9999999149222745e-05, "loss": 1.0972, "step": 474 }, { "epoch": 0.060493178598150184, "grad_norm": 6.846898856876654, "learning_rate": 1.9999998085751203e-05, "loss": 1.1147, "step": 475 }, { "epoch": 0.06062053265835681, "grad_norm": 5.986633352740627, "learning_rate": 1.9999996596891108e-05, "loss": 1.1468, "step": 476 }, { "epoch": 0.060747886718563446, "grad_norm": 4.984324679225318, "learning_rate": 1.999999468264253e-05, "loss": 1.1052, "step": 477 }, { "epoch": 0.06087524077877008, "grad_norm": 7.075737893813151, "learning_rate": 1.999999234300554e-05, "loss": 1.0461, "step": 478 }, { "epoch": 0.06100259483897671, "grad_norm": 5.73308430212171, "learning_rate": 1.9999989577980245e-05, "loss": 1.2228, "step": 479 }, { "epoch": 0.06112994889918334, "grad_norm": 6.053575376693396, "learning_rate": 1.9999986387566755e-05, "loss": 1.2758, "step": 480 }, { "epoch": 0.06125730295938998, "grad_norm": 6.078084832772162, "learning_rate": 1.9999982771765212e-05, "loss": 1.1119, "step": 481 }, { "epoch": 0.061384657019596604, "grad_norm": 6.797968774241789, "learning_rate": 1.9999978730575768e-05, "loss": 1.0882, "step": 482 }, { "epoch": 0.06151201107980324, "grad_norm": 6.273452093694109, "learning_rate": 1.999997426399859e-05, "loss": 1.1447, "step": 483 }, { "epoch": 0.06163936514000987, "grad_norm": 7.10284083310267, "learning_rate": 1.999996937203388e-05, "loss": 0.9969, "step": 484 }, { "epoch": 0.0617667192002165, "grad_norm": 5.530254727061982, "learning_rate": 1.9999964054681835e-05, "loss": 1.1789, "step": 485 }, { "epoch": 0.061894073260423135, "grad_norm": 4.669864489858105, "learning_rate": 1.9999958311942685e-05, "loss": 1.1693, "step": 486 }, { "epoch": 0.06202142732062976, "grad_norm": 5.757957922497859, "learning_rate": 1.9999952143816677e-05, "loss": 1.1089, "step": 487 }, { "epoch": 0.0621487813808364, "grad_norm": 7.478914502779971, "learning_rate": 1.999994555030407e-05, "loss": 1.1695, "step": 488 }, { "epoch": 0.06227613544104303, "grad_norm": 7.5854044450973825, "learning_rate": 1.9999938531405142e-05, "loss": 1.1188, "step": 489 }, { "epoch": 0.06240348950124966, "grad_norm": 7.252243747122389, "learning_rate": 1.9999931087120198e-05, "loss": 1.1036, "step": 490 }, { "epoch": 0.06253084356145629, "grad_norm": 5.098055652157508, "learning_rate": 1.999992321744955e-05, "loss": 1.1222, "step": 491 }, { "epoch": 0.06265819762166293, "grad_norm": 5.617857743175649, "learning_rate": 1.9999914922393536e-05, "loss": 1.0715, "step": 492 }, { "epoch": 0.06278555168186956, "grad_norm": 4.719171568373844, "learning_rate": 1.9999906201952507e-05, "loss": 1.0764, "step": 493 }, { "epoch": 0.06291290574207618, "grad_norm": 6.104534602441233, "learning_rate": 1.9999897056126832e-05, "loss": 1.2059, "step": 494 }, { "epoch": 0.06304025980228282, "grad_norm": 6.722891165636998, "learning_rate": 1.9999887484916902e-05, "loss": 1.1001, "step": 495 }, { "epoch": 0.06316761386248945, "grad_norm": 5.75820105806048, "learning_rate": 1.999987748832313e-05, "loss": 1.0382, "step": 496 }, { "epoch": 0.06329496792269608, "grad_norm": 5.701985786262386, "learning_rate": 1.9999867066345927e-05, "loss": 1.0952, "step": 497 }, { "epoch": 0.06342232198290272, "grad_norm": 4.50835236964317, "learning_rate": 1.9999856218985753e-05, "loss": 1.0516, "step": 498 }, { "epoch": 0.06354967604310935, "grad_norm": 6.243899305236716, "learning_rate": 1.9999844946243055e-05, "loss": 0.9709, "step": 499 }, { "epoch": 0.06367703010331598, "grad_norm": 5.025162325082904, "learning_rate": 1.999983324811832e-05, "loss": 1.0088, "step": 500 }, { "epoch": 0.06380438416352262, "grad_norm": 4.394774081832498, "learning_rate": 1.9999821124612047e-05, "loss": 1.0979, "step": 501 }, { "epoch": 0.06393173822372925, "grad_norm": 5.236599122276942, "learning_rate": 1.9999808575724747e-05, "loss": 1.1553, "step": 502 }, { "epoch": 0.06405909228393587, "grad_norm": 5.490313235937539, "learning_rate": 1.9999795601456955e-05, "loss": 1.0991, "step": 503 }, { "epoch": 0.06418644634414251, "grad_norm": 7.443317350150323, "learning_rate": 1.9999782201809227e-05, "loss": 1.029, "step": 504 }, { "epoch": 0.06431380040434914, "grad_norm": 6.019639298288074, "learning_rate": 1.9999768376782126e-05, "loss": 1.0034, "step": 505 }, { "epoch": 0.06444115446455577, "grad_norm": 5.986425302535064, "learning_rate": 1.9999754126376247e-05, "loss": 1.2177, "step": 506 }, { "epoch": 0.06456850852476241, "grad_norm": 6.693771701353711, "learning_rate": 1.999973945059219e-05, "loss": 1.1051, "step": 507 }, { "epoch": 0.06469586258496904, "grad_norm": 5.945756812918701, "learning_rate": 1.9999724349430588e-05, "loss": 1.1274, "step": 508 }, { "epoch": 0.06482321664517567, "grad_norm": 5.769953067264716, "learning_rate": 1.9999708822892074e-05, "loss": 1.1091, "step": 509 }, { "epoch": 0.06495057070538231, "grad_norm": 5.927645566674118, "learning_rate": 1.999969287097731e-05, "loss": 1.229, "step": 510 }, { "epoch": 0.06507792476558893, "grad_norm": 5.822752084151968, "learning_rate": 1.9999676493686982e-05, "loss": 1.068, "step": 511 }, { "epoch": 0.06520527882579556, "grad_norm": 8.400925321930224, "learning_rate": 1.9999659691021783e-05, "loss": 1.1987, "step": 512 }, { "epoch": 0.0653326328860022, "grad_norm": 3.869302190712713, "learning_rate": 1.999964246298242e-05, "loss": 1.0111, "step": 513 }, { "epoch": 0.06545998694620883, "grad_norm": 8.700792421229472, "learning_rate": 1.9999624809569635e-05, "loss": 1.0577, "step": 514 }, { "epoch": 0.06558734100641546, "grad_norm": 5.010001669634086, "learning_rate": 1.9999606730784178e-05, "loss": 1.1465, "step": 515 }, { "epoch": 0.06571469506662209, "grad_norm": 8.145420177808317, "learning_rate": 1.9999588226626814e-05, "loss": 1.1327, "step": 516 }, { "epoch": 0.06584204912682873, "grad_norm": 6.193503193792947, "learning_rate": 1.9999569297098334e-05, "loss": 1.144, "step": 517 }, { "epoch": 0.06596940318703536, "grad_norm": 7.465247190743016, "learning_rate": 1.999954994219954e-05, "loss": 1.1173, "step": 518 }, { "epoch": 0.06609675724724198, "grad_norm": 6.750553814809614, "learning_rate": 1.999953016193126e-05, "loss": 1.1759, "step": 519 }, { "epoch": 0.06622411130744862, "grad_norm": 4.564339045028179, "learning_rate": 1.9999509956294324e-05, "loss": 1.0987, "step": 520 }, { "epoch": 0.06635146536765525, "grad_norm": 5.428131802762721, "learning_rate": 1.9999489325289607e-05, "loss": 1.124, "step": 521 }, { "epoch": 0.06647881942786188, "grad_norm": 7.130879168351056, "learning_rate": 1.9999468268917978e-05, "loss": 1.008, "step": 522 }, { "epoch": 0.06660617348806852, "grad_norm": 6.866731977923751, "learning_rate": 1.9999446787180338e-05, "loss": 1.1246, "step": 523 }, { "epoch": 0.06673352754827515, "grad_norm": 5.66268234801421, "learning_rate": 1.9999424880077592e-05, "loss": 1.1667, "step": 524 }, { "epoch": 0.06686088160848178, "grad_norm": 6.135632375693495, "learning_rate": 1.999940254761068e-05, "loss": 1.1069, "step": 525 }, { "epoch": 0.06698823566868842, "grad_norm": 6.9654472975314015, "learning_rate": 1.9999379789780543e-05, "loss": 1.2112, "step": 526 }, { "epoch": 0.06711558972889504, "grad_norm": 4.0917355771810735, "learning_rate": 1.999935660658816e-05, "loss": 1.0041, "step": 527 }, { "epoch": 0.06724294378910167, "grad_norm": 5.299854262165641, "learning_rate": 1.9999332998034515e-05, "loss": 1.1039, "step": 528 }, { "epoch": 0.06737029784930831, "grad_norm": 5.03714288624225, "learning_rate": 1.9999308964120604e-05, "loss": 1.0356, "step": 529 }, { "epoch": 0.06749765190951494, "grad_norm": 5.018361328580709, "learning_rate": 1.999928450484746e-05, "loss": 1.0958, "step": 530 }, { "epoch": 0.06762500596972157, "grad_norm": 4.259050045925872, "learning_rate": 1.9999259620216113e-05, "loss": 1.0695, "step": 531 }, { "epoch": 0.06775236002992821, "grad_norm": 4.36079760893954, "learning_rate": 1.999923431022763e-05, "loss": 1.1662, "step": 532 }, { "epoch": 0.06787971409013484, "grad_norm": 5.849711758285983, "learning_rate": 1.9999208574883086e-05, "loss": 1.1017, "step": 533 }, { "epoch": 0.06800706815034147, "grad_norm": 4.771071826465309, "learning_rate": 1.999918241418357e-05, "loss": 1.0827, "step": 534 }, { "epoch": 0.0681344222105481, "grad_norm": 4.811774062854366, "learning_rate": 1.99991558281302e-05, "loss": 1.0242, "step": 535 }, { "epoch": 0.06826177627075473, "grad_norm": 6.02176026421573, "learning_rate": 1.999912881672411e-05, "loss": 1.0808, "step": 536 }, { "epoch": 0.06838913033096136, "grad_norm": 5.9725923802994405, "learning_rate": 1.999910137996644e-05, "loss": 1.0689, "step": 537 }, { "epoch": 0.06851648439116799, "grad_norm": 4.913192016936453, "learning_rate": 1.9999073517858365e-05, "loss": 1.015, "step": 538 }, { "epoch": 0.06864383845137463, "grad_norm": 5.977263286120293, "learning_rate": 1.999904523040107e-05, "loss": 1.1264, "step": 539 }, { "epoch": 0.06877119251158126, "grad_norm": 6.274222774929446, "learning_rate": 1.9999016517595752e-05, "loss": 1.0752, "step": 540 }, { "epoch": 0.06889854657178789, "grad_norm": 5.376708180660854, "learning_rate": 1.999898737944364e-05, "loss": 1.0579, "step": 541 }, { "epoch": 0.06902590063199453, "grad_norm": 5.978380991215184, "learning_rate": 1.9998957815945962e-05, "loss": 0.9844, "step": 542 }, { "epoch": 0.06915325469220115, "grad_norm": 5.110581375165403, "learning_rate": 1.999892782710399e-05, "loss": 1.1507, "step": 543 }, { "epoch": 0.06928060875240778, "grad_norm": 5.5490173500932345, "learning_rate": 1.999889741291899e-05, "loss": 1.1175, "step": 544 }, { "epoch": 0.06940796281261442, "grad_norm": 6.375743448626605, "learning_rate": 1.9998866573392265e-05, "loss": 1.1339, "step": 545 }, { "epoch": 0.06953531687282105, "grad_norm": 5.936784560617764, "learning_rate": 1.9998835308525115e-05, "loss": 0.9726, "step": 546 }, { "epoch": 0.06966267093302768, "grad_norm": 6.062258371473456, "learning_rate": 1.9998803618318873e-05, "loss": 1.0518, "step": 547 }, { "epoch": 0.06979002499323432, "grad_norm": 5.482671141521196, "learning_rate": 1.9998771502774895e-05, "loss": 1.1076, "step": 548 }, { "epoch": 0.06991737905344095, "grad_norm": 6.91431518782602, "learning_rate": 1.9998738961894538e-05, "loss": 1.2298, "step": 549 }, { "epoch": 0.07004473311364758, "grad_norm": 8.527637205784295, "learning_rate": 1.9998705995679195e-05, "loss": 1.1865, "step": 550 }, { "epoch": 0.07017208717385422, "grad_norm": 5.06632813192715, "learning_rate": 1.999867260413026e-05, "loss": 1.1329, "step": 551 }, { "epoch": 0.07029944123406084, "grad_norm": 6.611392459528193, "learning_rate": 1.9998638787249158e-05, "loss": 1.023, "step": 552 }, { "epoch": 0.07042679529426747, "grad_norm": 4.434471493854659, "learning_rate": 1.9998604545037325e-05, "loss": 1.0204, "step": 553 }, { "epoch": 0.07055414935447411, "grad_norm": 5.885039821026563, "learning_rate": 1.999856987749622e-05, "loss": 1.1628, "step": 554 }, { "epoch": 0.07068150341468074, "grad_norm": 7.757189707425133, "learning_rate": 1.999853478462732e-05, "loss": 1.0407, "step": 555 }, { "epoch": 0.07080885747488737, "grad_norm": 5.097586842499232, "learning_rate": 1.999849926643211e-05, "loss": 1.0845, "step": 556 }, { "epoch": 0.07093621153509401, "grad_norm": 6.9215922760756765, "learning_rate": 1.999846332291211e-05, "loss": 1.0299, "step": 557 }, { "epoch": 0.07106356559530064, "grad_norm": 4.351081643511859, "learning_rate": 1.9998426954068837e-05, "loss": 1.035, "step": 558 }, { "epoch": 0.07119091965550726, "grad_norm": 5.114464514362844, "learning_rate": 1.9998390159903853e-05, "loss": 1.0353, "step": 559 }, { "epoch": 0.07131827371571389, "grad_norm": 7.082369241365838, "learning_rate": 1.9998352940418713e-05, "loss": 0.9955, "step": 560 }, { "epoch": 0.07144562777592053, "grad_norm": 4.362420880931001, "learning_rate": 1.9998315295615002e-05, "loss": 1.1176, "step": 561 }, { "epoch": 0.07157298183612716, "grad_norm": 6.0660644749385355, "learning_rate": 1.999827722549432e-05, "loss": 1.1078, "step": 562 }, { "epoch": 0.07170033589633379, "grad_norm": 4.456082830731038, "learning_rate": 1.99982387300583e-05, "loss": 1.0612, "step": 563 }, { "epoch": 0.07182768995654043, "grad_norm": 5.829329542602628, "learning_rate": 1.9998199809308558e-05, "loss": 1.0402, "step": 564 }, { "epoch": 0.07195504401674706, "grad_norm": 5.921042673202974, "learning_rate": 1.9998160463246762e-05, "loss": 1.15, "step": 565 }, { "epoch": 0.07208239807695369, "grad_norm": 5.450573868242138, "learning_rate": 1.9998120691874586e-05, "loss": 1.1177, "step": 566 }, { "epoch": 0.07220975213716033, "grad_norm": 5.703991131790351, "learning_rate": 1.9998080495193716e-05, "loss": 1.1882, "step": 567 }, { "epoch": 0.07233710619736695, "grad_norm": 5.68231568533917, "learning_rate": 1.9998039873205866e-05, "loss": 1.0749, "step": 568 }, { "epoch": 0.07246446025757358, "grad_norm": 4.717088015083142, "learning_rate": 1.999799882591277e-05, "loss": 1.1654, "step": 569 }, { "epoch": 0.07259181431778022, "grad_norm": 5.238015073517867, "learning_rate": 1.999795735331616e-05, "loss": 1.0809, "step": 570 }, { "epoch": 0.07271916837798685, "grad_norm": 6.21417528126376, "learning_rate": 1.9997915455417813e-05, "loss": 1.1254, "step": 571 }, { "epoch": 0.07284652243819348, "grad_norm": 4.179097870209438, "learning_rate": 1.9997873132219502e-05, "loss": 1.0508, "step": 572 }, { "epoch": 0.07297387649840012, "grad_norm": 6.321086520222815, "learning_rate": 1.9997830383723034e-05, "loss": 1.1056, "step": 573 }, { "epoch": 0.07310123055860675, "grad_norm": 5.323689961360213, "learning_rate": 1.9997787209930222e-05, "loss": 1.1261, "step": 574 }, { "epoch": 0.07322858461881337, "grad_norm": 5.293979089457133, "learning_rate": 1.999774361084291e-05, "loss": 1.075, "step": 575 }, { "epoch": 0.07335593867902002, "grad_norm": 6.901915419823858, "learning_rate": 1.9997699586462947e-05, "loss": 1.0958, "step": 576 }, { "epoch": 0.07348329273922664, "grad_norm": 4.789539008525201, "learning_rate": 1.9997655136792206e-05, "loss": 1.0798, "step": 577 }, { "epoch": 0.07361064679943327, "grad_norm": 6.839564328066072, "learning_rate": 1.999761026183258e-05, "loss": 1.0093, "step": 578 }, { "epoch": 0.07373800085963991, "grad_norm": 6.244943022517674, "learning_rate": 1.9997564961585976e-05, "loss": 1.06, "step": 579 }, { "epoch": 0.07386535491984654, "grad_norm": 3.9667931485780747, "learning_rate": 1.999751923605432e-05, "loss": 1.0282, "step": 580 }, { "epoch": 0.07399270898005317, "grad_norm": 6.350330566452636, "learning_rate": 1.999747308523956e-05, "loss": 1.1528, "step": 581 }, { "epoch": 0.0741200630402598, "grad_norm": 5.164516042394334, "learning_rate": 1.999742650914366e-05, "loss": 1.031, "step": 582 }, { "epoch": 0.07424741710046644, "grad_norm": 4.923610417638436, "learning_rate": 1.9997379507768596e-05, "loss": 1.0159, "step": 583 }, { "epoch": 0.07437477116067306, "grad_norm": 5.076996381492745, "learning_rate": 1.9997332081116374e-05, "loss": 1.1246, "step": 584 }, { "epoch": 0.07450212522087969, "grad_norm": 3.4257266867804166, "learning_rate": 1.9997284229189008e-05, "loss": 1.1539, "step": 585 }, { "epoch": 0.07462947928108633, "grad_norm": 5.9256759966138635, "learning_rate": 1.9997235951988533e-05, "loss": 1.0759, "step": 586 }, { "epoch": 0.07475683334129296, "grad_norm": 5.6626644961145844, "learning_rate": 1.9997187249517004e-05, "loss": 1.1168, "step": 587 }, { "epoch": 0.07488418740149959, "grad_norm": 5.7046274374791075, "learning_rate": 1.999713812177649e-05, "loss": 0.9921, "step": 588 }, { "epoch": 0.07501154146170623, "grad_norm": 5.994923669426167, "learning_rate": 1.9997088568769084e-05, "loss": 1.2166, "step": 589 }, { "epoch": 0.07513889552191286, "grad_norm": 5.193736544338015, "learning_rate": 1.9997038590496892e-05, "loss": 0.9917, "step": 590 }, { "epoch": 0.07526624958211948, "grad_norm": 4.396294351179956, "learning_rate": 1.9996988186962044e-05, "loss": 1.0891, "step": 591 }, { "epoch": 0.07539360364232613, "grad_norm": 5.49995734458335, "learning_rate": 1.999693735816668e-05, "loss": 0.9993, "step": 592 }, { "epoch": 0.07552095770253275, "grad_norm": 5.253398884946806, "learning_rate": 1.9996886104112963e-05, "loss": 1.0713, "step": 593 }, { "epoch": 0.07564831176273938, "grad_norm": 4.642680011765876, "learning_rate": 1.9996834424803074e-05, "loss": 1.0839, "step": 594 }, { "epoch": 0.07577566582294602, "grad_norm": 6.37376182350069, "learning_rate": 1.999678232023921e-05, "loss": 1.1269, "step": 595 }, { "epoch": 0.07590301988315265, "grad_norm": 5.763360134317636, "learning_rate": 1.9996729790423588e-05, "loss": 1.1208, "step": 596 }, { "epoch": 0.07603037394335928, "grad_norm": 5.206828680769963, "learning_rate": 1.9996676835358443e-05, "loss": 1.0223, "step": 597 }, { "epoch": 0.07615772800356592, "grad_norm": 4.44025776020388, "learning_rate": 1.999662345504603e-05, "loss": 1.07, "step": 598 }, { "epoch": 0.07628508206377255, "grad_norm": 4.782135127297473, "learning_rate": 1.9996569649488614e-05, "loss": 1.0387, "step": 599 }, { "epoch": 0.07641243612397917, "grad_norm": 5.107139915137725, "learning_rate": 1.9996515418688493e-05, "loss": 1.1048, "step": 600 }, { "epoch": 0.07653979018418582, "grad_norm": 5.9873211732412495, "learning_rate": 1.9996460762647962e-05, "loss": 1.0631, "step": 601 }, { "epoch": 0.07666714424439244, "grad_norm": 5.043658764095534, "learning_rate": 1.9996405681369353e-05, "loss": 1.0388, "step": 602 }, { "epoch": 0.07679449830459907, "grad_norm": 4.477337036234279, "learning_rate": 1.999635017485501e-05, "loss": 1.0678, "step": 603 }, { "epoch": 0.07692185236480571, "grad_norm": 3.4802276151035993, "learning_rate": 1.999629424310729e-05, "loss": 1.0386, "step": 604 }, { "epoch": 0.07704920642501234, "grad_norm": 6.354251381564723, "learning_rate": 1.999623788612858e-05, "loss": 1.0509, "step": 605 }, { "epoch": 0.07717656048521897, "grad_norm": 7.521001783969517, "learning_rate": 1.9996181103921268e-05, "loss": 1.0353, "step": 606 }, { "epoch": 0.0773039145454256, "grad_norm": 5.629739248359252, "learning_rate": 1.9996123896487775e-05, "loss": 1.0046, "step": 607 }, { "epoch": 0.07743126860563224, "grad_norm": 5.746840230765029, "learning_rate": 1.9996066263830533e-05, "loss": 1.0223, "step": 608 }, { "epoch": 0.07755862266583886, "grad_norm": 6.298985584272212, "learning_rate": 1.999600820595199e-05, "loss": 1.0892, "step": 609 }, { "epoch": 0.07768597672604549, "grad_norm": 6.803573780037953, "learning_rate": 1.999594972285462e-05, "loss": 1.0924, "step": 610 }, { "epoch": 0.07781333078625213, "grad_norm": 6.37358376722561, "learning_rate": 1.9995890814540915e-05, "loss": 1.0744, "step": 611 }, { "epoch": 0.07794068484645876, "grad_norm": 9.007917100839986, "learning_rate": 1.9995831481013376e-05, "loss": 1.2104, "step": 612 }, { "epoch": 0.07806803890666539, "grad_norm": 8.287857126219594, "learning_rate": 1.999577172227452e-05, "loss": 1.0484, "step": 613 }, { "epoch": 0.07819539296687203, "grad_norm": 4.04299092309741, "learning_rate": 1.9995711538326903e-05, "loss": 0.9676, "step": 614 }, { "epoch": 0.07832274702707866, "grad_norm": 6.571260544897439, "learning_rate": 1.9995650929173075e-05, "loss": 1.0512, "step": 615 }, { "epoch": 0.07845010108728528, "grad_norm": 6.1484744156888915, "learning_rate": 1.999558989481562e-05, "loss": 1.1788, "step": 616 }, { "epoch": 0.07857745514749193, "grad_norm": 5.316568230621563, "learning_rate": 1.999552843525713e-05, "loss": 1.0587, "step": 617 }, { "epoch": 0.07870480920769855, "grad_norm": 4.570047484971308, "learning_rate": 1.999546655050022e-05, "loss": 1.0535, "step": 618 }, { "epoch": 0.07883216326790518, "grad_norm": 6.1608163386830785, "learning_rate": 1.9995404240547527e-05, "loss": 1.0144, "step": 619 }, { "epoch": 0.07895951732811182, "grad_norm": 7.7568651371924595, "learning_rate": 1.9995341505401697e-05, "loss": 1.0017, "step": 620 }, { "epoch": 0.07908687138831845, "grad_norm": 5.0343224284140184, "learning_rate": 1.99952783450654e-05, "loss": 0.9866, "step": 621 }, { "epoch": 0.07921422544852508, "grad_norm": 4.733892334358776, "learning_rate": 1.999521475954132e-05, "loss": 1.0346, "step": 622 }, { "epoch": 0.07934157950873172, "grad_norm": 8.245032462200156, "learning_rate": 1.9995150748832167e-05, "loss": 1.0559, "step": 623 }, { "epoch": 0.07946893356893835, "grad_norm": 5.288275385428034, "learning_rate": 1.9995086312940665e-05, "loss": 1.1287, "step": 624 }, { "epoch": 0.07959628762914497, "grad_norm": 4.692547128203939, "learning_rate": 1.9995021451869548e-05, "loss": 1.0788, "step": 625 }, { "epoch": 0.07972364168935162, "grad_norm": 5.274171136757848, "learning_rate": 1.999495616562158e-05, "loss": 1.0668, "step": 626 }, { "epoch": 0.07985099574955824, "grad_norm": 4.503948598565, "learning_rate": 1.9994890454199537e-05, "loss": 0.9856, "step": 627 }, { "epoch": 0.07997834980976487, "grad_norm": 4.458866099436066, "learning_rate": 1.999482431760621e-05, "loss": 1.1141, "step": 628 }, { "epoch": 0.0801057038699715, "grad_norm": 5.021075273227246, "learning_rate": 1.9994757755844422e-05, "loss": 1.0495, "step": 629 }, { "epoch": 0.08023305793017814, "grad_norm": 6.947131370381861, "learning_rate": 1.9994690768916997e-05, "loss": 0.98, "step": 630 }, { "epoch": 0.08036041199038477, "grad_norm": 7.41009293017886, "learning_rate": 1.999462335682679e-05, "loss": 1.1349, "step": 631 }, { "epoch": 0.0804877660505914, "grad_norm": 5.172513156024174, "learning_rate": 1.999455551957666e-05, "loss": 1.0966, "step": 632 }, { "epoch": 0.08061512011079804, "grad_norm": 5.951912183195801, "learning_rate": 1.99944872571695e-05, "loss": 1.0624, "step": 633 }, { "epoch": 0.08074247417100466, "grad_norm": 7.68449951170285, "learning_rate": 1.9994418569608212e-05, "loss": 0.9459, "step": 634 }, { "epoch": 0.08086982823121129, "grad_norm": 5.28572353524242, "learning_rate": 1.999434945689572e-05, "loss": 1.1071, "step": 635 }, { "epoch": 0.08099718229141793, "grad_norm": 5.390054600709794, "learning_rate": 1.9994279919034958e-05, "loss": 1.1041, "step": 636 }, { "epoch": 0.08112453635162456, "grad_norm": 6.091162812164593, "learning_rate": 1.999420995602889e-05, "loss": 1.0167, "step": 637 }, { "epoch": 0.08125189041183119, "grad_norm": 9.582331419829577, "learning_rate": 1.9994139567880492e-05, "loss": 1.1483, "step": 638 }, { "epoch": 0.08137924447203783, "grad_norm": 6.486061704103913, "learning_rate": 1.999406875459275e-05, "loss": 1.057, "step": 639 }, { "epoch": 0.08150659853224446, "grad_norm": 5.762382774236402, "learning_rate": 1.999399751616869e-05, "loss": 1.1079, "step": 640 }, { "epoch": 0.08163395259245108, "grad_norm": 5.748421825606348, "learning_rate": 1.999392585261133e-05, "loss": 1.0681, "step": 641 }, { "epoch": 0.08176130665265773, "grad_norm": 6.038734654142748, "learning_rate": 1.9993853763923724e-05, "loss": 0.9983, "step": 642 }, { "epoch": 0.08188866071286435, "grad_norm": 6.696020954623414, "learning_rate": 1.9993781250108934e-05, "loss": 0.9574, "step": 643 }, { "epoch": 0.08201601477307098, "grad_norm": 4.880728461504636, "learning_rate": 1.9993708311170055e-05, "loss": 1.0897, "step": 644 }, { "epoch": 0.08214336883327762, "grad_norm": 6.765789355953292, "learning_rate": 1.999363494711018e-05, "loss": 0.9652, "step": 645 }, { "epoch": 0.08227072289348425, "grad_norm": 4.847223623737666, "learning_rate": 1.999356115793243e-05, "loss": 1.079, "step": 646 }, { "epoch": 0.08239807695369088, "grad_norm": 5.234269143927291, "learning_rate": 1.999348694363995e-05, "loss": 1.0237, "step": 647 }, { "epoch": 0.08252543101389752, "grad_norm": 4.89183541331242, "learning_rate": 1.9993412304235898e-05, "loss": 1.0104, "step": 648 }, { "epoch": 0.08265278507410415, "grad_norm": 5.865712718277241, "learning_rate": 1.999333723972344e-05, "loss": 1.0922, "step": 649 }, { "epoch": 0.08278013913431077, "grad_norm": 5.789907869913807, "learning_rate": 1.999326175010578e-05, "loss": 1.0889, "step": 650 }, { "epoch": 0.0829074931945174, "grad_norm": 6.9597588942410065, "learning_rate": 1.9993185835386118e-05, "loss": 1.0146, "step": 651 }, { "epoch": 0.08303484725472404, "grad_norm": 5.810287218463056, "learning_rate": 1.999310949556769e-05, "loss": 1.0555, "step": 652 }, { "epoch": 0.08316220131493067, "grad_norm": 5.914241168247319, "learning_rate": 1.999303273065374e-05, "loss": 1.1138, "step": 653 }, { "epoch": 0.0832895553751373, "grad_norm": 7.40780816406947, "learning_rate": 1.9992955540647544e-05, "loss": 1.0209, "step": 654 }, { "epoch": 0.08341690943534394, "grad_norm": 5.139163287743998, "learning_rate": 1.999287792555237e-05, "loss": 1.1916, "step": 655 }, { "epoch": 0.08354426349555057, "grad_norm": 7.338151071702125, "learning_rate": 1.999279988537153e-05, "loss": 1.0876, "step": 656 }, { "epoch": 0.0836716175557572, "grad_norm": 8.346356045647864, "learning_rate": 1.9992721420108338e-05, "loss": 1.0237, "step": 657 }, { "epoch": 0.08379897161596384, "grad_norm": 5.194369994982586, "learning_rate": 1.9992642529766136e-05, "loss": 1.019, "step": 658 }, { "epoch": 0.08392632567617046, "grad_norm": 6.352857844734931, "learning_rate": 1.999256321434828e-05, "loss": 1.0938, "step": 659 }, { "epoch": 0.08405367973637709, "grad_norm": 4.991216000871772, "learning_rate": 1.9992483473858138e-05, "loss": 1.1457, "step": 660 }, { "epoch": 0.08418103379658373, "grad_norm": 5.119276246534278, "learning_rate": 1.9992403308299112e-05, "loss": 1.0455, "step": 661 }, { "epoch": 0.08430838785679036, "grad_norm": 6.314325956131299, "learning_rate": 1.9992322717674603e-05, "loss": 1.1113, "step": 662 }, { "epoch": 0.08443574191699699, "grad_norm": 7.77910023602844, "learning_rate": 1.9992241701988042e-05, "loss": 1.0262, "step": 663 }, { "epoch": 0.08456309597720363, "grad_norm": 7.1596010849287195, "learning_rate": 1.999216026124288e-05, "loss": 1.0519, "step": 664 }, { "epoch": 0.08469045003741026, "grad_norm": 4.897077210322237, "learning_rate": 1.9992078395442574e-05, "loss": 1.0238, "step": 665 }, { "epoch": 0.08481780409761688, "grad_norm": 6.509385595436648, "learning_rate": 1.9991996104590612e-05, "loss": 1.0995, "step": 666 }, { "epoch": 0.08494515815782352, "grad_norm": 6.02964231578448, "learning_rate": 1.999191338869049e-05, "loss": 1.0622, "step": 667 }, { "epoch": 0.08507251221803015, "grad_norm": 6.841055561733911, "learning_rate": 1.9991830247745732e-05, "loss": 1.089, "step": 668 }, { "epoch": 0.08519986627823678, "grad_norm": 6.576305581782257, "learning_rate": 1.999174668175987e-05, "loss": 1.0683, "step": 669 }, { "epoch": 0.08532722033844342, "grad_norm": 5.960560877989974, "learning_rate": 1.9991662690736462e-05, "loss": 1.0535, "step": 670 }, { "epoch": 0.08545457439865005, "grad_norm": 6.509239086210101, "learning_rate": 1.999157827467908e-05, "loss": 1.0301, "step": 671 }, { "epoch": 0.08558192845885668, "grad_norm": 7.124308770095535, "learning_rate": 1.9991493433591315e-05, "loss": 1.1226, "step": 672 }, { "epoch": 0.0857092825190633, "grad_norm": 5.026968704141392, "learning_rate": 1.9991408167476772e-05, "loss": 1.1574, "step": 673 }, { "epoch": 0.08583663657926995, "grad_norm": 4.301359298900442, "learning_rate": 1.9991322476339088e-05, "loss": 1.0282, "step": 674 }, { "epoch": 0.08596399063947657, "grad_norm": 11.74719529462412, "learning_rate": 1.9991236360181897e-05, "loss": 1.0841, "step": 675 }, { "epoch": 0.0860913446996832, "grad_norm": 5.71392752342204, "learning_rate": 1.999114981900887e-05, "loss": 1.128, "step": 676 }, { "epoch": 0.08621869875988984, "grad_norm": 5.9033570647377465, "learning_rate": 1.9991062852823683e-05, "loss": 1.0681, "step": 677 }, { "epoch": 0.08634605282009647, "grad_norm": 5.322196996403861, "learning_rate": 1.9990975461630043e-05, "loss": 1.0203, "step": 678 }, { "epoch": 0.0864734068803031, "grad_norm": 4.951072869448979, "learning_rate": 1.999088764543166e-05, "loss": 1.0799, "step": 679 }, { "epoch": 0.08660076094050974, "grad_norm": 5.500998839799335, "learning_rate": 1.999079940423227e-05, "loss": 1.0914, "step": 680 }, { "epoch": 0.08672811500071637, "grad_norm": 6.443149791036705, "learning_rate": 1.9990710738035632e-05, "loss": 1.212, "step": 681 }, { "epoch": 0.086855469060923, "grad_norm": 5.5992443869211295, "learning_rate": 1.999062164684551e-05, "loss": 1.1295, "step": 682 }, { "epoch": 0.08698282312112963, "grad_norm": 4.401937119017067, "learning_rate": 1.9990532130665703e-05, "loss": 1.0117, "step": 683 }, { "epoch": 0.08711017718133626, "grad_norm": 6.193840092408911, "learning_rate": 1.9990442189500016e-05, "loss": 1.0194, "step": 684 }, { "epoch": 0.08723753124154289, "grad_norm": 7.51254012466861, "learning_rate": 1.9990351823352268e-05, "loss": 0.9975, "step": 685 }, { "epoch": 0.08736488530174953, "grad_norm": 5.877886124116896, "learning_rate": 1.999026103222631e-05, "loss": 1.1254, "step": 686 }, { "epoch": 0.08749223936195616, "grad_norm": 5.472095272874463, "learning_rate": 1.9990169816126005e-05, "loss": 1.0221, "step": 687 }, { "epoch": 0.08761959342216279, "grad_norm": 4.893392833864217, "learning_rate": 1.999007817505523e-05, "loss": 1.0455, "step": 688 }, { "epoch": 0.08774694748236943, "grad_norm": 5.637380242608783, "learning_rate": 1.9989986109017882e-05, "loss": 1.02, "step": 689 }, { "epoch": 0.08787430154257606, "grad_norm": 4.8513515752131635, "learning_rate": 1.9989893618017882e-05, "loss": 1.0035, "step": 690 }, { "epoch": 0.08800165560278268, "grad_norm": 6.058231699570725, "learning_rate": 1.9989800702059158e-05, "loss": 1.0731, "step": 691 }, { "epoch": 0.08812900966298932, "grad_norm": 5.045541876584629, "learning_rate": 1.9989707361145672e-05, "loss": 0.9995, "step": 692 }, { "epoch": 0.08825636372319595, "grad_norm": 5.568628770940815, "learning_rate": 1.9989613595281384e-05, "loss": 1.0222, "step": 693 }, { "epoch": 0.08838371778340258, "grad_norm": 5.02853680543003, "learning_rate": 1.998951940447029e-05, "loss": 1.0428, "step": 694 }, { "epoch": 0.08851107184360922, "grad_norm": 4.792565214522304, "learning_rate": 1.9989424788716397e-05, "loss": 1.0065, "step": 695 }, { "epoch": 0.08863842590381585, "grad_norm": 5.291371775342626, "learning_rate": 1.9989329748023728e-05, "loss": 1.0209, "step": 696 }, { "epoch": 0.08876577996402248, "grad_norm": 6.034196045176283, "learning_rate": 1.998923428239632e-05, "loss": 1.1247, "step": 697 }, { "epoch": 0.0888931340242291, "grad_norm": 5.512739512804424, "learning_rate": 1.9989138391838243e-05, "loss": 1.0025, "step": 698 }, { "epoch": 0.08902048808443574, "grad_norm": 5.877097871265012, "learning_rate": 1.9989042076353572e-05, "loss": 1.0448, "step": 699 }, { "epoch": 0.08914784214464237, "grad_norm": 4.461166883673963, "learning_rate": 1.9988945335946407e-05, "loss": 1.0821, "step": 700 }, { "epoch": 0.089275196204849, "grad_norm": 6.024518144895393, "learning_rate": 1.9988848170620857e-05, "loss": 1.1252, "step": 701 }, { "epoch": 0.08940255026505564, "grad_norm": 5.810905319826364, "learning_rate": 1.998875058038106e-05, "loss": 1.0291, "step": 702 }, { "epoch": 0.08952990432526227, "grad_norm": 4.288768987566355, "learning_rate": 1.9988652565231167e-05, "loss": 0.951, "step": 703 }, { "epoch": 0.0896572583854689, "grad_norm": 4.429403951945096, "learning_rate": 1.9988554125175347e-05, "loss": 1.1162, "step": 704 }, { "epoch": 0.08978461244567554, "grad_norm": 5.102004214642084, "learning_rate": 1.998845526021779e-05, "loss": 1.02, "step": 705 }, { "epoch": 0.08991196650588217, "grad_norm": 4.930098381995267, "learning_rate": 1.9988355970362693e-05, "loss": 1.0211, "step": 706 }, { "epoch": 0.09003932056608879, "grad_norm": 5.773187737747096, "learning_rate": 1.9988256255614292e-05, "loss": 1.0348, "step": 707 }, { "epoch": 0.09016667462629543, "grad_norm": 5.927086944668368, "learning_rate": 1.9988156115976818e-05, "loss": 1.1057, "step": 708 }, { "epoch": 0.09029402868650206, "grad_norm": 5.025749279734705, "learning_rate": 1.998805555145454e-05, "loss": 0.9713, "step": 709 }, { "epoch": 0.09042138274670869, "grad_norm": 5.882972340229742, "learning_rate": 1.9987954562051724e-05, "loss": 1.0112, "step": 710 }, { "epoch": 0.09054873680691533, "grad_norm": 5.104455101530909, "learning_rate": 1.9987853147772677e-05, "loss": 1.1046, "step": 711 }, { "epoch": 0.09067609086712196, "grad_norm": 4.674587113206371, "learning_rate": 1.9987751308621714e-05, "loss": 1.0097, "step": 712 }, { "epoch": 0.09080344492732859, "grad_norm": 4.381587273734659, "learning_rate": 1.9987649044603158e-05, "loss": 1.0142, "step": 713 }, { "epoch": 0.09093079898753523, "grad_norm": 9.46538823947152, "learning_rate": 1.9987546355721363e-05, "loss": 1.095, "step": 714 }, { "epoch": 0.09105815304774185, "grad_norm": 4.511638425962468, "learning_rate": 1.9987443241980696e-05, "loss": 1.1166, "step": 715 }, { "epoch": 0.09118550710794848, "grad_norm": 5.012728511329524, "learning_rate": 1.9987339703385552e-05, "loss": 1.1406, "step": 716 }, { "epoch": 0.09131286116815512, "grad_norm": 6.335841769660585, "learning_rate": 1.9987235739940325e-05, "loss": 1.1545, "step": 717 }, { "epoch": 0.09144021522836175, "grad_norm": 4.924962410565526, "learning_rate": 1.9987131351649437e-05, "loss": 1.1317, "step": 718 }, { "epoch": 0.09156756928856838, "grad_norm": 6.1544385074745165, "learning_rate": 1.998702653851734e-05, "loss": 1.1361, "step": 719 }, { "epoch": 0.091694923348775, "grad_norm": 6.154575178172681, "learning_rate": 1.998692130054848e-05, "loss": 0.9903, "step": 720 }, { "epoch": 0.09182227740898165, "grad_norm": 6.154952943747507, "learning_rate": 1.998681563774734e-05, "loss": 1.0783, "step": 721 }, { "epoch": 0.09194963146918828, "grad_norm": 4.838842453109084, "learning_rate": 1.9986709550118417e-05, "loss": 1.0136, "step": 722 }, { "epoch": 0.0920769855293949, "grad_norm": 5.410325823718829, "learning_rate": 1.9986603037666218e-05, "loss": 1.0734, "step": 723 }, { "epoch": 0.09220433958960154, "grad_norm": 7.933672605467293, "learning_rate": 1.9986496100395276e-05, "loss": 1.0677, "step": 724 }, { "epoch": 0.09233169364980817, "grad_norm": 9.510319375372314, "learning_rate": 1.9986388738310142e-05, "loss": 1.0391, "step": 725 }, { "epoch": 0.0924590477100148, "grad_norm": 5.218726609229018, "learning_rate": 1.998628095141538e-05, "loss": 1.0036, "step": 726 }, { "epoch": 0.09258640177022144, "grad_norm": 6.188449979354496, "learning_rate": 1.998617273971558e-05, "loss": 1.0897, "step": 727 }, { "epoch": 0.09271375583042807, "grad_norm": 5.214680072630314, "learning_rate": 1.998606410321534e-05, "loss": 1.0975, "step": 728 }, { "epoch": 0.0928411098906347, "grad_norm": 5.1103404588248855, "learning_rate": 1.998595504191928e-05, "loss": 1.115, "step": 729 }, { "epoch": 0.09296846395084134, "grad_norm": 7.251754548435966, "learning_rate": 1.9985845555832047e-05, "loss": 0.9956, "step": 730 }, { "epoch": 0.09309581801104796, "grad_norm": 6.024552653946663, "learning_rate": 1.9985735644958292e-05, "loss": 1.0765, "step": 731 }, { "epoch": 0.09322317207125459, "grad_norm": 4.944129259919682, "learning_rate": 1.9985625309302692e-05, "loss": 1.1096, "step": 732 }, { "epoch": 0.09335052613146123, "grad_norm": 4.15991324847806, "learning_rate": 1.9985514548869942e-05, "loss": 0.9459, "step": 733 }, { "epoch": 0.09347788019166786, "grad_norm": 3.7197710446687253, "learning_rate": 1.998540336366475e-05, "loss": 0.9994, "step": 734 }, { "epoch": 0.09360523425187449, "grad_norm": 6.06962528536452, "learning_rate": 1.9985291753691853e-05, "loss": 0.9692, "step": 735 }, { "epoch": 0.09373258831208113, "grad_norm": 4.293669347546974, "learning_rate": 1.998517971895599e-05, "loss": 1.163, "step": 736 }, { "epoch": 0.09385994237228776, "grad_norm": 4.771429610435823, "learning_rate": 1.9985067259461936e-05, "loss": 1.0687, "step": 737 }, { "epoch": 0.09398729643249439, "grad_norm": 5.6708848998084065, "learning_rate": 1.9984954375214464e-05, "loss": 0.9521, "step": 738 }, { "epoch": 0.09411465049270103, "grad_norm": 7.222470864474539, "learning_rate": 1.9984841066218387e-05, "loss": 1.0992, "step": 739 }, { "epoch": 0.09424200455290765, "grad_norm": 5.365241129523414, "learning_rate": 1.9984727332478517e-05, "loss": 1.1041, "step": 740 }, { "epoch": 0.09436935861311428, "grad_norm": 6.039438648032581, "learning_rate": 1.9984613173999694e-05, "loss": 1.0716, "step": 741 }, { "epoch": 0.09449671267332091, "grad_norm": 4.452459345808136, "learning_rate": 1.9984498590786778e-05, "loss": 1.0688, "step": 742 }, { "epoch": 0.09462406673352755, "grad_norm": 4.496510318180523, "learning_rate": 1.9984383582844636e-05, "loss": 1.1061, "step": 743 }, { "epoch": 0.09475142079373418, "grad_norm": 5.133958012277289, "learning_rate": 1.998426815017817e-05, "loss": 1.0172, "step": 744 }, { "epoch": 0.0948787748539408, "grad_norm": 6.401996670637185, "learning_rate": 1.998415229279228e-05, "loss": 1.0715, "step": 745 }, { "epoch": 0.09500612891414745, "grad_norm": 5.80380879011085, "learning_rate": 1.99840360106919e-05, "loss": 0.9658, "step": 746 }, { "epoch": 0.09513348297435407, "grad_norm": 5.835257004125801, "learning_rate": 1.998391930388198e-05, "loss": 1.1158, "step": 747 }, { "epoch": 0.0952608370345607, "grad_norm": 5.1207374896936955, "learning_rate": 1.9983802172367477e-05, "loss": 1.0881, "step": 748 }, { "epoch": 0.09538819109476734, "grad_norm": 11.487577279944423, "learning_rate": 1.9983684616153378e-05, "loss": 1.1162, "step": 749 }, { "epoch": 0.09551554515497397, "grad_norm": 5.542628422317417, "learning_rate": 1.998356663524468e-05, "loss": 1.1225, "step": 750 }, { "epoch": 0.0956428992151806, "grad_norm": 6.967952118088262, "learning_rate": 1.998344822964641e-05, "loss": 1.1577, "step": 751 }, { "epoch": 0.09577025327538724, "grad_norm": 5.47302950580714, "learning_rate": 1.99833293993636e-05, "loss": 1.0219, "step": 752 }, { "epoch": 0.09589760733559387, "grad_norm": 5.362081572740628, "learning_rate": 1.99832101444013e-05, "loss": 1.0726, "step": 753 }, { "epoch": 0.0960249613958005, "grad_norm": 6.812421045808244, "learning_rate": 1.9983090464764587e-05, "loss": 1.065, "step": 754 }, { "epoch": 0.09615231545600714, "grad_norm": 8.852579514610417, "learning_rate": 1.9982970360458557e-05, "loss": 1.1051, "step": 755 }, { "epoch": 0.09627966951621376, "grad_norm": 6.265131524266463, "learning_rate": 1.998284983148831e-05, "loss": 1.0406, "step": 756 }, { "epoch": 0.09640702357642039, "grad_norm": 10.166042829874742, "learning_rate": 1.9982728877858978e-05, "loss": 1.0842, "step": 757 }, { "epoch": 0.09653437763662703, "grad_norm": 5.6845487546028695, "learning_rate": 1.998260749957571e-05, "loss": 1.0445, "step": 758 }, { "epoch": 0.09666173169683366, "grad_norm": 4.165910779417907, "learning_rate": 1.9982485696643663e-05, "loss": 0.9373, "step": 759 }, { "epoch": 0.09678908575704029, "grad_norm": 4.396198685395114, "learning_rate": 1.998236346906802e-05, "loss": 1.0627, "step": 760 }, { "epoch": 0.09691643981724693, "grad_norm": 6.08568614592465, "learning_rate": 1.9982240816853983e-05, "loss": 1.0471, "step": 761 }, { "epoch": 0.09704379387745356, "grad_norm": 5.1671257225291, "learning_rate": 1.9982117740006763e-05, "loss": 1.1163, "step": 762 }, { "epoch": 0.09717114793766018, "grad_norm": 5.916493408818419, "learning_rate": 1.9981994238531603e-05, "loss": 1.1089, "step": 763 }, { "epoch": 0.09729850199786683, "grad_norm": 4.560569041032856, "learning_rate": 1.9981870312433755e-05, "loss": 0.9946, "step": 764 }, { "epoch": 0.09742585605807345, "grad_norm": 4.936586319610627, "learning_rate": 1.9981745961718486e-05, "loss": 1.1241, "step": 765 }, { "epoch": 0.09755321011828008, "grad_norm": 5.56261932587953, "learning_rate": 1.998162118639109e-05, "loss": 1.1137, "step": 766 }, { "epoch": 0.09768056417848671, "grad_norm": 4.700296051765587, "learning_rate": 1.9981495986456876e-05, "loss": 0.9961, "step": 767 }, { "epoch": 0.09780791823869335, "grad_norm": 5.784826543609449, "learning_rate": 1.9981370361921165e-05, "loss": 1.0256, "step": 768 }, { "epoch": 0.09793527229889998, "grad_norm": 5.929088537148662, "learning_rate": 1.9981244312789306e-05, "loss": 0.9581, "step": 769 }, { "epoch": 0.0980626263591066, "grad_norm": 5.179139535494771, "learning_rate": 1.9981117839066657e-05, "loss": 0.9951, "step": 770 }, { "epoch": 0.09818998041931325, "grad_norm": 6.371153924167855, "learning_rate": 1.99809909407586e-05, "loss": 1.0445, "step": 771 }, { "epoch": 0.09831733447951987, "grad_norm": 7.152752283867728, "learning_rate": 1.998086361787053e-05, "loss": 1.1133, "step": 772 }, { "epoch": 0.0984446885397265, "grad_norm": 5.0226457841069605, "learning_rate": 1.998073587040787e-05, "loss": 1.086, "step": 773 }, { "epoch": 0.09857204259993314, "grad_norm": 5.248569293580547, "learning_rate": 1.998060769837605e-05, "loss": 1.0538, "step": 774 }, { "epoch": 0.09869939666013977, "grad_norm": 6.354143319842696, "learning_rate": 1.9980479101780525e-05, "loss": 1.0211, "step": 775 }, { "epoch": 0.0988267507203464, "grad_norm": 4.431387976904693, "learning_rate": 1.9980350080626755e-05, "loss": 0.9861, "step": 776 }, { "epoch": 0.09895410478055304, "grad_norm": 6.274642885540535, "learning_rate": 1.998022063492024e-05, "loss": 1.0126, "step": 777 }, { "epoch": 0.09908145884075967, "grad_norm": 4.3373684558105, "learning_rate": 1.9980090764666486e-05, "loss": 1.0245, "step": 778 }, { "epoch": 0.0992088129009663, "grad_norm": 5.5406400642685085, "learning_rate": 1.997996046987101e-05, "loss": 1.0031, "step": 779 }, { "epoch": 0.09933616696117294, "grad_norm": 6.194165887352312, "learning_rate": 1.997982975053936e-05, "loss": 1.0883, "step": 780 }, { "epoch": 0.09946352102137956, "grad_norm": 5.211934830651193, "learning_rate": 1.9979698606677095e-05, "loss": 1.1272, "step": 781 }, { "epoch": 0.09959087508158619, "grad_norm": 7.170192413089251, "learning_rate": 1.9979567038289796e-05, "loss": 1.0528, "step": 782 }, { "epoch": 0.09971822914179283, "grad_norm": 6.254459468621482, "learning_rate": 1.997943504538306e-05, "loss": 0.9662, "step": 783 }, { "epoch": 0.09984558320199946, "grad_norm": 6.957908510804562, "learning_rate": 1.9979302627962494e-05, "loss": 1.0736, "step": 784 }, { "epoch": 0.09997293726220609, "grad_norm": 6.320699069316108, "learning_rate": 1.997916978603374e-05, "loss": 1.0725, "step": 785 }, { "epoch": 0.10010029132241273, "grad_norm": 5.789714163937426, "learning_rate": 1.9979036519602446e-05, "loss": 1.0178, "step": 786 }, { "epoch": 0.10022764538261936, "grad_norm": 6.830007751817898, "learning_rate": 1.997890282867428e-05, "loss": 1.0137, "step": 787 }, { "epoch": 0.10035499944282598, "grad_norm": 4.903635482520552, "learning_rate": 1.9978768713254927e-05, "loss": 1.0192, "step": 788 }, { "epoch": 0.10048235350303261, "grad_norm": 18.90650180617438, "learning_rate": 1.99786341733501e-05, "loss": 0.9982, "step": 789 }, { "epoch": 0.10060970756323925, "grad_norm": 4.963165407706841, "learning_rate": 1.997849920896551e-05, "loss": 0.9878, "step": 790 }, { "epoch": 0.10073706162344588, "grad_norm": 4.880410542025724, "learning_rate": 1.9978363820106912e-05, "loss": 1.0394, "step": 791 }, { "epoch": 0.10086441568365251, "grad_norm": 5.359966990525815, "learning_rate": 1.9978228006780056e-05, "loss": 1.0565, "step": 792 }, { "epoch": 0.10099176974385915, "grad_norm": 5.27348234934632, "learning_rate": 1.9978091768990723e-05, "loss": 1.1053, "step": 793 }, { "epoch": 0.10111912380406578, "grad_norm": 6.207040468644822, "learning_rate": 1.9977955106744706e-05, "loss": 1.047, "step": 794 }, { "epoch": 0.1012464778642724, "grad_norm": 5.75935564622613, "learning_rate": 1.9977818020047816e-05, "loss": 0.9662, "step": 795 }, { "epoch": 0.10137383192447905, "grad_norm": 3.5010736725984377, "learning_rate": 1.9977680508905894e-05, "loss": 1.0921, "step": 796 }, { "epoch": 0.10150118598468567, "grad_norm": 7.0157745277425425, "learning_rate": 1.9977542573324782e-05, "loss": 0.9324, "step": 797 }, { "epoch": 0.1016285400448923, "grad_norm": 4.361273239480551, "learning_rate": 1.9977404213310347e-05, "loss": 1.0017, "step": 798 }, { "epoch": 0.10175589410509894, "grad_norm": 5.415330262261502, "learning_rate": 1.997726542886848e-05, "loss": 0.997, "step": 799 }, { "epoch": 0.10188324816530557, "grad_norm": 6.053517402057396, "learning_rate": 1.9977126220005082e-05, "loss": 1.0096, "step": 800 }, { "epoch": 0.1020106022255122, "grad_norm": 5.6546141224148165, "learning_rate": 1.9976986586726072e-05, "loss": 1.1677, "step": 801 }, { "epoch": 0.10213795628571884, "grad_norm": 4.5596153860015125, "learning_rate": 1.9976846529037396e-05, "loss": 1.039, "step": 802 }, { "epoch": 0.10226531034592547, "grad_norm": 6.434913485996966, "learning_rate": 1.9976706046945003e-05, "loss": 1.032, "step": 803 }, { "epoch": 0.1023926644061321, "grad_norm": 6.885333254525804, "learning_rate": 1.9976565140454877e-05, "loss": 1.0179, "step": 804 }, { "epoch": 0.10252001846633874, "grad_norm": 5.634192690488991, "learning_rate": 1.9976423809573014e-05, "loss": 1.007, "step": 805 }, { "epoch": 0.10264737252654536, "grad_norm": 5.028451840415393, "learning_rate": 1.9976282054305415e-05, "loss": 1.0552, "step": 806 }, { "epoch": 0.10277472658675199, "grad_norm": 5.18185209266238, "learning_rate": 1.9976139874658117e-05, "loss": 0.985, "step": 807 }, { "epoch": 0.10290208064695863, "grad_norm": 5.351761194703276, "learning_rate": 1.9975997270637172e-05, "loss": 0.977, "step": 808 }, { "epoch": 0.10302943470716526, "grad_norm": 5.0199059016310725, "learning_rate": 1.997585424224864e-05, "loss": 1.0321, "step": 809 }, { "epoch": 0.10315678876737189, "grad_norm": 7.4969247235155425, "learning_rate": 1.9975710789498603e-05, "loss": 1.1377, "step": 810 }, { "epoch": 0.10328414282757852, "grad_norm": 5.024824126093265, "learning_rate": 1.9975566912393173e-05, "loss": 1.0533, "step": 811 }, { "epoch": 0.10341149688778516, "grad_norm": 9.205641086845837, "learning_rate": 1.9975422610938463e-05, "loss": 1.0876, "step": 812 }, { "epoch": 0.10353885094799178, "grad_norm": 4.248778448141489, "learning_rate": 1.9975277885140613e-05, "loss": 0.9902, "step": 813 }, { "epoch": 0.10366620500819841, "grad_norm": 5.970955805599458, "learning_rate": 1.9975132735005778e-05, "loss": 1.0198, "step": 814 }, { "epoch": 0.10379355906840505, "grad_norm": 5.154292249370319, "learning_rate": 1.9974987160540132e-05, "loss": 1.0242, "step": 815 }, { "epoch": 0.10392091312861168, "grad_norm": 5.2176426609278055, "learning_rate": 1.9974841161749875e-05, "loss": 1.1066, "step": 816 }, { "epoch": 0.10404826718881831, "grad_norm": 5.4636206413600235, "learning_rate": 1.997469473864121e-05, "loss": 1.1187, "step": 817 }, { "epoch": 0.10417562124902495, "grad_norm": 5.683022885312907, "learning_rate": 1.997454789122037e-05, "loss": 1.1843, "step": 818 }, { "epoch": 0.10430297530923158, "grad_norm": 5.910681621146795, "learning_rate": 1.9974400619493595e-05, "loss": 1.0454, "step": 819 }, { "epoch": 0.1044303293694382, "grad_norm": 5.36205187083293, "learning_rate": 1.9974252923467162e-05, "loss": 0.9426, "step": 820 }, { "epoch": 0.10455768342964485, "grad_norm": 5.648169546775838, "learning_rate": 1.9974104803147344e-05, "loss": 1.0387, "step": 821 }, { "epoch": 0.10468503748985147, "grad_norm": 4.938694869698999, "learning_rate": 1.9973956258540438e-05, "loss": 1.0432, "step": 822 }, { "epoch": 0.1048123915500581, "grad_norm": 6.627397211487185, "learning_rate": 1.9973807289652777e-05, "loss": 1.1652, "step": 823 }, { "epoch": 0.10493974561026474, "grad_norm": 6.178454019201148, "learning_rate": 1.9973657896490687e-05, "loss": 1.0397, "step": 824 }, { "epoch": 0.10506709967047137, "grad_norm": 6.174888628368443, "learning_rate": 1.9973508079060524e-05, "loss": 0.978, "step": 825 }, { "epoch": 0.105194453730678, "grad_norm": 6.708740158855683, "learning_rate": 1.9973357837368664e-05, "loss": 0.9996, "step": 826 }, { "epoch": 0.10532180779088464, "grad_norm": 7.273286451170561, "learning_rate": 1.99732071714215e-05, "loss": 0.9919, "step": 827 }, { "epoch": 0.10544916185109127, "grad_norm": 4.836429305944261, "learning_rate": 1.997305608122544e-05, "loss": 0.9303, "step": 828 }, { "epoch": 0.1055765159112979, "grad_norm": 5.203196115551915, "learning_rate": 1.9972904566786903e-05, "loss": 0.9287, "step": 829 }, { "epoch": 0.10570386997150454, "grad_norm": 5.355710926325952, "learning_rate": 1.9972752628112344e-05, "loss": 1.0132, "step": 830 }, { "epoch": 0.10583122403171116, "grad_norm": 5.978964807733789, "learning_rate": 1.9972600265208224e-05, "loss": 1.0344, "step": 831 }, { "epoch": 0.10595857809191779, "grad_norm": 4.315744257777001, "learning_rate": 1.9972447478081026e-05, "loss": 1.1191, "step": 832 }, { "epoch": 0.10608593215212442, "grad_norm": 5.5114553971235605, "learning_rate": 1.9972294266737243e-05, "loss": 1.0505, "step": 833 }, { "epoch": 0.10621328621233106, "grad_norm": 7.059071476639452, "learning_rate": 1.9972140631183396e-05, "loss": 1.2202, "step": 834 }, { "epoch": 0.10634064027253769, "grad_norm": 4.830647719965635, "learning_rate": 1.9971986571426024e-05, "loss": 1.0377, "step": 835 }, { "epoch": 0.10646799433274431, "grad_norm": 5.099515121448014, "learning_rate": 1.9971832087471678e-05, "loss": 1.0378, "step": 836 }, { "epoch": 0.10659534839295096, "grad_norm": 5.874044137666145, "learning_rate": 1.9971677179326925e-05, "loss": 1.0322, "step": 837 }, { "epoch": 0.10672270245315758, "grad_norm": 6.157219050476186, "learning_rate": 1.9971521846998362e-05, "loss": 1.0744, "step": 838 }, { "epoch": 0.10685005651336421, "grad_norm": 4.74324840763616, "learning_rate": 1.997136609049259e-05, "loss": 0.9892, "step": 839 }, { "epoch": 0.10697741057357085, "grad_norm": 3.737316042860656, "learning_rate": 1.997120990981624e-05, "loss": 1.082, "step": 840 }, { "epoch": 0.10710476463377748, "grad_norm": 5.970295491277951, "learning_rate": 1.9971053304975955e-05, "loss": 1.0749, "step": 841 }, { "epoch": 0.10723211869398411, "grad_norm": 8.05229262408578, "learning_rate": 1.9970896275978392e-05, "loss": 1.0926, "step": 842 }, { "epoch": 0.10735947275419075, "grad_norm": 4.885471541605568, "learning_rate": 1.9970738822830237e-05, "loss": 0.9979, "step": 843 }, { "epoch": 0.10748682681439738, "grad_norm": 6.75132907922118, "learning_rate": 1.9970580945538187e-05, "loss": 1.0148, "step": 844 }, { "epoch": 0.107614180874604, "grad_norm": 5.0097069928342846, "learning_rate": 1.997042264410895e-05, "loss": 1.1069, "step": 845 }, { "epoch": 0.10774153493481065, "grad_norm": 5.598578899449339, "learning_rate": 1.9970263918549274e-05, "loss": 1.0181, "step": 846 }, { "epoch": 0.10786888899501727, "grad_norm": 5.37790388575956, "learning_rate": 1.9970104768865895e-05, "loss": 1.0247, "step": 847 }, { "epoch": 0.1079962430552239, "grad_norm": 6.595977551571855, "learning_rate": 1.9969945195065596e-05, "loss": 1.0204, "step": 848 }, { "epoch": 0.10812359711543054, "grad_norm": 7.964236907516185, "learning_rate": 1.996978519715516e-05, "loss": 0.9665, "step": 849 }, { "epoch": 0.10825095117563717, "grad_norm": 6.392637201835179, "learning_rate": 1.9969624775141393e-05, "loss": 1.0868, "step": 850 }, { "epoch": 0.1083783052358438, "grad_norm": 5.587715175504954, "learning_rate": 1.9969463929031118e-05, "loss": 0.9563, "step": 851 }, { "epoch": 0.10850565929605044, "grad_norm": 4.658229330377023, "learning_rate": 1.9969302658831182e-05, "loss": 1.0354, "step": 852 }, { "epoch": 0.10863301335625707, "grad_norm": 4.787778299474807, "learning_rate": 1.996914096454844e-05, "loss": 0.9508, "step": 853 }, { "epoch": 0.1087603674164637, "grad_norm": 4.867284685024919, "learning_rate": 1.996897884618977e-05, "loss": 1.0619, "step": 854 }, { "epoch": 0.10888772147667033, "grad_norm": 5.147589421211179, "learning_rate": 1.9968816303762076e-05, "loss": 0.9208, "step": 855 }, { "epoch": 0.10901507553687696, "grad_norm": 6.599867455278636, "learning_rate": 1.9968653337272262e-05, "loss": 0.9198, "step": 856 }, { "epoch": 0.10914242959708359, "grad_norm": 7.3878300162248465, "learning_rate": 1.9968489946727265e-05, "loss": 1.0832, "step": 857 }, { "epoch": 0.10926978365729022, "grad_norm": 5.149081153750179, "learning_rate": 1.996832613213404e-05, "loss": 1.0392, "step": 858 }, { "epoch": 0.10939713771749686, "grad_norm": 7.385644504779292, "learning_rate": 1.9968161893499548e-05, "loss": 1.0305, "step": 859 }, { "epoch": 0.10952449177770349, "grad_norm": 6.126046850311139, "learning_rate": 1.9967997230830784e-05, "loss": 1.0887, "step": 860 }, { "epoch": 0.10965184583791011, "grad_norm": 7.1861931981170715, "learning_rate": 1.996783214413474e-05, "loss": 1.109, "step": 861 }, { "epoch": 0.10977919989811676, "grad_norm": 6.273300682466664, "learning_rate": 1.9967666633418454e-05, "loss": 1.0648, "step": 862 }, { "epoch": 0.10990655395832338, "grad_norm": 6.694825340539664, "learning_rate": 1.996750069868895e-05, "loss": 1.1229, "step": 863 }, { "epoch": 0.11003390801853001, "grad_norm": 4.287595849036246, "learning_rate": 1.9967334339953303e-05, "loss": 0.9974, "step": 864 }, { "epoch": 0.11016126207873665, "grad_norm": 5.7722573112928215, "learning_rate": 1.996716755721858e-05, "loss": 1.0254, "step": 865 }, { "epoch": 0.11028861613894328, "grad_norm": 4.54564301360154, "learning_rate": 1.9967000350491873e-05, "loss": 0.947, "step": 866 }, { "epoch": 0.11041597019914991, "grad_norm": 4.937993278489573, "learning_rate": 1.9966832719780305e-05, "loss": 1.105, "step": 867 }, { "epoch": 0.11054332425935655, "grad_norm": 5.060781116430827, "learning_rate": 1.9966664665090997e-05, "loss": 1.111, "step": 868 }, { "epoch": 0.11067067831956318, "grad_norm": 6.785440723383181, "learning_rate": 1.9966496186431106e-05, "loss": 1.0737, "step": 869 }, { "epoch": 0.1107980323797698, "grad_norm": 6.19654754989308, "learning_rate": 1.9966327283807794e-05, "loss": 1.0739, "step": 870 }, { "epoch": 0.11092538643997644, "grad_norm": 6.420826475951312, "learning_rate": 1.996615795722825e-05, "loss": 1.0866, "step": 871 }, { "epoch": 0.11105274050018307, "grad_norm": 5.052261359889531, "learning_rate": 1.996598820669967e-05, "loss": 1.0155, "step": 872 }, { "epoch": 0.1111800945603897, "grad_norm": 4.816232002119726, "learning_rate": 1.996581803222928e-05, "loss": 1.0649, "step": 873 }, { "epoch": 0.11130744862059634, "grad_norm": 4.5369893524939595, "learning_rate": 1.9965647433824315e-05, "loss": 0.9625, "step": 874 }, { "epoch": 0.11143480268080297, "grad_norm": 5.307753652943214, "learning_rate": 1.9965476411492043e-05, "loss": 0.9882, "step": 875 }, { "epoch": 0.1115621567410096, "grad_norm": 6.092648763675741, "learning_rate": 1.9965304965239723e-05, "loss": 0.8954, "step": 876 }, { "epoch": 0.11168951080121624, "grad_norm": 6.984597836210311, "learning_rate": 1.996513309507466e-05, "loss": 1.0302, "step": 877 }, { "epoch": 0.11181686486142287, "grad_norm": 5.913009731264718, "learning_rate": 1.9964960801004164e-05, "loss": 1.082, "step": 878 }, { "epoch": 0.11194421892162949, "grad_norm": 4.500339071427008, "learning_rate": 1.9964788083035554e-05, "loss": 1.0465, "step": 879 }, { "epoch": 0.11207157298183612, "grad_norm": 6.428641140272211, "learning_rate": 1.9964614941176194e-05, "loss": 1.054, "step": 880 }, { "epoch": 0.11219892704204276, "grad_norm": 5.458770889720949, "learning_rate": 1.9964441375433436e-05, "loss": 0.9883, "step": 881 }, { "epoch": 0.11232628110224939, "grad_norm": 5.406102525761324, "learning_rate": 1.996426738581467e-05, "loss": 0.9914, "step": 882 }, { "epoch": 0.11245363516245602, "grad_norm": 5.6600517858286175, "learning_rate": 1.996409297232729e-05, "loss": 0.9404, "step": 883 }, { "epoch": 0.11258098922266266, "grad_norm": 6.302272858935621, "learning_rate": 1.996391813497872e-05, "loss": 0.9588, "step": 884 }, { "epoch": 0.11270834328286929, "grad_norm": 6.660119593007626, "learning_rate": 1.99637428737764e-05, "loss": 1.0316, "step": 885 }, { "epoch": 0.11283569734307591, "grad_norm": 5.668058934930348, "learning_rate": 1.9963567188727783e-05, "loss": 1.0363, "step": 886 }, { "epoch": 0.11296305140328256, "grad_norm": 4.359857566676804, "learning_rate": 1.9963391079840344e-05, "loss": 1.0509, "step": 887 }, { "epoch": 0.11309040546348918, "grad_norm": 5.11794036685001, "learning_rate": 1.9963214547121573e-05, "loss": 1.0572, "step": 888 }, { "epoch": 0.11321775952369581, "grad_norm": 8.069355260721297, "learning_rate": 1.9963037590578977e-05, "loss": 1.0512, "step": 889 }, { "epoch": 0.11334511358390245, "grad_norm": 5.085180082630144, "learning_rate": 1.9962860210220085e-05, "loss": 0.9946, "step": 890 }, { "epoch": 0.11347246764410908, "grad_norm": 6.3451816089753414, "learning_rate": 1.9962682406052445e-05, "loss": 1.0836, "step": 891 }, { "epoch": 0.1135998217043157, "grad_norm": 4.30454150054419, "learning_rate": 1.996250417808362e-05, "loss": 1.0612, "step": 892 }, { "epoch": 0.11372717576452235, "grad_norm": 6.121196889586544, "learning_rate": 1.996232552632119e-05, "loss": 1.1126, "step": 893 }, { "epoch": 0.11385452982472898, "grad_norm": 5.352515632788303, "learning_rate": 1.9962146450772756e-05, "loss": 1.0238, "step": 894 }, { "epoch": 0.1139818838849356, "grad_norm": 6.644230409121299, "learning_rate": 1.9961966951445933e-05, "loss": 1.008, "step": 895 }, { "epoch": 0.11410923794514224, "grad_norm": 5.595505082119347, "learning_rate": 1.9961787028348357e-05, "loss": 1.085, "step": 896 }, { "epoch": 0.11423659200534887, "grad_norm": 5.556965678651519, "learning_rate": 1.9961606681487685e-05, "loss": 1.0309, "step": 897 }, { "epoch": 0.1143639460655555, "grad_norm": 4.581100382319801, "learning_rate": 1.996142591087159e-05, "loss": 1.0942, "step": 898 }, { "epoch": 0.11449130012576214, "grad_norm": 5.835283153368007, "learning_rate": 1.9961244716507757e-05, "loss": 1.1027, "step": 899 }, { "epoch": 0.11461865418596877, "grad_norm": 5.063142900500023, "learning_rate": 1.9961063098403897e-05, "loss": 0.9936, "step": 900 }, { "epoch": 0.1147460082461754, "grad_norm": 4.51443898818577, "learning_rate": 1.9960881056567732e-05, "loss": 0.9346, "step": 901 }, { "epoch": 0.11487336230638202, "grad_norm": 5.329502856059749, "learning_rate": 1.9960698591007008e-05, "loss": 1.0803, "step": 902 }, { "epoch": 0.11500071636658867, "grad_norm": 4.504037740993079, "learning_rate": 1.996051570172949e-05, "loss": 0.9776, "step": 903 }, { "epoch": 0.11512807042679529, "grad_norm": 6.117784882628449, "learning_rate": 1.9960332388742952e-05, "loss": 1.0179, "step": 904 }, { "epoch": 0.11525542448700192, "grad_norm": 5.357360995256535, "learning_rate": 1.99601486520552e-05, "loss": 1.1039, "step": 905 }, { "epoch": 0.11538277854720856, "grad_norm": 10.642215559466937, "learning_rate": 1.995996449167404e-05, "loss": 0.9746, "step": 906 }, { "epoch": 0.11551013260741519, "grad_norm": 5.78019713016134, "learning_rate": 1.9959779907607317e-05, "loss": 0.9421, "step": 907 }, { "epoch": 0.11563748666762182, "grad_norm": 6.034713778278319, "learning_rate": 1.995959489986287e-05, "loss": 1.1411, "step": 908 }, { "epoch": 0.11576484072782846, "grad_norm": 5.582388650865901, "learning_rate": 1.9959409468448582e-05, "loss": 1.0087, "step": 909 }, { "epoch": 0.11589219478803509, "grad_norm": 5.796074084454103, "learning_rate": 1.995922361337233e-05, "loss": 1.1047, "step": 910 }, { "epoch": 0.11601954884824171, "grad_norm": 6.864811231180803, "learning_rate": 1.9959037334642027e-05, "loss": 1.0263, "step": 911 }, { "epoch": 0.11614690290844835, "grad_norm": 5.806496093265156, "learning_rate": 1.9958850632265595e-05, "loss": 0.9675, "step": 912 }, { "epoch": 0.11627425696865498, "grad_norm": 4.753198851564046, "learning_rate": 1.995866350625098e-05, "loss": 0.9769, "step": 913 }, { "epoch": 0.11640161102886161, "grad_norm": 5.277047375560149, "learning_rate": 1.9958475956606133e-05, "loss": 1.0622, "step": 914 }, { "epoch": 0.11652896508906825, "grad_norm": 4.805170397798593, "learning_rate": 1.995828798333904e-05, "loss": 1.0204, "step": 915 }, { "epoch": 0.11665631914927488, "grad_norm": 3.8686471361527017, "learning_rate": 1.9958099586457696e-05, "loss": 1.0807, "step": 916 }, { "epoch": 0.1167836732094815, "grad_norm": 5.415077015337016, "learning_rate": 1.9957910765970114e-05, "loss": 1.0983, "step": 917 }, { "epoch": 0.11691102726968815, "grad_norm": 6.980435687788938, "learning_rate": 1.9957721521884322e-05, "loss": 1.1048, "step": 918 }, { "epoch": 0.11703838132989478, "grad_norm": 4.371332647836583, "learning_rate": 1.9957531854208378e-05, "loss": 1.0857, "step": 919 }, { "epoch": 0.1171657353901014, "grad_norm": 5.134828586707463, "learning_rate": 1.9957341762950346e-05, "loss": 1.0729, "step": 920 }, { "epoch": 0.11729308945030804, "grad_norm": 5.138492594150226, "learning_rate": 1.9957151248118314e-05, "loss": 1.131, "step": 921 }, { "epoch": 0.11742044351051467, "grad_norm": 4.016436424357209, "learning_rate": 1.9956960309720385e-05, "loss": 1.0131, "step": 922 }, { "epoch": 0.1175477975707213, "grad_norm": 5.120844016543802, "learning_rate": 1.995676894776468e-05, "loss": 1.0525, "step": 923 }, { "epoch": 0.11767515163092793, "grad_norm": 6.446405565457038, "learning_rate": 1.995657716225934e-05, "loss": 1.0622, "step": 924 }, { "epoch": 0.11780250569113457, "grad_norm": 5.7319024599644814, "learning_rate": 1.9956384953212526e-05, "loss": 1.0326, "step": 925 }, { "epoch": 0.1179298597513412, "grad_norm": 5.503297337568938, "learning_rate": 1.995619232063241e-05, "loss": 0.9426, "step": 926 }, { "epoch": 0.11805721381154782, "grad_norm": 5.387966927033936, "learning_rate": 1.9955999264527194e-05, "loss": 1.0234, "step": 927 }, { "epoch": 0.11818456787175446, "grad_norm": 4.988396723164246, "learning_rate": 1.9955805784905083e-05, "loss": 1.0124, "step": 928 }, { "epoch": 0.11831192193196109, "grad_norm": 5.309249422003908, "learning_rate": 1.9955611881774308e-05, "loss": 1.1048, "step": 929 }, { "epoch": 0.11843927599216772, "grad_norm": 5.357774679073277, "learning_rate": 1.995541755514312e-05, "loss": 0.9624, "step": 930 }, { "epoch": 0.11856663005237436, "grad_norm": 6.42531343281874, "learning_rate": 1.9955222805019786e-05, "loss": 1.0409, "step": 931 }, { "epoch": 0.11869398411258099, "grad_norm": 5.131309347239302, "learning_rate": 1.9955027631412584e-05, "loss": 1.1056, "step": 932 }, { "epoch": 0.11882133817278762, "grad_norm": 4.4340370527963895, "learning_rate": 1.9954832034329827e-05, "loss": 0.9607, "step": 933 }, { "epoch": 0.11894869223299426, "grad_norm": 5.347056475736151, "learning_rate": 1.9954636013779826e-05, "loss": 1.0851, "step": 934 }, { "epoch": 0.11907604629320089, "grad_norm": 6.211339474511673, "learning_rate": 1.995443956977093e-05, "loss": 1.0095, "step": 935 }, { "epoch": 0.11920340035340751, "grad_norm": 5.982173504781401, "learning_rate": 1.995424270231148e-05, "loss": 1.0042, "step": 936 }, { "epoch": 0.11933075441361415, "grad_norm": 6.007968697173025, "learning_rate": 1.9954045411409864e-05, "loss": 1.0089, "step": 937 }, { "epoch": 0.11945810847382078, "grad_norm": 4.726619196780303, "learning_rate": 1.9953847697074472e-05, "loss": 0.984, "step": 938 }, { "epoch": 0.11958546253402741, "grad_norm": 4.7802597715905275, "learning_rate": 1.995364955931371e-05, "loss": 1.0665, "step": 939 }, { "epoch": 0.11971281659423405, "grad_norm": 4.51709910688604, "learning_rate": 1.995345099813601e-05, "loss": 0.9949, "step": 940 }, { "epoch": 0.11984017065444068, "grad_norm": 5.500703073126749, "learning_rate": 1.9953252013549816e-05, "loss": 1.0011, "step": 941 }, { "epoch": 0.1199675247146473, "grad_norm": 6.24212088805979, "learning_rate": 1.9953052605563596e-05, "loss": 1.0222, "step": 942 }, { "epoch": 0.12009487877485395, "grad_norm": 4.63458043245687, "learning_rate": 1.995285277418583e-05, "loss": 1.0849, "step": 943 }, { "epoch": 0.12022223283506057, "grad_norm": 3.6977378837815587, "learning_rate": 1.9952652519425016e-05, "loss": 0.9489, "step": 944 }, { "epoch": 0.1203495868952672, "grad_norm": 5.320802371961369, "learning_rate": 1.995245184128968e-05, "loss": 1.1189, "step": 945 }, { "epoch": 0.12047694095547384, "grad_norm": 5.860292896853712, "learning_rate": 1.9952250739788356e-05, "loss": 0.9716, "step": 946 }, { "epoch": 0.12060429501568047, "grad_norm": 5.153604094452106, "learning_rate": 1.995204921492959e-05, "loss": 0.9592, "step": 947 }, { "epoch": 0.1207316490758871, "grad_norm": 7.173086026737213, "learning_rate": 1.995184726672197e-05, "loss": 1.0795, "step": 948 }, { "epoch": 0.12085900313609373, "grad_norm": 6.421640656365196, "learning_rate": 1.9951644895174076e-05, "loss": 1.0193, "step": 949 }, { "epoch": 0.12098635719630037, "grad_norm": 5.4544739024931195, "learning_rate": 1.995144210029452e-05, "loss": 0.9711, "step": 950 }, { "epoch": 0.121113711256507, "grad_norm": 5.661363850647127, "learning_rate": 1.9951238882091926e-05, "loss": 1.0674, "step": 951 }, { "epoch": 0.12124106531671362, "grad_norm": 7.360293450517743, "learning_rate": 1.995103524057494e-05, "loss": 1.0153, "step": 952 }, { "epoch": 0.12136841937692026, "grad_norm": 4.63858305836024, "learning_rate": 1.995083117575223e-05, "loss": 1.1147, "step": 953 }, { "epoch": 0.12149577343712689, "grad_norm": 5.686394552597674, "learning_rate": 1.9950626687632466e-05, "loss": 0.9923, "step": 954 }, { "epoch": 0.12162312749733352, "grad_norm": 6.112128610386265, "learning_rate": 1.9950421776224353e-05, "loss": 0.9806, "step": 955 }, { "epoch": 0.12175048155754016, "grad_norm": 6.282131759471412, "learning_rate": 1.9950216441536613e-05, "loss": 1.165, "step": 956 }, { "epoch": 0.12187783561774679, "grad_norm": 5.361202895850754, "learning_rate": 1.9950010683577968e-05, "loss": 1.0751, "step": 957 }, { "epoch": 0.12200518967795342, "grad_norm": 4.535062251730972, "learning_rate": 1.9949804502357183e-05, "loss": 1.0677, "step": 958 }, { "epoch": 0.12213254373816006, "grad_norm": 6.974919756902898, "learning_rate": 1.994959789788302e-05, "loss": 1.0085, "step": 959 }, { "epoch": 0.12225989779836668, "grad_norm": 5.319029621165861, "learning_rate": 1.994939087016427e-05, "loss": 1.004, "step": 960 }, { "epoch": 0.12238725185857331, "grad_norm": 6.546630563474553, "learning_rate": 1.9949183419209747e-05, "loss": 0.9709, "step": 961 }, { "epoch": 0.12251460591877995, "grad_norm": 5.408159569220149, "learning_rate": 1.9948975545028263e-05, "loss": 1.0071, "step": 962 }, { "epoch": 0.12264195997898658, "grad_norm": 5.251176611431323, "learning_rate": 1.994876724762867e-05, "loss": 1.0617, "step": 963 }, { "epoch": 0.12276931403919321, "grad_norm": 4.902891917186358, "learning_rate": 1.9948558527019826e-05, "loss": 1.0336, "step": 964 }, { "epoch": 0.12289666809939985, "grad_norm": 6.705538939706773, "learning_rate": 1.994834938321061e-05, "loss": 1.0693, "step": 965 }, { "epoch": 0.12302402215960648, "grad_norm": 6.087297201988893, "learning_rate": 1.994813981620992e-05, "loss": 1.0077, "step": 966 }, { "epoch": 0.1231513762198131, "grad_norm": 4.56559021693041, "learning_rate": 1.9947929826026668e-05, "loss": 1.0278, "step": 967 }, { "epoch": 0.12327873028001975, "grad_norm": 4.71483525462677, "learning_rate": 1.9947719412669787e-05, "loss": 1.1168, "step": 968 }, { "epoch": 0.12340608434022637, "grad_norm": 6.379245316884144, "learning_rate": 1.994750857614823e-05, "loss": 1.0561, "step": 969 }, { "epoch": 0.123533438400433, "grad_norm": 5.052821385327841, "learning_rate": 1.9947297316470963e-05, "loss": 1.1216, "step": 970 }, { "epoch": 0.12366079246063963, "grad_norm": 4.041449839225834, "learning_rate": 1.9947085633646977e-05, "loss": 0.9505, "step": 971 }, { "epoch": 0.12378814652084627, "grad_norm": 4.702999278064229, "learning_rate": 1.994687352768527e-05, "loss": 1.1083, "step": 972 }, { "epoch": 0.1239155005810529, "grad_norm": 4.5330116793643835, "learning_rate": 1.994666099859487e-05, "loss": 1.0322, "step": 973 }, { "epoch": 0.12404285464125953, "grad_norm": 5.515008735269388, "learning_rate": 1.994644804638482e-05, "loss": 1.005, "step": 974 }, { "epoch": 0.12417020870146617, "grad_norm": 8.7381555226762, "learning_rate": 1.9946234671064172e-05, "loss": 0.9686, "step": 975 }, { "epoch": 0.1242975627616728, "grad_norm": 5.550035376908558, "learning_rate": 1.994602087264201e-05, "loss": 1.108, "step": 976 }, { "epoch": 0.12442491682187942, "grad_norm": 5.977919959197699, "learning_rate": 1.994580665112742e-05, "loss": 1.0092, "step": 977 }, { "epoch": 0.12455227088208606, "grad_norm": 4.282609932418776, "learning_rate": 1.994559200652952e-05, "loss": 1.0182, "step": 978 }, { "epoch": 0.12467962494229269, "grad_norm": 5.907460245135092, "learning_rate": 1.9945376938857443e-05, "loss": 0.9318, "step": 979 }, { "epoch": 0.12480697900249932, "grad_norm": 4.831551629010998, "learning_rate": 1.994516144812033e-05, "loss": 1.0026, "step": 980 }, { "epoch": 0.12493433306270596, "grad_norm": 4.311846255079303, "learning_rate": 1.994494553432736e-05, "loss": 0.9798, "step": 981 }, { "epoch": 0.12506168712291257, "grad_norm": 5.6537046659323265, "learning_rate": 1.9944729197487702e-05, "loss": 1.0372, "step": 982 }, { "epoch": 0.12518904118311922, "grad_norm": 8.135826527084301, "learning_rate": 1.994451243761057e-05, "loss": 1.1158, "step": 983 }, { "epoch": 0.12531639524332586, "grad_norm": 7.315360789394048, "learning_rate": 1.9944295254705187e-05, "loss": 1.1006, "step": 984 }, { "epoch": 0.12544374930353247, "grad_norm": 8.115377978949036, "learning_rate": 1.994407764878078e-05, "loss": 1.0766, "step": 985 }, { "epoch": 0.1255711033637391, "grad_norm": 5.391810013262356, "learning_rate": 1.9943859619846617e-05, "loss": 1.0636, "step": 986 }, { "epoch": 0.12569845742394575, "grad_norm": 6.15608052441614, "learning_rate": 1.9943641167911965e-05, "loss": 1.0649, "step": 987 }, { "epoch": 0.12582581148415237, "grad_norm": 8.880531843600096, "learning_rate": 1.994342229298612e-05, "loss": 1.0144, "step": 988 }, { "epoch": 0.125953165544359, "grad_norm": 6.462124439889047, "learning_rate": 1.9943202995078394e-05, "loss": 1.0238, "step": 989 }, { "epoch": 0.12608051960456565, "grad_norm": 10.131981973721643, "learning_rate": 1.994298327419811e-05, "loss": 1.0622, "step": 990 }, { "epoch": 0.12620787366477226, "grad_norm": 6.29611830033835, "learning_rate": 1.9942763130354624e-05, "loss": 1.1349, "step": 991 }, { "epoch": 0.1263352277249789, "grad_norm": 5.458378138855743, "learning_rate": 1.9942542563557294e-05, "loss": 1.0549, "step": 992 }, { "epoch": 0.12646258178518555, "grad_norm": 5.555032754246985, "learning_rate": 1.9942321573815502e-05, "loss": 0.9425, "step": 993 }, { "epoch": 0.12658993584539216, "grad_norm": 4.502659753383141, "learning_rate": 1.994210016113865e-05, "loss": 1.1005, "step": 994 }, { "epoch": 0.1267172899055988, "grad_norm": 5.353733663701035, "learning_rate": 1.994187832553616e-05, "loss": 1.0607, "step": 995 }, { "epoch": 0.12684464396580544, "grad_norm": 3.80325171228991, "learning_rate": 1.9941656067017466e-05, "loss": 1.139, "step": 996 }, { "epoch": 0.12697199802601206, "grad_norm": 4.255154345141497, "learning_rate": 1.9941433385592022e-05, "loss": 0.9297, "step": 997 }, { "epoch": 0.1270993520862187, "grad_norm": 4.50577894650802, "learning_rate": 1.99412102812693e-05, "loss": 1.0254, "step": 998 }, { "epoch": 0.12722670614642534, "grad_norm": 6.663816351330356, "learning_rate": 1.9940986754058792e-05, "loss": 1.1039, "step": 999 }, { "epoch": 0.12735406020663195, "grad_norm": 6.1321546845655694, "learning_rate": 1.9940762803970006e-05, "loss": 1.0365, "step": 1000 }, { "epoch": 0.1274814142668386, "grad_norm": 5.012189883106625, "learning_rate": 1.9940538431012472e-05, "loss": 1.0125, "step": 1001 }, { "epoch": 0.12760876832704524, "grad_norm": 4.761917222778728, "learning_rate": 1.9940313635195728e-05, "loss": 1.0547, "step": 1002 }, { "epoch": 0.12773612238725185, "grad_norm": 5.289529123387824, "learning_rate": 1.9940088416529342e-05, "loss": 1.0712, "step": 1003 }, { "epoch": 0.1278634764474585, "grad_norm": 5.437705902059703, "learning_rate": 1.9939862775022893e-05, "loss": 1.0807, "step": 1004 }, { "epoch": 0.12799083050766513, "grad_norm": 6.951708291416241, "learning_rate": 1.993963671068598e-05, "loss": 1.0059, "step": 1005 }, { "epoch": 0.12811818456787175, "grad_norm": 4.9604876456962845, "learning_rate": 1.9939410223528215e-05, "loss": 1.0181, "step": 1006 }, { "epoch": 0.1282455386280784, "grad_norm": 7.510870367376681, "learning_rate": 1.993918331355924e-05, "loss": 1.0302, "step": 1007 }, { "epoch": 0.12837289268828503, "grad_norm": 5.293053432786597, "learning_rate": 1.9938955980788703e-05, "loss": 1.0705, "step": 1008 }, { "epoch": 0.12850024674849164, "grad_norm": 6.103049982237845, "learning_rate": 1.9938728225226273e-05, "loss": 0.9717, "step": 1009 }, { "epoch": 0.12862760080869828, "grad_norm": 4.882607026926755, "learning_rate": 1.9938500046881643e-05, "loss": 0.9793, "step": 1010 }, { "epoch": 0.12875495486890493, "grad_norm": 5.646936831956459, "learning_rate": 1.9938271445764515e-05, "loss": 0.9924, "step": 1011 }, { "epoch": 0.12888230892911154, "grad_norm": 6.42042187216709, "learning_rate": 1.9938042421884617e-05, "loss": 0.9512, "step": 1012 }, { "epoch": 0.12900966298931818, "grad_norm": 4.131351782487081, "learning_rate": 1.993781297525169e-05, "loss": 1.0183, "step": 1013 }, { "epoch": 0.12913701704952482, "grad_norm": 5.587315444117175, "learning_rate": 1.9937583105875494e-05, "loss": 0.9225, "step": 1014 }, { "epoch": 0.12926437110973144, "grad_norm": 5.817599121088648, "learning_rate": 1.9937352813765808e-05, "loss": 0.9889, "step": 1015 }, { "epoch": 0.12939172516993808, "grad_norm": 4.717808307383814, "learning_rate": 1.9937122098932428e-05, "loss": 0.9959, "step": 1016 }, { "epoch": 0.12951907923014472, "grad_norm": 4.6219142208021475, "learning_rate": 1.9936890961385168e-05, "loss": 0.9909, "step": 1017 }, { "epoch": 0.12964643329035133, "grad_norm": 5.360274258899001, "learning_rate": 1.993665940113386e-05, "loss": 1.0057, "step": 1018 }, { "epoch": 0.12977378735055797, "grad_norm": 5.580861336737161, "learning_rate": 1.9936427418188357e-05, "loss": 0.9989, "step": 1019 }, { "epoch": 0.12990114141076461, "grad_norm": 5.159156858475661, "learning_rate": 1.9936195012558524e-05, "loss": 0.9662, "step": 1020 }, { "epoch": 0.13002849547097123, "grad_norm": 5.548067034656748, "learning_rate": 1.993596218425425e-05, "loss": 1.0041, "step": 1021 }, { "epoch": 0.13015584953117787, "grad_norm": 4.9843518486800695, "learning_rate": 1.9935728933285438e-05, "loss": 1.0422, "step": 1022 }, { "epoch": 0.1302832035913845, "grad_norm": 4.844871735661303, "learning_rate": 1.9935495259662008e-05, "loss": 1.0537, "step": 1023 }, { "epoch": 0.13041055765159112, "grad_norm": 5.4011935363233645, "learning_rate": 1.9935261163393904e-05, "loss": 0.9057, "step": 1024 }, { "epoch": 0.13053791171179777, "grad_norm": 5.113145096425448, "learning_rate": 1.9935026644491082e-05, "loss": 1.0, "step": 1025 }, { "epoch": 0.1306652657720044, "grad_norm": 4.776971436010882, "learning_rate": 1.9934791702963515e-05, "loss": 1.0148, "step": 1026 }, { "epoch": 0.13079261983221102, "grad_norm": 5.132128897887906, "learning_rate": 1.9934556338821206e-05, "loss": 0.9756, "step": 1027 }, { "epoch": 0.13091997389241766, "grad_norm": 4.755116768575519, "learning_rate": 1.9934320552074162e-05, "loss": 1.0135, "step": 1028 }, { "epoch": 0.13104732795262428, "grad_norm": 4.623487276696922, "learning_rate": 1.9934084342732413e-05, "loss": 0.9728, "step": 1029 }, { "epoch": 0.13117468201283092, "grad_norm": 5.275131383046155, "learning_rate": 1.9933847710806e-05, "loss": 0.9805, "step": 1030 }, { "epoch": 0.13130203607303756, "grad_norm": 5.539016357574028, "learning_rate": 1.9933610656305006e-05, "loss": 1.1283, "step": 1031 }, { "epoch": 0.13142939013324417, "grad_norm": 5.368997460236626, "learning_rate": 1.99333731792395e-05, "loss": 1.0306, "step": 1032 }, { "epoch": 0.13155674419345081, "grad_norm": 4.374498829065241, "learning_rate": 1.9933135279619592e-05, "loss": 0.9264, "step": 1033 }, { "epoch": 0.13168409825365746, "grad_norm": 4.632795825947545, "learning_rate": 1.9932896957455397e-05, "loss": 0.9511, "step": 1034 }, { "epoch": 0.13181145231386407, "grad_norm": 5.802893791461331, "learning_rate": 1.9932658212757053e-05, "loss": 1.0492, "step": 1035 }, { "epoch": 0.1319388063740707, "grad_norm": 7.5402494612894975, "learning_rate": 1.9932419045534724e-05, "loss": 1.1128, "step": 1036 }, { "epoch": 0.13206616043427735, "grad_norm": 5.214650079795567, "learning_rate": 1.9932179455798574e-05, "loss": 0.9812, "step": 1037 }, { "epoch": 0.13219351449448397, "grad_norm": 5.761631024226228, "learning_rate": 1.9931939443558803e-05, "loss": 1.0368, "step": 1038 }, { "epoch": 0.1323208685546906, "grad_norm": 5.505637660709738, "learning_rate": 1.993169900882561e-05, "loss": 1.0554, "step": 1039 }, { "epoch": 0.13244822261489725, "grad_norm": 7.051467786001133, "learning_rate": 1.9931458151609234e-05, "loss": 1.0514, "step": 1040 }, { "epoch": 0.13257557667510386, "grad_norm": 6.307248187578084, "learning_rate": 1.9931216871919914e-05, "loss": 1.0074, "step": 1041 }, { "epoch": 0.1327029307353105, "grad_norm": 5.105711016930289, "learning_rate": 1.9930975169767918e-05, "loss": 1.0188, "step": 1042 }, { "epoch": 0.13283028479551715, "grad_norm": 5.433468782261706, "learning_rate": 1.9930733045163525e-05, "loss": 0.9844, "step": 1043 }, { "epoch": 0.13295763885572376, "grad_norm": 6.700120372544943, "learning_rate": 1.9930490498117035e-05, "loss": 1.0511, "step": 1044 }, { "epoch": 0.1330849929159304, "grad_norm": 6.7738719831634775, "learning_rate": 1.9930247528638768e-05, "loss": 0.9928, "step": 1045 }, { "epoch": 0.13321234697613704, "grad_norm": 6.388895187743395, "learning_rate": 1.9930004136739058e-05, "loss": 0.9695, "step": 1046 }, { "epoch": 0.13333970103634366, "grad_norm": 5.2922802466753955, "learning_rate": 1.9929760322428256e-05, "loss": 0.9448, "step": 1047 }, { "epoch": 0.1334670550965503, "grad_norm": 5.865390141642061, "learning_rate": 1.9929516085716736e-05, "loss": 1.1244, "step": 1048 }, { "epoch": 0.13359440915675694, "grad_norm": 5.007676716360697, "learning_rate": 1.992927142661489e-05, "loss": 0.9693, "step": 1049 }, { "epoch": 0.13372176321696355, "grad_norm": 5.057958001797684, "learning_rate": 1.992902634513312e-05, "loss": 0.949, "step": 1050 }, { "epoch": 0.1338491172771702, "grad_norm": 4.937284440755442, "learning_rate": 1.9928780841281858e-05, "loss": 1.0291, "step": 1051 }, { "epoch": 0.13397647133737683, "grad_norm": 5.169117541086143, "learning_rate": 1.9928534915071543e-05, "loss": 1.0791, "step": 1052 }, { "epoch": 0.13410382539758345, "grad_norm": 4.1750885091753736, "learning_rate": 1.9928288566512638e-05, "loss": 1.0141, "step": 1053 }, { "epoch": 0.1342311794577901, "grad_norm": 7.6227363603328575, "learning_rate": 1.9928041795615616e-05, "loss": 0.9137, "step": 1054 }, { "epoch": 0.13435853351799673, "grad_norm": 5.799733570161933, "learning_rate": 1.992779460239099e-05, "loss": 1.039, "step": 1055 }, { "epoch": 0.13448588757820334, "grad_norm": 5.177060573023828, "learning_rate": 1.9927546986849258e-05, "loss": 0.977, "step": 1056 }, { "epoch": 0.13461324163841, "grad_norm": 7.3578280951204444, "learning_rate": 1.9927298949000965e-05, "loss": 0.9648, "step": 1057 }, { "epoch": 0.13474059569861663, "grad_norm": 4.946775198994267, "learning_rate": 1.9927050488856657e-05, "loss": 1.1389, "step": 1058 }, { "epoch": 0.13486794975882324, "grad_norm": 6.061502590914894, "learning_rate": 1.9926801606426906e-05, "loss": 1.0901, "step": 1059 }, { "epoch": 0.13499530381902988, "grad_norm": 5.923565706250715, "learning_rate": 1.992655230172229e-05, "loss": 1.0891, "step": 1060 }, { "epoch": 0.13512265787923652, "grad_norm": 4.96207252794465, "learning_rate": 1.992630257475343e-05, "loss": 0.9961, "step": 1061 }, { "epoch": 0.13525001193944314, "grad_norm": 7.651995788732587, "learning_rate": 1.9926052425530936e-05, "loss": 0.9719, "step": 1062 }, { "epoch": 0.13537736599964978, "grad_norm": 6.374845016689928, "learning_rate": 1.9925801854065456e-05, "loss": 0.9967, "step": 1063 }, { "epoch": 0.13550472005985642, "grad_norm": 5.183903929938708, "learning_rate": 1.9925550860367646e-05, "loss": 0.986, "step": 1064 }, { "epoch": 0.13563207412006303, "grad_norm": 5.011184200046132, "learning_rate": 1.9925299444448183e-05, "loss": 0.9907, "step": 1065 }, { "epoch": 0.13575942818026968, "grad_norm": 6.122976786381155, "learning_rate": 1.9925047606317766e-05, "loss": 1.0059, "step": 1066 }, { "epoch": 0.13588678224047632, "grad_norm": 5.423363560969625, "learning_rate": 1.9924795345987103e-05, "loss": 1.0698, "step": 1067 }, { "epoch": 0.13601413630068293, "grad_norm": 5.226175473513691, "learning_rate": 1.9924542663466925e-05, "loss": 1.0515, "step": 1068 }, { "epoch": 0.13614149036088957, "grad_norm": 6.246310861904402, "learning_rate": 1.9924289558767982e-05, "loss": 1.096, "step": 1069 }, { "epoch": 0.1362688444210962, "grad_norm": 6.652299021796196, "learning_rate": 1.9924036031901042e-05, "loss": 1.0495, "step": 1070 }, { "epoch": 0.13639619848130283, "grad_norm": 5.8620075466350805, "learning_rate": 1.992378208287689e-05, "loss": 0.9825, "step": 1071 }, { "epoch": 0.13652355254150947, "grad_norm": 6.050447734312274, "learning_rate": 1.992352771170633e-05, "loss": 0.9908, "step": 1072 }, { "epoch": 0.13665090660171608, "grad_norm": 5.240944454895212, "learning_rate": 1.9923272918400175e-05, "loss": 0.9909, "step": 1073 }, { "epoch": 0.13677826066192272, "grad_norm": 5.091152909868271, "learning_rate": 1.992301770296927e-05, "loss": 1.0386, "step": 1074 }, { "epoch": 0.13690561472212937, "grad_norm": 5.6087775377108064, "learning_rate": 1.9922762065424474e-05, "loss": 1.0048, "step": 1075 }, { "epoch": 0.13703296878233598, "grad_norm": 6.818629036310338, "learning_rate": 1.992250600577665e-05, "loss": 0.9762, "step": 1076 }, { "epoch": 0.13716032284254262, "grad_norm": 6.777493187743689, "learning_rate": 1.9922249524036704e-05, "loss": 1.1636, "step": 1077 }, { "epoch": 0.13728767690274926, "grad_norm": 4.841139354431293, "learning_rate": 1.9921992620215544e-05, "loss": 1.0353, "step": 1078 }, { "epoch": 0.13741503096295588, "grad_norm": 4.447936344624334, "learning_rate": 1.992173529432409e-05, "loss": 1.0011, "step": 1079 }, { "epoch": 0.13754238502316252, "grad_norm": 7.258787157460566, "learning_rate": 1.9921477546373296e-05, "loss": 0.9964, "step": 1080 }, { "epoch": 0.13766973908336916, "grad_norm": 6.816925011599404, "learning_rate": 1.9921219376374123e-05, "loss": 0.8766, "step": 1081 }, { "epoch": 0.13779709314357577, "grad_norm": 5.154850070375168, "learning_rate": 1.9920960784337552e-05, "loss": 1.0945, "step": 1082 }, { "epoch": 0.1379244472037824, "grad_norm": 3.4197004541756377, "learning_rate": 1.992070177027459e-05, "loss": 0.96, "step": 1083 }, { "epoch": 0.13805180126398905, "grad_norm": 4.389033453340384, "learning_rate": 1.9920442334196248e-05, "loss": 1.0093, "step": 1084 }, { "epoch": 0.13817915532419567, "grad_norm": 5.624768231173525, "learning_rate": 1.9920182476113564e-05, "loss": 0.9758, "step": 1085 }, { "epoch": 0.1383065093844023, "grad_norm": 4.090395885605275, "learning_rate": 1.9919922196037596e-05, "loss": 0.984, "step": 1086 }, { "epoch": 0.13843386344460895, "grad_norm": 4.890555384326312, "learning_rate": 1.9919661493979408e-05, "loss": 1.0094, "step": 1087 }, { "epoch": 0.13856121750481556, "grad_norm": 9.665373310076777, "learning_rate": 1.99194003699501e-05, "loss": 1.0565, "step": 1088 }, { "epoch": 0.1386885715650222, "grad_norm": 5.37681690823644, "learning_rate": 1.991913882396077e-05, "loss": 0.9501, "step": 1089 }, { "epoch": 0.13881592562522885, "grad_norm": 4.849219402929612, "learning_rate": 1.991887685602255e-05, "loss": 0.9674, "step": 1090 }, { "epoch": 0.13894327968543546, "grad_norm": 3.6688144346749265, "learning_rate": 1.9918614466146586e-05, "loss": 1.0883, "step": 1091 }, { "epoch": 0.1390706337456421, "grad_norm": 6.9702190681083955, "learning_rate": 1.9918351654344033e-05, "loss": 1.0757, "step": 1092 }, { "epoch": 0.13919798780584874, "grad_norm": 4.918950450259129, "learning_rate": 1.991808842062607e-05, "loss": 1.042, "step": 1093 }, { "epoch": 0.13932534186605536, "grad_norm": 4.6766149658767615, "learning_rate": 1.9917824765003905e-05, "loss": 1.0009, "step": 1094 }, { "epoch": 0.139452695926262, "grad_norm": 5.286134165579219, "learning_rate": 1.9917560687488743e-05, "loss": 0.9648, "step": 1095 }, { "epoch": 0.13958004998646864, "grad_norm": 4.8150572484813035, "learning_rate": 1.9917296188091823e-05, "loss": 0.9133, "step": 1096 }, { "epoch": 0.13970740404667525, "grad_norm": 5.7510746490669575, "learning_rate": 1.9917031266824395e-05, "loss": 0.8875, "step": 1097 }, { "epoch": 0.1398347581068819, "grad_norm": 5.203916380671531, "learning_rate": 1.991676592369773e-05, "loss": 0.9772, "step": 1098 }, { "epoch": 0.13996211216708854, "grad_norm": 4.728170729754482, "learning_rate": 1.991650015872311e-05, "loss": 1.087, "step": 1099 }, { "epoch": 0.14008946622729515, "grad_norm": 4.143056479483173, "learning_rate": 1.991623397191185e-05, "loss": 1.0218, "step": 1100 }, { "epoch": 0.1402168202875018, "grad_norm": 4.069189579388974, "learning_rate": 1.9915967363275264e-05, "loss": 0.9937, "step": 1101 }, { "epoch": 0.14034417434770843, "grad_norm": 9.021029169953739, "learning_rate": 1.9915700332824696e-05, "loss": 0.9962, "step": 1102 }, { "epoch": 0.14047152840791505, "grad_norm": 5.2339468254213175, "learning_rate": 1.9915432880571508e-05, "loss": 1.0367, "step": 1103 }, { "epoch": 0.1405988824681217, "grad_norm": 5.262071083693544, "learning_rate": 1.9915165006527076e-05, "loss": 1.0258, "step": 1104 }, { "epoch": 0.14072623652832833, "grad_norm": 5.158737784672319, "learning_rate": 1.991489671070279e-05, "loss": 0.9628, "step": 1105 }, { "epoch": 0.14085359058853494, "grad_norm": 4.923590079715338, "learning_rate": 1.9914627993110072e-05, "loss": 0.9043, "step": 1106 }, { "epoch": 0.14098094464874159, "grad_norm": 5.1952724828111085, "learning_rate": 1.9914358853760346e-05, "loss": 0.9633, "step": 1107 }, { "epoch": 0.14110829870894823, "grad_norm": 4.745809212149704, "learning_rate": 1.9914089292665065e-05, "loss": 0.9792, "step": 1108 }, { "epoch": 0.14123565276915484, "grad_norm": 5.416111413911312, "learning_rate": 1.991381930983569e-05, "loss": 1.0286, "step": 1109 }, { "epoch": 0.14136300682936148, "grad_norm": 5.574909726083418, "learning_rate": 1.9913548905283714e-05, "loss": 1.083, "step": 1110 }, { "epoch": 0.14149036088956812, "grad_norm": 3.9528528655095663, "learning_rate": 1.9913278079020633e-05, "loss": 0.9055, "step": 1111 }, { "epoch": 0.14161771494977474, "grad_norm": 6.877030228906015, "learning_rate": 1.9913006831057967e-05, "loss": 1.0158, "step": 1112 }, { "epoch": 0.14174506900998138, "grad_norm": 5.337862217782534, "learning_rate": 1.9912735161407264e-05, "loss": 1.0859, "step": 1113 }, { "epoch": 0.14187242307018802, "grad_norm": 5.764308872267546, "learning_rate": 1.991246307008007e-05, "loss": 0.9251, "step": 1114 }, { "epoch": 0.14199977713039463, "grad_norm": 5.64803125970419, "learning_rate": 1.9912190557087964e-05, "loss": 0.9771, "step": 1115 }, { "epoch": 0.14212713119060127, "grad_norm": 4.898251762701724, "learning_rate": 1.991191762244254e-05, "loss": 1.0447, "step": 1116 }, { "epoch": 0.14225448525080792, "grad_norm": 6.234584565811258, "learning_rate": 1.9911644266155402e-05, "loss": 1.0607, "step": 1117 }, { "epoch": 0.14238183931101453, "grad_norm": 5.266679154554281, "learning_rate": 1.9911370488238185e-05, "loss": 0.9349, "step": 1118 }, { "epoch": 0.14250919337122117, "grad_norm": 6.256153586880418, "learning_rate": 1.9911096288702532e-05, "loss": 1.2035, "step": 1119 }, { "epoch": 0.14263654743142778, "grad_norm": 6.450600044668659, "learning_rate": 1.9910821667560106e-05, "loss": 1.0081, "step": 1120 }, { "epoch": 0.14276390149163443, "grad_norm": 3.8362862919805774, "learning_rate": 1.9910546624822596e-05, "loss": 1.0396, "step": 1121 }, { "epoch": 0.14289125555184107, "grad_norm": 5.81600157570588, "learning_rate": 1.9910271160501694e-05, "loss": 0.9653, "step": 1122 }, { "epoch": 0.14301860961204768, "grad_norm": 4.879159571085699, "learning_rate": 1.990999527460912e-05, "loss": 0.9589, "step": 1123 }, { "epoch": 0.14314596367225432, "grad_norm": 6.8926005396879795, "learning_rate": 1.990971896715661e-05, "loss": 1.0908, "step": 1124 }, { "epoch": 0.14327331773246096, "grad_norm": 7.01316195540877, "learning_rate": 1.990944223815592e-05, "loss": 1.0536, "step": 1125 }, { "epoch": 0.14340067179266758, "grad_norm": 4.769891348826958, "learning_rate": 1.990916508761882e-05, "loss": 1.0685, "step": 1126 }, { "epoch": 0.14352802585287422, "grad_norm": 5.274988017136401, "learning_rate": 1.9908887515557103e-05, "loss": 0.9612, "step": 1127 }, { "epoch": 0.14365537991308086, "grad_norm": 4.753994918159333, "learning_rate": 1.990860952198257e-05, "loss": 1.0291, "step": 1128 }, { "epoch": 0.14378273397328747, "grad_norm": 4.599970958452185, "learning_rate": 1.990833110690705e-05, "loss": 1.0102, "step": 1129 }, { "epoch": 0.14391008803349412, "grad_norm": 5.262607513561777, "learning_rate": 1.990805227034239e-05, "loss": 0.9505, "step": 1130 }, { "epoch": 0.14403744209370076, "grad_norm": 4.750430438967478, "learning_rate": 1.9907773012300442e-05, "loss": 1.1432, "step": 1131 }, { "epoch": 0.14416479615390737, "grad_norm": 5.5534318022376485, "learning_rate": 1.99074933327931e-05, "loss": 1.0166, "step": 1132 }, { "epoch": 0.144292150214114, "grad_norm": 4.36148381243615, "learning_rate": 1.9907213231832244e-05, "loss": 0.9515, "step": 1133 }, { "epoch": 0.14441950427432065, "grad_norm": 5.396342465843601, "learning_rate": 1.99069327094298e-05, "loss": 0.988, "step": 1134 }, { "epoch": 0.14454685833452727, "grad_norm": 4.160370125444187, "learning_rate": 1.99066517655977e-05, "loss": 0.9402, "step": 1135 }, { "epoch": 0.1446742123947339, "grad_norm": 4.221462705684877, "learning_rate": 1.990637040034789e-05, "loss": 0.923, "step": 1136 }, { "epoch": 0.14480156645494055, "grad_norm": 6.253117203600368, "learning_rate": 1.9906088613692348e-05, "loss": 1.0339, "step": 1137 }, { "epoch": 0.14492892051514716, "grad_norm": 6.017093469305539, "learning_rate": 1.9905806405643053e-05, "loss": 1.0136, "step": 1138 }, { "epoch": 0.1450562745753538, "grad_norm": 4.399259175443589, "learning_rate": 1.990552377621201e-05, "loss": 1.0272, "step": 1139 }, { "epoch": 0.14518362863556045, "grad_norm": 4.547342282958136, "learning_rate": 1.990524072541125e-05, "loss": 0.9937, "step": 1140 }, { "epoch": 0.14531098269576706, "grad_norm": 4.855709152837835, "learning_rate": 1.9904957253252804e-05, "loss": 0.9244, "step": 1141 }, { "epoch": 0.1454383367559737, "grad_norm": 5.5757047814984615, "learning_rate": 1.9904673359748735e-05, "loss": 0.9908, "step": 1142 }, { "epoch": 0.14556569081618034, "grad_norm": 6.043856870540701, "learning_rate": 1.9904389044911122e-05, "loss": 1.0196, "step": 1143 }, { "epoch": 0.14569304487638696, "grad_norm": 4.801416490086993, "learning_rate": 1.9904104308752053e-05, "loss": 1.0553, "step": 1144 }, { "epoch": 0.1458203989365936, "grad_norm": 5.379496091195757, "learning_rate": 1.9903819151283645e-05, "loss": 1.0304, "step": 1145 }, { "epoch": 0.14594775299680024, "grad_norm": 4.545430481586915, "learning_rate": 1.9903533572518026e-05, "loss": 0.9908, "step": 1146 }, { "epoch": 0.14607510705700685, "grad_norm": 5.220752628775305, "learning_rate": 1.9903247572467344e-05, "loss": 1.1251, "step": 1147 }, { "epoch": 0.1462024611172135, "grad_norm": 6.674829190815485, "learning_rate": 1.990296115114377e-05, "loss": 1.0095, "step": 1148 }, { "epoch": 0.14632981517742014, "grad_norm": 5.507004179382678, "learning_rate": 1.9902674308559483e-05, "loss": 1.0024, "step": 1149 }, { "epoch": 0.14645716923762675, "grad_norm": 4.71990972755875, "learning_rate": 1.9902387044726686e-05, "loss": 0.9865, "step": 1150 }, { "epoch": 0.1465845232978334, "grad_norm": 4.478833113678234, "learning_rate": 1.9902099359657597e-05, "loss": 0.9298, "step": 1151 }, { "epoch": 0.14671187735804003, "grad_norm": 6.364648530400351, "learning_rate": 1.9901811253364458e-05, "loss": 1.0776, "step": 1152 }, { "epoch": 0.14683923141824665, "grad_norm": 6.283898853112022, "learning_rate": 1.9901522725859523e-05, "loss": 1.0698, "step": 1153 }, { "epoch": 0.1469665854784533, "grad_norm": 4.844800872017544, "learning_rate": 1.9901233777155062e-05, "loss": 0.8839, "step": 1154 }, { "epoch": 0.14709393953865993, "grad_norm": 6.435025526587445, "learning_rate": 1.9900944407263373e-05, "loss": 1.0899, "step": 1155 }, { "epoch": 0.14722129359886654, "grad_norm": 6.109479898396119, "learning_rate": 1.9900654616196765e-05, "loss": 1.0966, "step": 1156 }, { "epoch": 0.14734864765907318, "grad_norm": 4.981320392872287, "learning_rate": 1.9900364403967555e-05, "loss": 0.9802, "step": 1157 }, { "epoch": 0.14747600171927983, "grad_norm": 6.337664612206646, "learning_rate": 1.9900073770588104e-05, "loss": 0.9783, "step": 1158 }, { "epoch": 0.14760335577948644, "grad_norm": 7.134358467286747, "learning_rate": 1.9899782716070764e-05, "loss": 1.0323, "step": 1159 }, { "epoch": 0.14773070983969308, "grad_norm": 6.583584293781033, "learning_rate": 1.9899491240427917e-05, "loss": 1.0655, "step": 1160 }, { "epoch": 0.14785806389989972, "grad_norm": 7.120906523954685, "learning_rate": 1.9899199343671968e-05, "loss": 1.0935, "step": 1161 }, { "epoch": 0.14798541796010634, "grad_norm": 5.433366980575255, "learning_rate": 1.9898907025815327e-05, "loss": 0.9979, "step": 1162 }, { "epoch": 0.14811277202031298, "grad_norm": 6.2705095615215205, "learning_rate": 1.9898614286870433e-05, "loss": 1.1334, "step": 1163 }, { "epoch": 0.1482401260805196, "grad_norm": 4.831201455882813, "learning_rate": 1.9898321126849743e-05, "loss": 1.0073, "step": 1164 }, { "epoch": 0.14836748014072623, "grad_norm": 4.672358765322368, "learning_rate": 1.9898027545765715e-05, "loss": 1.0821, "step": 1165 }, { "epoch": 0.14849483420093287, "grad_norm": 7.180955909703082, "learning_rate": 1.989773354363085e-05, "loss": 0.9288, "step": 1166 }, { "epoch": 0.1486221882611395, "grad_norm": 4.403280569757165, "learning_rate": 1.989743912045765e-05, "loss": 0.961, "step": 1167 }, { "epoch": 0.14874954232134613, "grad_norm": 8.908893607791116, "learning_rate": 1.9897144276258637e-05, "loss": 1.0988, "step": 1168 }, { "epoch": 0.14887689638155277, "grad_norm": 4.88691858604188, "learning_rate": 1.9896849011046356e-05, "loss": 0.9468, "step": 1169 }, { "epoch": 0.14900425044175938, "grad_norm": 4.439021534715612, "learning_rate": 1.989655332483337e-05, "loss": 1.09, "step": 1170 }, { "epoch": 0.14913160450196603, "grad_norm": 4.584781117513443, "learning_rate": 1.989625721763225e-05, "loss": 1.1378, "step": 1171 }, { "epoch": 0.14925895856217267, "grad_norm": 6.829460380092409, "learning_rate": 1.9895960689455598e-05, "loss": 1.0116, "step": 1172 }, { "epoch": 0.14938631262237928, "grad_norm": 5.925553037691101, "learning_rate": 1.9895663740316027e-05, "loss": 0.9667, "step": 1173 }, { "epoch": 0.14951366668258592, "grad_norm": 5.650634169645478, "learning_rate": 1.9895366370226164e-05, "loss": 1.0056, "step": 1174 }, { "epoch": 0.14964102074279256, "grad_norm": 6.303749288830858, "learning_rate": 1.9895068579198667e-05, "loss": 0.9515, "step": 1175 }, { "epoch": 0.14976837480299918, "grad_norm": 4.734757511813959, "learning_rate": 1.9894770367246197e-05, "loss": 1.0735, "step": 1176 }, { "epoch": 0.14989572886320582, "grad_norm": 6.024659029227881, "learning_rate": 1.9894471734381443e-05, "loss": 1.086, "step": 1177 }, { "epoch": 0.15002308292341246, "grad_norm": 7.097352579325189, "learning_rate": 1.989417268061711e-05, "loss": 0.951, "step": 1178 }, { "epoch": 0.15015043698361907, "grad_norm": 5.378720969428306, "learning_rate": 1.989387320596591e-05, "loss": 1.0527, "step": 1179 }, { "epoch": 0.15027779104382571, "grad_norm": 5.978861447650805, "learning_rate": 1.9893573310440592e-05, "loss": 1.0597, "step": 1180 }, { "epoch": 0.15040514510403236, "grad_norm": 8.018983252708948, "learning_rate": 1.989327299405391e-05, "loss": 0.8077, "step": 1181 }, { "epoch": 0.15053249916423897, "grad_norm": 5.923675182970437, "learning_rate": 1.9892972256818642e-05, "loss": 1.0515, "step": 1182 }, { "epoch": 0.1506598532244456, "grad_norm": 5.0075340729200315, "learning_rate": 1.989267109874758e-05, "loss": 0.9554, "step": 1183 }, { "epoch": 0.15078720728465225, "grad_norm": 4.6758452931338805, "learning_rate": 1.989236951985353e-05, "loss": 0.9426, "step": 1184 }, { "epoch": 0.15091456134485887, "grad_norm": 4.204069010290642, "learning_rate": 1.9892067520149325e-05, "loss": 1.0049, "step": 1185 }, { "epoch": 0.1510419154050655, "grad_norm": 5.048848336214381, "learning_rate": 1.989176509964781e-05, "loss": 1.0377, "step": 1186 }, { "epoch": 0.15116926946527215, "grad_norm": 6.138944051196484, "learning_rate": 1.9891462258361854e-05, "loss": 0.985, "step": 1187 }, { "epoch": 0.15129662352547876, "grad_norm": 5.009658449450558, "learning_rate": 1.9891158996304332e-05, "loss": 1.0061, "step": 1188 }, { "epoch": 0.1514239775856854, "grad_norm": 5.76423574524903, "learning_rate": 1.989085531348815e-05, "loss": 1.1038, "step": 1189 }, { "epoch": 0.15155133164589205, "grad_norm": 22.6411250699605, "learning_rate": 1.9890551209926228e-05, "loss": 1.0009, "step": 1190 }, { "epoch": 0.15167868570609866, "grad_norm": 7.817797032377672, "learning_rate": 1.9890246685631497e-05, "loss": 1.0513, "step": 1191 }, { "epoch": 0.1518060397663053, "grad_norm": 5.7520894110552065, "learning_rate": 1.9889941740616915e-05, "loss": 0.9986, "step": 1192 }, { "epoch": 0.15193339382651194, "grad_norm": 18.699414183880535, "learning_rate": 1.988963637489545e-05, "loss": 1.0264, "step": 1193 }, { "epoch": 0.15206074788671856, "grad_norm": 4.828371109600161, "learning_rate": 1.9889330588480092e-05, "loss": 1.1089, "step": 1194 }, { "epoch": 0.1521881019469252, "grad_norm": 5.636301851353449, "learning_rate": 1.9889024381383853e-05, "loss": 1.1113, "step": 1195 }, { "epoch": 0.15231545600713184, "grad_norm": 6.401306924892736, "learning_rate": 1.9888717753619756e-05, "loss": 1.0345, "step": 1196 }, { "epoch": 0.15244281006733845, "grad_norm": 4.8383120621596785, "learning_rate": 1.988841070520085e-05, "loss": 1.0335, "step": 1197 }, { "epoch": 0.1525701641275451, "grad_norm": 4.596736430265082, "learning_rate": 1.9888103236140187e-05, "loss": 0.9814, "step": 1198 }, { "epoch": 0.15269751818775174, "grad_norm": 5.860190047479529, "learning_rate": 1.988779534645085e-05, "loss": 1.0338, "step": 1199 }, { "epoch": 0.15282487224795835, "grad_norm": 8.045167730751048, "learning_rate": 1.9887487036145942e-05, "loss": 0.9122, "step": 1200 }, { "epoch": 0.152952226308165, "grad_norm": 5.64218312561491, "learning_rate": 1.988717830523857e-05, "loss": 1.0578, "step": 1201 }, { "epoch": 0.15307958036837163, "grad_norm": 5.637619781913279, "learning_rate": 1.9886869153741873e-05, "loss": 1.0015, "step": 1202 }, { "epoch": 0.15320693442857825, "grad_norm": 5.581245639367872, "learning_rate": 1.9886559581669e-05, "loss": 1.0501, "step": 1203 }, { "epoch": 0.1533342884887849, "grad_norm": 4.895855236680835, "learning_rate": 1.9886249589033115e-05, "loss": 1.0239, "step": 1204 }, { "epoch": 0.15346164254899153, "grad_norm": 6.5685870323690345, "learning_rate": 1.988593917584741e-05, "loss": 1.0942, "step": 1205 }, { "epoch": 0.15358899660919814, "grad_norm": 8.102356330141617, "learning_rate": 1.9885628342125093e-05, "loss": 1.093, "step": 1206 }, { "epoch": 0.15371635066940478, "grad_norm": 4.9198943497393985, "learning_rate": 1.9885317087879378e-05, "loss": 1.049, "step": 1207 }, { "epoch": 0.15384370472961142, "grad_norm": 4.840514055806769, "learning_rate": 1.9885005413123515e-05, "loss": 0.9917, "step": 1208 }, { "epoch": 0.15397105878981804, "grad_norm": 5.329710962069987, "learning_rate": 1.9884693317870754e-05, "loss": 0.8854, "step": 1209 }, { "epoch": 0.15409841285002468, "grad_norm": 6.40699250174998, "learning_rate": 1.9884380802134374e-05, "loss": 0.9925, "step": 1210 }, { "epoch": 0.1542257669102313, "grad_norm": 4.920757832972157, "learning_rate": 1.988406786592767e-05, "loss": 0.959, "step": 1211 }, { "epoch": 0.15435312097043793, "grad_norm": 5.794855058912914, "learning_rate": 1.9883754509263952e-05, "loss": 1.1196, "step": 1212 }, { "epoch": 0.15448047503064458, "grad_norm": 5.76384991041534, "learning_rate": 1.9883440732156553e-05, "loss": 1.0452, "step": 1213 }, { "epoch": 0.1546078290908512, "grad_norm": 5.213327114429976, "learning_rate": 1.9883126534618818e-05, "loss": 0.9344, "step": 1214 }, { "epoch": 0.15473518315105783, "grad_norm": 6.551134712996828, "learning_rate": 1.988281191666411e-05, "loss": 1.0484, "step": 1215 }, { "epoch": 0.15486253721126447, "grad_norm": 4.463543399675435, "learning_rate": 1.988249687830582e-05, "loss": 0.9069, "step": 1216 }, { "epoch": 0.1549898912714711, "grad_norm": 4.702098037275322, "learning_rate": 1.9882181419557342e-05, "loss": 1.0329, "step": 1217 }, { "epoch": 0.15511724533167773, "grad_norm": 5.438172923574985, "learning_rate": 1.9881865540432104e-05, "loss": 0.9534, "step": 1218 }, { "epoch": 0.15524459939188437, "grad_norm": 5.295644431518071, "learning_rate": 1.9881549240943533e-05, "loss": 1.0289, "step": 1219 }, { "epoch": 0.15537195345209098, "grad_norm": 4.500410485089305, "learning_rate": 1.988123252110509e-05, "loss": 1.0132, "step": 1220 }, { "epoch": 0.15549930751229762, "grad_norm": 4.585819379334822, "learning_rate": 1.9880915380930245e-05, "loss": 1.0024, "step": 1221 }, { "epoch": 0.15562666157250427, "grad_norm": 7.193516344506464, "learning_rate": 1.9880597820432493e-05, "loss": 0.9475, "step": 1222 }, { "epoch": 0.15575401563271088, "grad_norm": 5.1443825760360244, "learning_rate": 1.988027983962534e-05, "loss": 1.0516, "step": 1223 }, { "epoch": 0.15588136969291752, "grad_norm": 5.090082076634423, "learning_rate": 1.9879961438522312e-05, "loss": 0.966, "step": 1224 }, { "epoch": 0.15600872375312416, "grad_norm": 4.790525527190118, "learning_rate": 1.987964261713695e-05, "loss": 0.8954, "step": 1225 }, { "epoch": 0.15613607781333078, "grad_norm": 5.236927104927023, "learning_rate": 1.9879323375482825e-05, "loss": 1.0287, "step": 1226 }, { "epoch": 0.15626343187353742, "grad_norm": 5.950369610245666, "learning_rate": 1.987900371357351e-05, "loss": 1.0874, "step": 1227 }, { "epoch": 0.15639078593374406, "grad_norm": 5.524607144711156, "learning_rate": 1.9878683631422605e-05, "loss": 0.9406, "step": 1228 }, { "epoch": 0.15651813999395067, "grad_norm": 5.079437180226869, "learning_rate": 1.987836312904373e-05, "loss": 1.039, "step": 1229 }, { "epoch": 0.1566454940541573, "grad_norm": 6.124208048789126, "learning_rate": 1.9878042206450515e-05, "loss": 0.9518, "step": 1230 }, { "epoch": 0.15677284811436396, "grad_norm": 4.603658564682635, "learning_rate": 1.9877720863656605e-05, "loss": 0.9797, "step": 1231 }, { "epoch": 0.15690020217457057, "grad_norm": 5.297661376914641, "learning_rate": 1.9877399100675684e-05, "loss": 1.0236, "step": 1232 }, { "epoch": 0.1570275562347772, "grad_norm": 5.3680721259125255, "learning_rate": 1.987707691752143e-05, "loss": 0.9727, "step": 1233 }, { "epoch": 0.15715491029498385, "grad_norm": 5.842960079883927, "learning_rate": 1.987675431420755e-05, "loss": 0.9885, "step": 1234 }, { "epoch": 0.15728226435519047, "grad_norm": 3.570953778995695, "learning_rate": 1.9876431290747766e-05, "loss": 0.9444, "step": 1235 }, { "epoch": 0.1574096184153971, "grad_norm": 4.451910556028384, "learning_rate": 1.987610784715582e-05, "loss": 0.9464, "step": 1236 }, { "epoch": 0.15753697247560375, "grad_norm": 5.669527939617677, "learning_rate": 1.9875783983445473e-05, "loss": 0.9582, "step": 1237 }, { "epoch": 0.15766432653581036, "grad_norm": 5.212123844470004, "learning_rate": 1.9875459699630503e-05, "loss": 0.9536, "step": 1238 }, { "epoch": 0.157791680596017, "grad_norm": 5.681033304276333, "learning_rate": 1.9875134995724697e-05, "loss": 0.9886, "step": 1239 }, { "epoch": 0.15791903465622364, "grad_norm": 4.781488350306441, "learning_rate": 1.9874809871741877e-05, "loss": 1.0747, "step": 1240 }, { "epoch": 0.15804638871643026, "grad_norm": 5.27408922477416, "learning_rate": 1.9874484327695862e-05, "loss": 1.0797, "step": 1241 }, { "epoch": 0.1581737427766369, "grad_norm": 5.110825638930353, "learning_rate": 1.9874158363600513e-05, "loss": 0.9206, "step": 1242 }, { "epoch": 0.15830109683684354, "grad_norm": 5.286355163870387, "learning_rate": 1.9873831979469687e-05, "loss": 0.9544, "step": 1243 }, { "epoch": 0.15842845089705015, "grad_norm": 5.876306266847982, "learning_rate": 1.9873505175317272e-05, "loss": 0.9467, "step": 1244 }, { "epoch": 0.1585558049572568, "grad_norm": 3.8290624599508156, "learning_rate": 1.987317795115717e-05, "loss": 0.99, "step": 1245 }, { "epoch": 0.15868315901746344, "grad_norm": 6.334550171788543, "learning_rate": 1.98728503070033e-05, "loss": 1.0207, "step": 1246 }, { "epoch": 0.15881051307767005, "grad_norm": 5.201014001722949, "learning_rate": 1.9872522242869598e-05, "loss": 1.0032, "step": 1247 }, { "epoch": 0.1589378671378767, "grad_norm": 5.339655506580464, "learning_rate": 1.987219375877002e-05, "loss": 1.0571, "step": 1248 }, { "epoch": 0.15906522119808333, "grad_norm": 5.917438223951082, "learning_rate": 1.9871864854718545e-05, "loss": 0.9298, "step": 1249 }, { "epoch": 0.15919257525828995, "grad_norm": 5.84159509430854, "learning_rate": 1.9871535530729154e-05, "loss": 1.1235, "step": 1250 }, { "epoch": 0.1593199293184966, "grad_norm": 6.647444797729966, "learning_rate": 1.9871205786815865e-05, "loss": 1.0961, "step": 1251 }, { "epoch": 0.15944728337870323, "grad_norm": 4.506879845567531, "learning_rate": 1.9870875622992697e-05, "loss": 1.0051, "step": 1252 }, { "epoch": 0.15957463743890984, "grad_norm": 4.978751044171506, "learning_rate": 1.9870545039273704e-05, "loss": 0.9519, "step": 1253 }, { "epoch": 0.15970199149911649, "grad_norm": 4.884386465067718, "learning_rate": 1.9870214035672945e-05, "loss": 1.0405, "step": 1254 }, { "epoch": 0.1598293455593231, "grad_norm": 6.01301752032264, "learning_rate": 1.9869882612204496e-05, "loss": 0.9768, "step": 1255 }, { "epoch": 0.15995669961952974, "grad_norm": 5.66422076826293, "learning_rate": 1.986955076888246e-05, "loss": 0.8278, "step": 1256 }, { "epoch": 0.16008405367973638, "grad_norm": 6.38059333314379, "learning_rate": 1.986921850572095e-05, "loss": 0.9132, "step": 1257 }, { "epoch": 0.160211407739943, "grad_norm": 5.136288561774867, "learning_rate": 1.9868885822734104e-05, "loss": 1.0093, "step": 1258 }, { "epoch": 0.16033876180014964, "grad_norm": 6.1340803451624435, "learning_rate": 1.986855271993607e-05, "loss": 0.9994, "step": 1259 }, { "epoch": 0.16046611586035628, "grad_norm": 6.610357838234759, "learning_rate": 1.9868219197341024e-05, "loss": 0.8919, "step": 1260 }, { "epoch": 0.1605934699205629, "grad_norm": 5.068832393028328, "learning_rate": 1.9867885254963147e-05, "loss": 0.958, "step": 1261 }, { "epoch": 0.16072082398076953, "grad_norm": 5.098652919809279, "learning_rate": 1.9867550892816646e-05, "loss": 1.0166, "step": 1262 }, { "epoch": 0.16084817804097618, "grad_norm": 5.54546086727413, "learning_rate": 1.9867216110915745e-05, "loss": 0.9558, "step": 1263 }, { "epoch": 0.1609755321011828, "grad_norm": 3.6506732894506726, "learning_rate": 1.986688090927469e-05, "loss": 0.9749, "step": 1264 }, { "epoch": 0.16110288616138943, "grad_norm": 6.630011336603153, "learning_rate": 1.9866545287907732e-05, "loss": 1.087, "step": 1265 }, { "epoch": 0.16123024022159607, "grad_norm": 5.1520144550405025, "learning_rate": 1.9866209246829152e-05, "loss": 1.0564, "step": 1266 }, { "epoch": 0.16135759428180269, "grad_norm": 4.567456623073952, "learning_rate": 1.9865872786053245e-05, "loss": 0.9943, "step": 1267 }, { "epoch": 0.16148494834200933, "grad_norm": 5.429662910241082, "learning_rate": 1.9865535905594326e-05, "loss": 1.0046, "step": 1268 }, { "epoch": 0.16161230240221597, "grad_norm": 4.745719989140955, "learning_rate": 1.986519860546672e-05, "loss": 0.9511, "step": 1269 }, { "epoch": 0.16173965646242258, "grad_norm": 4.907924293524879, "learning_rate": 1.986486088568478e-05, "loss": 0.947, "step": 1270 }, { "epoch": 0.16186701052262922, "grad_norm": 5.93928420715077, "learning_rate": 1.9864522746262867e-05, "loss": 1.0879, "step": 1271 }, { "epoch": 0.16199436458283586, "grad_norm": 4.6692044079111, "learning_rate": 1.986418418721537e-05, "loss": 0.9495, "step": 1272 }, { "epoch": 0.16212171864304248, "grad_norm": 6.435001221651496, "learning_rate": 1.986384520855669e-05, "loss": 0.9653, "step": 1273 }, { "epoch": 0.16224907270324912, "grad_norm": 4.8527476756639745, "learning_rate": 1.9863505810301246e-05, "loss": 1.0149, "step": 1274 }, { "epoch": 0.16237642676345576, "grad_norm": 5.268659764215143, "learning_rate": 1.9863165992463477e-05, "loss": 0.988, "step": 1275 }, { "epoch": 0.16250378082366237, "grad_norm": 6.928202404369275, "learning_rate": 1.986282575505783e-05, "loss": 1.0151, "step": 1276 }, { "epoch": 0.16263113488386902, "grad_norm": 5.708832025208047, "learning_rate": 1.9862485098098796e-05, "loss": 0.986, "step": 1277 }, { "epoch": 0.16275848894407566, "grad_norm": 4.293735314223316, "learning_rate": 1.986214402160085e-05, "loss": 0.9516, "step": 1278 }, { "epoch": 0.16288584300428227, "grad_norm": 5.128487865527736, "learning_rate": 1.9861802525578508e-05, "loss": 0.9776, "step": 1279 }, { "epoch": 0.1630131970644889, "grad_norm": 5.926519580527757, "learning_rate": 1.986146061004629e-05, "loss": 0.9454, "step": 1280 }, { "epoch": 0.16314055112469555, "grad_norm": 4.709791762168808, "learning_rate": 1.9861118275018755e-05, "loss": 1.055, "step": 1281 }, { "epoch": 0.16326790518490217, "grad_norm": 3.9447872001005515, "learning_rate": 1.9860775520510453e-05, "loss": 1.0019, "step": 1282 }, { "epoch": 0.1633952592451088, "grad_norm": 4.657867150816275, "learning_rate": 1.9860432346535966e-05, "loss": 0.9158, "step": 1283 }, { "epoch": 0.16352261330531545, "grad_norm": 6.558586770395333, "learning_rate": 1.9860088753109896e-05, "loss": 1.0146, "step": 1284 }, { "epoch": 0.16364996736552206, "grad_norm": 6.790804131467284, "learning_rate": 1.985974474024686e-05, "loss": 1.1179, "step": 1285 }, { "epoch": 0.1637773214257287, "grad_norm": 4.380667149497051, "learning_rate": 1.9859400307961486e-05, "loss": 1.029, "step": 1286 }, { "epoch": 0.16390467548593535, "grad_norm": 6.457342584561743, "learning_rate": 1.985905545626843e-05, "loss": 1.0326, "step": 1287 }, { "epoch": 0.16403202954614196, "grad_norm": 4.491102535418511, "learning_rate": 1.985871018518236e-05, "loss": 1.0289, "step": 1288 }, { "epoch": 0.1641593836063486, "grad_norm": 5.129222154149322, "learning_rate": 1.9858364494717966e-05, "loss": 1.0228, "step": 1289 }, { "epoch": 0.16428673766655524, "grad_norm": 5.162285434060186, "learning_rate": 1.9858018384889946e-05, "loss": 0.9045, "step": 1290 }, { "epoch": 0.16441409172676186, "grad_norm": 5.6943603054539915, "learning_rate": 1.9857671855713038e-05, "loss": 0.9674, "step": 1291 }, { "epoch": 0.1645414457869685, "grad_norm": 5.6519327284473695, "learning_rate": 1.9857324907201966e-05, "loss": 0.9993, "step": 1292 }, { "epoch": 0.16466879984717514, "grad_norm": 5.529623880960854, "learning_rate": 1.98569775393715e-05, "loss": 0.973, "step": 1293 }, { "epoch": 0.16479615390738175, "grad_norm": 4.634977146985319, "learning_rate": 1.9856629752236413e-05, "loss": 1.0061, "step": 1294 }, { "epoch": 0.1649235079675884, "grad_norm": 5.395211300534669, "learning_rate": 1.9856281545811497e-05, "loss": 0.9469, "step": 1295 }, { "epoch": 0.16505086202779504, "grad_norm": 4.268896392501476, "learning_rate": 1.9855932920111563e-05, "loss": 0.9921, "step": 1296 }, { "epoch": 0.16517821608800165, "grad_norm": 7.711568727329617, "learning_rate": 1.9855583875151453e-05, "loss": 0.9872, "step": 1297 }, { "epoch": 0.1653055701482083, "grad_norm": 5.241036528942064, "learning_rate": 1.9855234410946002e-05, "loss": 0.9546, "step": 1298 }, { "epoch": 0.16543292420841493, "grad_norm": 6.764661337078974, "learning_rate": 1.985488452751008e-05, "loss": 0.9521, "step": 1299 }, { "epoch": 0.16556027826862155, "grad_norm": 6.087105126077417, "learning_rate": 1.9854534224858574e-05, "loss": 1.0039, "step": 1300 }, { "epoch": 0.1656876323288282, "grad_norm": 5.684029327166587, "learning_rate": 1.9854183503006383e-05, "loss": 1.082, "step": 1301 }, { "epoch": 0.1658149863890348, "grad_norm": 4.707341632660841, "learning_rate": 1.9853832361968424e-05, "loss": 0.9413, "step": 1302 }, { "epoch": 0.16594234044924144, "grad_norm": 5.88976947430451, "learning_rate": 1.9853480801759637e-05, "loss": 1.0828, "step": 1303 }, { "epoch": 0.16606969450944808, "grad_norm": 4.565661654367643, "learning_rate": 1.9853128822394976e-05, "loss": 1.0482, "step": 1304 }, { "epoch": 0.1661970485696547, "grad_norm": 4.481935482188907, "learning_rate": 1.9852776423889414e-05, "loss": 1.1119, "step": 1305 }, { "epoch": 0.16632440262986134, "grad_norm": 11.635677561258317, "learning_rate": 1.9852423606257943e-05, "loss": 0.9503, "step": 1306 }, { "epoch": 0.16645175669006798, "grad_norm": 7.3166335722827265, "learning_rate": 1.9852070369515566e-05, "loss": 1.0055, "step": 1307 }, { "epoch": 0.1665791107502746, "grad_norm": 5.1385735125753165, "learning_rate": 1.9851716713677315e-05, "loss": 0.9379, "step": 1308 }, { "epoch": 0.16670646481048124, "grad_norm": 4.744954692274551, "learning_rate": 1.9851362638758236e-05, "loss": 1.0048, "step": 1309 }, { "epoch": 0.16683381887068788, "grad_norm": 4.680360774292839, "learning_rate": 1.9851008144773386e-05, "loss": 0.9329, "step": 1310 }, { "epoch": 0.1669611729308945, "grad_norm": 5.54574530474556, "learning_rate": 1.9850653231737844e-05, "loss": 1.0084, "step": 1311 }, { "epoch": 0.16708852699110113, "grad_norm": 5.629096341017364, "learning_rate": 1.985029789966671e-05, "loss": 0.9647, "step": 1312 }, { "epoch": 0.16721588105130777, "grad_norm": 4.247339201112461, "learning_rate": 1.98499421485751e-05, "loss": 1.018, "step": 1313 }, { "epoch": 0.1673432351115144, "grad_norm": 4.8500695135109835, "learning_rate": 1.984958597847815e-05, "loss": 1.0345, "step": 1314 }, { "epoch": 0.16747058917172103, "grad_norm": 5.242236255129088, "learning_rate": 1.9849229389391e-05, "loss": 1.0124, "step": 1315 }, { "epoch": 0.16759794323192767, "grad_norm": 4.494156115860417, "learning_rate": 1.984887238132883e-05, "loss": 0.9282, "step": 1316 }, { "epoch": 0.16772529729213428, "grad_norm": 5.527577705958565, "learning_rate": 1.9848514954306827e-05, "loss": 0.9766, "step": 1317 }, { "epoch": 0.16785265135234093, "grad_norm": 7.153181059331445, "learning_rate": 1.9848157108340186e-05, "loss": 0.9793, "step": 1318 }, { "epoch": 0.16798000541254757, "grad_norm": 4.274232333675409, "learning_rate": 1.984779884344414e-05, "loss": 0.8939, "step": 1319 }, { "epoch": 0.16810735947275418, "grad_norm": 6.091392647998473, "learning_rate": 1.9847440159633918e-05, "loss": 1.0784, "step": 1320 }, { "epoch": 0.16823471353296082, "grad_norm": 4.3504516717848, "learning_rate": 1.9847081056924788e-05, "loss": 1.0399, "step": 1321 }, { "epoch": 0.16836206759316746, "grad_norm": 5.874738646842187, "learning_rate": 1.984672153533202e-05, "loss": 1.0156, "step": 1322 }, { "epoch": 0.16848942165337408, "grad_norm": 6.273024606003035, "learning_rate": 1.9846361594870914e-05, "loss": 1.0532, "step": 1323 }, { "epoch": 0.16861677571358072, "grad_norm": 9.43738116258996, "learning_rate": 1.9846001235556775e-05, "loss": 0.9758, "step": 1324 }, { "epoch": 0.16874412977378736, "grad_norm": 3.951354463349465, "learning_rate": 1.984564045740493e-05, "loss": 1.0017, "step": 1325 }, { "epoch": 0.16887148383399397, "grad_norm": 6.057988641405209, "learning_rate": 1.984527926043074e-05, "loss": 1.0013, "step": 1326 }, { "epoch": 0.16899883789420062, "grad_norm": 5.121047938510769, "learning_rate": 1.9844917644649553e-05, "loss": 1.0206, "step": 1327 }, { "epoch": 0.16912619195440726, "grad_norm": 6.34730347577412, "learning_rate": 1.984455561007676e-05, "loss": 1.0719, "step": 1328 }, { "epoch": 0.16925354601461387, "grad_norm": 6.9857632411586525, "learning_rate": 1.984419315672776e-05, "loss": 1.1088, "step": 1329 }, { "epoch": 0.1693809000748205, "grad_norm": 4.054283636482987, "learning_rate": 1.9843830284617975e-05, "loss": 0.9645, "step": 1330 }, { "epoch": 0.16950825413502715, "grad_norm": 5.763261181254879, "learning_rate": 1.9843466993762836e-05, "loss": 1.0112, "step": 1331 }, { "epoch": 0.16963560819523377, "grad_norm": 5.006710508504241, "learning_rate": 1.98431032841778e-05, "loss": 1.0243, "step": 1332 }, { "epoch": 0.1697629622554404, "grad_norm": 5.33453967438594, "learning_rate": 1.9842739155878337e-05, "loss": 1.0415, "step": 1333 }, { "epoch": 0.16989031631564705, "grad_norm": 4.239488505334722, "learning_rate": 1.984237460887994e-05, "loss": 1.0452, "step": 1334 }, { "epoch": 0.17001767037585366, "grad_norm": 8.172692340070634, "learning_rate": 1.9842009643198113e-05, "loss": 0.9793, "step": 1335 }, { "epoch": 0.1701450244360603, "grad_norm": 5.34305572461287, "learning_rate": 1.984164425884838e-05, "loss": 1.0048, "step": 1336 }, { "epoch": 0.17027237849626695, "grad_norm": 5.792797865504691, "learning_rate": 1.984127845584629e-05, "loss": 0.9795, "step": 1337 }, { "epoch": 0.17039973255647356, "grad_norm": 4.318874126588256, "learning_rate": 1.9840912234207396e-05, "loss": 0.9736, "step": 1338 }, { "epoch": 0.1705270866166802, "grad_norm": 4.162413651770721, "learning_rate": 1.9840545593947286e-05, "loss": 1.0151, "step": 1339 }, { "epoch": 0.17065444067688684, "grad_norm": 6.5925908086697325, "learning_rate": 1.9840178535081548e-05, "loss": 0.904, "step": 1340 }, { "epoch": 0.17078179473709346, "grad_norm": 5.536949712479812, "learning_rate": 1.98398110576258e-05, "loss": 1.0522, "step": 1341 }, { "epoch": 0.1709091487973001, "grad_norm": 7.280052774258995, "learning_rate": 1.9839443161595668e-05, "loss": 1.0073, "step": 1342 }, { "epoch": 0.17103650285750674, "grad_norm": 5.838656403647462, "learning_rate": 1.9839074847006815e-05, "loss": 0.9766, "step": 1343 }, { "epoch": 0.17116385691771335, "grad_norm": 6.120541876944528, "learning_rate": 1.9838706113874896e-05, "loss": 1.003, "step": 1344 }, { "epoch": 0.17129121097792, "grad_norm": 6.272822405730764, "learning_rate": 1.9838336962215606e-05, "loss": 0.9179, "step": 1345 }, { "epoch": 0.1714185650381266, "grad_norm": 4.571611477394392, "learning_rate": 1.983796739204464e-05, "loss": 1.1381, "step": 1346 }, { "epoch": 0.17154591909833325, "grad_norm": 5.164721379109247, "learning_rate": 1.9837597403377726e-05, "loss": 1.0764, "step": 1347 }, { "epoch": 0.1716732731585399, "grad_norm": 5.382269800120944, "learning_rate": 1.98372269962306e-05, "loss": 1.099, "step": 1348 }, { "epoch": 0.1718006272187465, "grad_norm": 5.6010520628007985, "learning_rate": 1.9836856170619018e-05, "loss": 0.8125, "step": 1349 }, { "epoch": 0.17192798127895315, "grad_norm": 6.5416130825043, "learning_rate": 1.983648492655875e-05, "loss": 0.9736, "step": 1350 }, { "epoch": 0.1720553353391598, "grad_norm": 5.986232857622864, "learning_rate": 1.9836113264065598e-05, "loss": 1.1308, "step": 1351 }, { "epoch": 0.1721826893993664, "grad_norm": 3.721914787944601, "learning_rate": 1.983574118315537e-05, "loss": 0.9768, "step": 1352 }, { "epoch": 0.17231004345957304, "grad_norm": 5.081106208746815, "learning_rate": 1.983536868384389e-05, "loss": 1.0861, "step": 1353 }, { "epoch": 0.17243739751977968, "grad_norm": 4.766787127771113, "learning_rate": 1.9834995766147e-05, "loss": 0.9764, "step": 1354 }, { "epoch": 0.1725647515799863, "grad_norm": 5.844392628957778, "learning_rate": 1.9834622430080574e-05, "loss": 0.9675, "step": 1355 }, { "epoch": 0.17269210564019294, "grad_norm": 4.266773111287161, "learning_rate": 1.9834248675660484e-05, "loss": 0.9577, "step": 1356 }, { "epoch": 0.17281945970039958, "grad_norm": 5.339045366544977, "learning_rate": 1.9833874502902636e-05, "loss": 1.0691, "step": 1357 }, { "epoch": 0.1729468137606062, "grad_norm": 5.321239149867997, "learning_rate": 1.9833499911822944e-05, "loss": 1.02, "step": 1358 }, { "epoch": 0.17307416782081284, "grad_norm": 6.036001912839067, "learning_rate": 1.983312490243734e-05, "loss": 1.0136, "step": 1359 }, { "epoch": 0.17320152188101948, "grad_norm": 5.842316966880855, "learning_rate": 1.9832749474761782e-05, "loss": 0.9487, "step": 1360 }, { "epoch": 0.1733288759412261, "grad_norm": 5.268133726086745, "learning_rate": 1.9832373628812235e-05, "loss": 0.897, "step": 1361 }, { "epoch": 0.17345623000143273, "grad_norm": 10.765103043746514, "learning_rate": 1.9831997364604693e-05, "loss": 1.057, "step": 1362 }, { "epoch": 0.17358358406163937, "grad_norm": 6.986459988359291, "learning_rate": 1.983162068215515e-05, "loss": 0.984, "step": 1363 }, { "epoch": 0.173710938121846, "grad_norm": 6.54607260801127, "learning_rate": 1.9831243581479643e-05, "loss": 1.1114, "step": 1364 }, { "epoch": 0.17383829218205263, "grad_norm": 4.736502694914325, "learning_rate": 1.983086606259421e-05, "loss": 0.9659, "step": 1365 }, { "epoch": 0.17396564624225927, "grad_norm": 5.3548235459378, "learning_rate": 1.9830488125514907e-05, "loss": 0.9164, "step": 1366 }, { "epoch": 0.17409300030246588, "grad_norm": 5.012251530527317, "learning_rate": 1.983010977025781e-05, "loss": 1.0724, "step": 1367 }, { "epoch": 0.17422035436267252, "grad_norm": 4.104810144678136, "learning_rate": 1.982973099683902e-05, "loss": 1.0126, "step": 1368 }, { "epoch": 0.17434770842287917, "grad_norm": 4.606415189475125, "learning_rate": 1.9829351805274643e-05, "loss": 0.9828, "step": 1369 }, { "epoch": 0.17447506248308578, "grad_norm": 4.9605551264425625, "learning_rate": 1.9828972195580815e-05, "loss": 1.0372, "step": 1370 }, { "epoch": 0.17460241654329242, "grad_norm": 6.515749118272252, "learning_rate": 1.9828592167773676e-05, "loss": 0.9049, "step": 1371 }, { "epoch": 0.17472977060349906, "grad_norm": 5.611944008796001, "learning_rate": 1.9828211721869404e-05, "loss": 1.0385, "step": 1372 }, { "epoch": 0.17485712466370568, "grad_norm": 4.296467265811125, "learning_rate": 1.9827830857884173e-05, "loss": 1.0467, "step": 1373 }, { "epoch": 0.17498447872391232, "grad_norm": 5.582847839807333, "learning_rate": 1.9827449575834187e-05, "loss": 0.9349, "step": 1374 }, { "epoch": 0.17511183278411896, "grad_norm": 5.693181480389708, "learning_rate": 1.9827067875735667e-05, "loss": 0.8793, "step": 1375 }, { "epoch": 0.17523918684432557, "grad_norm": 5.261651325431034, "learning_rate": 1.982668575760485e-05, "loss": 0.9581, "step": 1376 }, { "epoch": 0.17536654090453221, "grad_norm": 4.977291802306023, "learning_rate": 1.982630322145799e-05, "loss": 0.9355, "step": 1377 }, { "epoch": 0.17549389496473886, "grad_norm": 8.424306090442627, "learning_rate": 1.9825920267311358e-05, "loss": 1.0181, "step": 1378 }, { "epoch": 0.17562124902494547, "grad_norm": 4.067377536948859, "learning_rate": 1.9825536895181245e-05, "loss": 0.9847, "step": 1379 }, { "epoch": 0.1757486030851521, "grad_norm": 5.434342969610268, "learning_rate": 1.982515310508396e-05, "loss": 1.0621, "step": 1380 }, { "epoch": 0.17587595714535875, "grad_norm": 3.951322725200578, "learning_rate": 1.9824768897035833e-05, "loss": 0.9415, "step": 1381 }, { "epoch": 0.17600331120556537, "grad_norm": 4.20572669754507, "learning_rate": 1.98243842710532e-05, "loss": 0.9017, "step": 1382 }, { "epoch": 0.176130665265772, "grad_norm": 5.642519645876438, "learning_rate": 1.9823999227152426e-05, "loss": 0.9919, "step": 1383 }, { "epoch": 0.17625801932597865, "grad_norm": 5.8060276543263845, "learning_rate": 1.9823613765349894e-05, "loss": 0.9357, "step": 1384 }, { "epoch": 0.17638537338618526, "grad_norm": 6.730220495639194, "learning_rate": 1.9823227885661994e-05, "loss": 1.0167, "step": 1385 }, { "epoch": 0.1765127274463919, "grad_norm": 4.98295281189637, "learning_rate": 1.982284158810515e-05, "loss": 0.9644, "step": 1386 }, { "epoch": 0.17664008150659855, "grad_norm": 5.611349539312602, "learning_rate": 1.982245487269579e-05, "loss": 0.9759, "step": 1387 }, { "epoch": 0.17676743556680516, "grad_norm": 5.830986568146951, "learning_rate": 1.982206773945036e-05, "loss": 0.9505, "step": 1388 }, { "epoch": 0.1768947896270118, "grad_norm": 4.7729098313991845, "learning_rate": 1.9821680188385334e-05, "loss": 1.0816, "step": 1389 }, { "epoch": 0.17702214368721844, "grad_norm": 4.199991298848176, "learning_rate": 1.982129221951719e-05, "loss": 1.1747, "step": 1390 }, { "epoch": 0.17714949774742506, "grad_norm": 5.562743963827968, "learning_rate": 1.9820903832862445e-05, "loss": 1.0474, "step": 1391 }, { "epoch": 0.1772768518076317, "grad_norm": 6.610761492520081, "learning_rate": 1.9820515028437612e-05, "loss": 1.0511, "step": 1392 }, { "epoch": 0.1774042058678383, "grad_norm": 4.700877720465981, "learning_rate": 1.9820125806259233e-05, "loss": 1.034, "step": 1393 }, { "epoch": 0.17753155992804495, "grad_norm": 5.507829641761889, "learning_rate": 1.981973616634386e-05, "loss": 0.9607, "step": 1394 }, { "epoch": 0.1776589139882516, "grad_norm": 4.55152367032817, "learning_rate": 1.9819346108708074e-05, "loss": 1.0074, "step": 1395 }, { "epoch": 0.1777862680484582, "grad_norm": 5.155387667593372, "learning_rate": 1.9818955633368464e-05, "loss": 0.9597, "step": 1396 }, { "epoch": 0.17791362210866485, "grad_norm": 5.73277489064217, "learning_rate": 1.981856474034164e-05, "loss": 1.0927, "step": 1397 }, { "epoch": 0.1780409761688715, "grad_norm": 4.071601648882491, "learning_rate": 1.9818173429644237e-05, "loss": 0.9982, "step": 1398 }, { "epoch": 0.1781683302290781, "grad_norm": 3.9397300581268513, "learning_rate": 1.9817781701292892e-05, "loss": 0.9403, "step": 1399 }, { "epoch": 0.17829568428928474, "grad_norm": 4.89777375854755, "learning_rate": 1.9817389555304274e-05, "loss": 0.9848, "step": 1400 }, { "epoch": 0.1784230383494914, "grad_norm": 5.384909433154051, "learning_rate": 1.9816996991695057e-05, "loss": 1.0004, "step": 1401 }, { "epoch": 0.178550392409698, "grad_norm": 6.171274817534474, "learning_rate": 1.9816604010481955e-05, "loss": 0.9742, "step": 1402 }, { "epoch": 0.17867774646990464, "grad_norm": 5.372669158161206, "learning_rate": 1.981621061168167e-05, "loss": 1.0258, "step": 1403 }, { "epoch": 0.17880510053011128, "grad_norm": 5.42572394959394, "learning_rate": 1.9815816795310945e-05, "loss": 0.8953, "step": 1404 }, { "epoch": 0.1789324545903179, "grad_norm": 4.888856025546814, "learning_rate": 1.981542256138653e-05, "loss": 0.9233, "step": 1405 }, { "epoch": 0.17905980865052454, "grad_norm": 5.561367305342423, "learning_rate": 1.9815027909925194e-05, "loss": 0.9601, "step": 1406 }, { "epoch": 0.17918716271073118, "grad_norm": 4.272674659833029, "learning_rate": 1.9814632840943728e-05, "loss": 1.0174, "step": 1407 }, { "epoch": 0.1793145167709378, "grad_norm": 4.094915326073295, "learning_rate": 1.9814237354458937e-05, "loss": 0.9695, "step": 1408 }, { "epoch": 0.17944187083114443, "grad_norm": 4.905545323784712, "learning_rate": 1.981384145048764e-05, "loss": 0.929, "step": 1409 }, { "epoch": 0.17956922489135108, "grad_norm": 4.367058003097394, "learning_rate": 1.9813445129046685e-05, "loss": 0.9274, "step": 1410 }, { "epoch": 0.1796965789515577, "grad_norm": 6.3573246680356394, "learning_rate": 1.9813048390152926e-05, "loss": 1.0155, "step": 1411 }, { "epoch": 0.17982393301176433, "grad_norm": 6.87006575640397, "learning_rate": 1.9812651233823245e-05, "loss": 0.9062, "step": 1412 }, { "epoch": 0.17995128707197097, "grad_norm": 4.697870202127253, "learning_rate": 1.9812253660074532e-05, "loss": 1.0905, "step": 1413 }, { "epoch": 0.18007864113217759, "grad_norm": 5.959781639656172, "learning_rate": 1.98118556689237e-05, "loss": 1.119, "step": 1414 }, { "epoch": 0.18020599519238423, "grad_norm": 4.111138375223385, "learning_rate": 1.9811457260387683e-05, "loss": 0.923, "step": 1415 }, { "epoch": 0.18033334925259087, "grad_norm": 4.006177185634729, "learning_rate": 1.9811058434483422e-05, "loss": 1.0849, "step": 1416 }, { "epoch": 0.18046070331279748, "grad_norm": 4.48631303225857, "learning_rate": 1.981065919122789e-05, "loss": 1.0728, "step": 1417 }, { "epoch": 0.18058805737300412, "grad_norm": 6.525586168407294, "learning_rate": 1.9810259530638064e-05, "loss": 1.0408, "step": 1418 }, { "epoch": 0.18071541143321077, "grad_norm": 6.020659799505925, "learning_rate": 1.980985945273095e-05, "loss": 1.0975, "step": 1419 }, { "epoch": 0.18084276549341738, "grad_norm": 6.885051800346678, "learning_rate": 1.9809458957523563e-05, "loss": 0.9118, "step": 1420 }, { "epoch": 0.18097011955362402, "grad_norm": 3.859705119358011, "learning_rate": 1.9809058045032942e-05, "loss": 1.0455, "step": 1421 }, { "epoch": 0.18109747361383066, "grad_norm": 5.034443854459602, "learning_rate": 1.980865671527614e-05, "loss": 0.9053, "step": 1422 }, { "epoch": 0.18122482767403728, "grad_norm": 4.970510494794013, "learning_rate": 1.9808254968270236e-05, "loss": 1.0684, "step": 1423 }, { "epoch": 0.18135218173424392, "grad_norm": 6.993515553435381, "learning_rate": 1.9807852804032306e-05, "loss": 1.0819, "step": 1424 }, { "epoch": 0.18147953579445056, "grad_norm": 4.3244256139826405, "learning_rate": 1.980745022257947e-05, "loss": 1.0832, "step": 1425 }, { "epoch": 0.18160688985465717, "grad_norm": 5.41282386427537, "learning_rate": 1.9807047223928847e-05, "loss": 1.0109, "step": 1426 }, { "epoch": 0.1817342439148638, "grad_norm": 5.5849581199995075, "learning_rate": 1.980664380809758e-05, "loss": 0.9701, "step": 1427 }, { "epoch": 0.18186159797507045, "grad_norm": 7.5150214763849705, "learning_rate": 1.980623997510283e-05, "loss": 1.1214, "step": 1428 }, { "epoch": 0.18198895203527707, "grad_norm": 5.378983019888339, "learning_rate": 1.9805835724961783e-05, "loss": 1.0311, "step": 1429 }, { "epoch": 0.1821163060954837, "grad_norm": 5.328501578975879, "learning_rate": 1.9805431057691627e-05, "loss": 1.089, "step": 1430 }, { "epoch": 0.18224366015569035, "grad_norm": 4.827672449525467, "learning_rate": 1.9805025973309577e-05, "loss": 0.9638, "step": 1431 }, { "epoch": 0.18237101421589696, "grad_norm": 4.960158495677945, "learning_rate": 1.980462047183287e-05, "loss": 0.994, "step": 1432 }, { "epoch": 0.1824983682761036, "grad_norm": 4.417898594090962, "learning_rate": 1.9804214553278747e-05, "loss": 0.9635, "step": 1433 }, { "epoch": 0.18262572233631025, "grad_norm": 4.3511745752527755, "learning_rate": 1.9803808217664483e-05, "loss": 0.9231, "step": 1434 }, { "epoch": 0.18275307639651686, "grad_norm": 7.574369841195433, "learning_rate": 1.9803401465007363e-05, "loss": 0.9365, "step": 1435 }, { "epoch": 0.1828804304567235, "grad_norm": 4.817720034473403, "learning_rate": 1.9802994295324685e-05, "loss": 0.9657, "step": 1436 }, { "epoch": 0.18300778451693012, "grad_norm": 3.9337316979442223, "learning_rate": 1.980258670863377e-05, "loss": 1.0757, "step": 1437 }, { "epoch": 0.18313513857713676, "grad_norm": 4.912953650419165, "learning_rate": 1.980217870495196e-05, "loss": 0.8668, "step": 1438 }, { "epoch": 0.1832624926373434, "grad_norm": 6.3601703568201255, "learning_rate": 1.980177028429661e-05, "loss": 1.0216, "step": 1439 }, { "epoch": 0.18338984669755, "grad_norm": 4.361051722376627, "learning_rate": 1.980136144668509e-05, "loss": 1.0007, "step": 1440 }, { "epoch": 0.18351720075775665, "grad_norm": 4.831064656441857, "learning_rate": 1.98009521921348e-05, "loss": 1.0569, "step": 1441 }, { "epoch": 0.1836445548179633, "grad_norm": 4.673866077337553, "learning_rate": 1.9800542520663136e-05, "loss": 1.1306, "step": 1442 }, { "epoch": 0.1837719088781699, "grad_norm": 5.025317562564007, "learning_rate": 1.980013243228754e-05, "loss": 1.0482, "step": 1443 }, { "epoch": 0.18389926293837655, "grad_norm": 4.917699005025452, "learning_rate": 1.979972192702544e-05, "loss": 0.9926, "step": 1444 }, { "epoch": 0.1840266169985832, "grad_norm": 6.695090006422005, "learning_rate": 1.9799311004894314e-05, "loss": 1.0492, "step": 1445 }, { "epoch": 0.1841539710587898, "grad_norm": 5.783826429176605, "learning_rate": 1.9798899665911636e-05, "loss": 0.9291, "step": 1446 }, { "epoch": 0.18428132511899645, "grad_norm": 8.583389528948576, "learning_rate": 1.97984879100949e-05, "loss": 1.0729, "step": 1447 }, { "epoch": 0.1844086791792031, "grad_norm": 4.9845657551297196, "learning_rate": 1.9798075737461627e-05, "loss": 1.081, "step": 1448 }, { "epoch": 0.1845360332394097, "grad_norm": 4.630384249468881, "learning_rate": 1.9797663148029352e-05, "loss": 0.9889, "step": 1449 }, { "epoch": 0.18466338729961634, "grad_norm": 4.459600725990378, "learning_rate": 1.9797250141815617e-05, "loss": 1.0447, "step": 1450 }, { "epoch": 0.18479074135982299, "grad_norm": 7.9994144225384165, "learning_rate": 1.9796836718838e-05, "loss": 1.0084, "step": 1451 }, { "epoch": 0.1849180954200296, "grad_norm": 5.037288099049157, "learning_rate": 1.9796422879114082e-05, "loss": 0.9369, "step": 1452 }, { "epoch": 0.18504544948023624, "grad_norm": 5.12163116638082, "learning_rate": 1.9796008622661472e-05, "loss": 1.0629, "step": 1453 }, { "epoch": 0.18517280354044288, "grad_norm": 4.711496399073382, "learning_rate": 1.9795593949497786e-05, "loss": 1.0271, "step": 1454 }, { "epoch": 0.1853001576006495, "grad_norm": 6.076948429563655, "learning_rate": 1.979517885964067e-05, "loss": 0.9624, "step": 1455 }, { "epoch": 0.18542751166085614, "grad_norm": 4.85950570375367, "learning_rate": 1.979476335310778e-05, "loss": 1.0016, "step": 1456 }, { "epoch": 0.18555486572106278, "grad_norm": 6.515755558306762, "learning_rate": 1.9794347429916786e-05, "loss": 0.9468, "step": 1457 }, { "epoch": 0.1856822197812694, "grad_norm": 4.261883670402554, "learning_rate": 1.9793931090085385e-05, "loss": 0.9449, "step": 1458 }, { "epoch": 0.18580957384147603, "grad_norm": 5.087873755370306, "learning_rate": 1.9793514333631287e-05, "loss": 1.0691, "step": 1459 }, { "epoch": 0.18593692790168267, "grad_norm": 6.254327115367058, "learning_rate": 1.9793097160572223e-05, "loss": 1.0433, "step": 1460 }, { "epoch": 0.1860642819618893, "grad_norm": 6.42593203660469, "learning_rate": 1.9792679570925933e-05, "loss": 1.0038, "step": 1461 }, { "epoch": 0.18619163602209593, "grad_norm": 4.263729582608785, "learning_rate": 1.9792261564710188e-05, "loss": 1.041, "step": 1462 }, { "epoch": 0.18631899008230257, "grad_norm": 4.486672480798122, "learning_rate": 1.9791843141942763e-05, "loss": 1.0097, "step": 1463 }, { "epoch": 0.18644634414250918, "grad_norm": 8.109797883813664, "learning_rate": 1.979142430264146e-05, "loss": 0.9008, "step": 1464 }, { "epoch": 0.18657369820271583, "grad_norm": 5.566391858534027, "learning_rate": 1.97910050468241e-05, "loss": 0.9353, "step": 1465 }, { "epoch": 0.18670105226292247, "grad_norm": 5.093553410580432, "learning_rate": 1.979058537450851e-05, "loss": 0.9952, "step": 1466 }, { "epoch": 0.18682840632312908, "grad_norm": 6.860038429797663, "learning_rate": 1.979016528571255e-05, "loss": 0.9609, "step": 1467 }, { "epoch": 0.18695576038333572, "grad_norm": 6.188720582735326, "learning_rate": 1.9789744780454082e-05, "loss": 1.0192, "step": 1468 }, { "epoch": 0.18708311444354236, "grad_norm": 5.187779614369765, "learning_rate": 1.9789323858751e-05, "loss": 1.1023, "step": 1469 }, { "epoch": 0.18721046850374898, "grad_norm": 5.060137161989241, "learning_rate": 1.978890252062121e-05, "loss": 1.0415, "step": 1470 }, { "epoch": 0.18733782256395562, "grad_norm": 4.245005646287378, "learning_rate": 1.9788480766082626e-05, "loss": 1.0162, "step": 1471 }, { "epoch": 0.18746517662416226, "grad_norm": 5.212867024991071, "learning_rate": 1.9788058595153202e-05, "loss": 0.8992, "step": 1472 }, { "epoch": 0.18759253068436887, "grad_norm": 3.5576216105774727, "learning_rate": 1.978763600785089e-05, "loss": 0.9837, "step": 1473 }, { "epoch": 0.18771988474457552, "grad_norm": 4.995126447174946, "learning_rate": 1.9787213004193665e-05, "loss": 0.967, "step": 1474 }, { "epoch": 0.18784723880478216, "grad_norm": 5.481986109719116, "learning_rate": 1.9786789584199523e-05, "loss": 0.9039, "step": 1475 }, { "epoch": 0.18797459286498877, "grad_norm": 5.599221052763061, "learning_rate": 1.9786365747886475e-05, "loss": 1.0468, "step": 1476 }, { "epoch": 0.1881019469251954, "grad_norm": 4.9890705819427374, "learning_rate": 1.9785941495272553e-05, "loss": 1.0024, "step": 1477 }, { "epoch": 0.18822930098540205, "grad_norm": 6.845984246635087, "learning_rate": 1.9785516826375805e-05, "loss": 0.9939, "step": 1478 }, { "epoch": 0.18835665504560867, "grad_norm": 4.7652065406059565, "learning_rate": 1.978509174121429e-05, "loss": 0.946, "step": 1479 }, { "epoch": 0.1884840091058153, "grad_norm": 7.010725251690263, "learning_rate": 1.978466623980609e-05, "loss": 1.0261, "step": 1480 }, { "epoch": 0.18861136316602195, "grad_norm": 5.342474712763604, "learning_rate": 1.9784240322169316e-05, "loss": 1.0171, "step": 1481 }, { "epoch": 0.18873871722622856, "grad_norm": 6.2207481969521, "learning_rate": 1.9783813988322076e-05, "loss": 1.0655, "step": 1482 }, { "epoch": 0.1888660712864352, "grad_norm": 6.051037206093783, "learning_rate": 1.9783387238282513e-05, "loss": 0.9079, "step": 1483 }, { "epoch": 0.18899342534664182, "grad_norm": 5.941004953369172, "learning_rate": 1.9782960072068772e-05, "loss": 1.0933, "step": 1484 }, { "epoch": 0.18912077940684846, "grad_norm": 3.9146807214716772, "learning_rate": 1.978253248969903e-05, "loss": 1.0632, "step": 1485 }, { "epoch": 0.1892481334670551, "grad_norm": 5.4431079577859105, "learning_rate": 1.9782104491191475e-05, "loss": 0.9911, "step": 1486 }, { "epoch": 0.18937548752726172, "grad_norm": 4.052988037995278, "learning_rate": 1.9781676076564316e-05, "loss": 1.0377, "step": 1487 }, { "epoch": 0.18950284158746836, "grad_norm": 5.62963947124253, "learning_rate": 1.978124724583577e-05, "loss": 0.9062, "step": 1488 }, { "epoch": 0.189630195647675, "grad_norm": 5.138572399027655, "learning_rate": 1.978081799902409e-05, "loss": 0.9558, "step": 1489 }, { "epoch": 0.1897575497078816, "grad_norm": 4.889826407754536, "learning_rate": 1.9780388336147524e-05, "loss": 1.0003, "step": 1490 }, { "epoch": 0.18988490376808825, "grad_norm": 7.62897875429576, "learning_rate": 1.9779958257224355e-05, "loss": 1.0335, "step": 1491 }, { "epoch": 0.1900122578282949, "grad_norm": 6.335914922220099, "learning_rate": 1.9779527762272877e-05, "loss": 1.0124, "step": 1492 }, { "epoch": 0.1901396118885015, "grad_norm": 4.470530335373219, "learning_rate": 1.9779096851311406e-05, "loss": 1.093, "step": 1493 }, { "epoch": 0.19026696594870815, "grad_norm": 8.02603348111266, "learning_rate": 1.977866552435827e-05, "loss": 1.0535, "step": 1494 }, { "epoch": 0.1903943200089148, "grad_norm": 4.870751901973264, "learning_rate": 1.9778233781431814e-05, "loss": 1.0418, "step": 1495 }, { "epoch": 0.1905216740691214, "grad_norm": 4.832118681946968, "learning_rate": 1.977780162255041e-05, "loss": 1.0292, "step": 1496 }, { "epoch": 0.19064902812932805, "grad_norm": 5.930854692540798, "learning_rate": 1.9777369047732438e-05, "loss": 0.9462, "step": 1497 }, { "epoch": 0.1907763821895347, "grad_norm": 5.495050110245704, "learning_rate": 1.9776936056996297e-05, "loss": 0.8484, "step": 1498 }, { "epoch": 0.1909037362497413, "grad_norm": 5.508620355758505, "learning_rate": 1.977650265036041e-05, "loss": 1.0204, "step": 1499 }, { "epoch": 0.19103109030994794, "grad_norm": 5.195661444167357, "learning_rate": 1.9776068827843214e-05, "loss": 0.9216, "step": 1500 }, { "epoch": 0.19115844437015458, "grad_norm": 7.5291732520655374, "learning_rate": 1.9775634589463158e-05, "loss": 0.9515, "step": 1501 }, { "epoch": 0.1912857984303612, "grad_norm": 3.888663536687444, "learning_rate": 1.977519993523872e-05, "loss": 0.9452, "step": 1502 }, { "epoch": 0.19141315249056784, "grad_norm": 5.3393483019284895, "learning_rate": 1.9774764865188388e-05, "loss": 0.9975, "step": 1503 }, { "epoch": 0.19154050655077448, "grad_norm": 7.013216892528038, "learning_rate": 1.977432937933067e-05, "loss": 0.9489, "step": 1504 }, { "epoch": 0.1916678606109811, "grad_norm": 4.091945570836384, "learning_rate": 1.9773893477684086e-05, "loss": 1.0263, "step": 1505 }, { "epoch": 0.19179521467118774, "grad_norm": 6.133647497396183, "learning_rate": 1.977345716026718e-05, "loss": 0.9503, "step": 1506 }, { "epoch": 0.19192256873139438, "grad_norm": 6.061068965688803, "learning_rate": 1.9773020427098522e-05, "loss": 0.996, "step": 1507 }, { "epoch": 0.192049922791601, "grad_norm": 5.329895262413773, "learning_rate": 1.9772583278196677e-05, "loss": 0.988, "step": 1508 }, { "epoch": 0.19217727685180763, "grad_norm": 6.2580468828024545, "learning_rate": 1.977214571358025e-05, "loss": 0.9768, "step": 1509 }, { "epoch": 0.19230463091201427, "grad_norm": 5.9261243566210045, "learning_rate": 1.9771707733267852e-05, "loss": 0.9682, "step": 1510 }, { "epoch": 0.1924319849722209, "grad_norm": 5.140494208715624, "learning_rate": 1.977126933727811e-05, "loss": 1.08, "step": 1511 }, { "epoch": 0.19255933903242753, "grad_norm": 6.7300714252643825, "learning_rate": 1.977083052562968e-05, "loss": 1.11, "step": 1512 }, { "epoch": 0.19268669309263417, "grad_norm": 5.585036667965532, "learning_rate": 1.977039129834122e-05, "loss": 1.1374, "step": 1513 }, { "epoch": 0.19281404715284078, "grad_norm": 4.049696480009359, "learning_rate": 1.9769951655431426e-05, "loss": 0.9615, "step": 1514 }, { "epoch": 0.19294140121304743, "grad_norm": 5.976439949723937, "learning_rate": 1.976951159691899e-05, "loss": 1.0128, "step": 1515 }, { "epoch": 0.19306875527325407, "grad_norm": 4.616711174007855, "learning_rate": 1.9769071122822634e-05, "loss": 1.0134, "step": 1516 }, { "epoch": 0.19319610933346068, "grad_norm": 6.802833628690229, "learning_rate": 1.97686302331611e-05, "loss": 0.8562, "step": 1517 }, { "epoch": 0.19332346339366732, "grad_norm": 4.702031512207982, "learning_rate": 1.9768188927953134e-05, "loss": 1.0063, "step": 1518 }, { "epoch": 0.19345081745387396, "grad_norm": 4.7894770830185385, "learning_rate": 1.9767747207217516e-05, "loss": 0.9469, "step": 1519 }, { "epoch": 0.19357817151408058, "grad_norm": 5.886225263924752, "learning_rate": 1.9767305070973033e-05, "loss": 1.0466, "step": 1520 }, { "epoch": 0.19370552557428722, "grad_norm": 4.56563617096565, "learning_rate": 1.9766862519238493e-05, "loss": 1.0864, "step": 1521 }, { "epoch": 0.19383287963449386, "grad_norm": 5.274761480208912, "learning_rate": 1.9766419552032723e-05, "loss": 1.0344, "step": 1522 }, { "epoch": 0.19396023369470047, "grad_norm": 6.4716598673917485, "learning_rate": 1.9765976169374566e-05, "loss": 1.0106, "step": 1523 }, { "epoch": 0.19408758775490711, "grad_norm": 6.49377231098414, "learning_rate": 1.9765532371282882e-05, "loss": 1.0763, "step": 1524 }, { "epoch": 0.19421494181511376, "grad_norm": 5.465689014999976, "learning_rate": 1.976508815777655e-05, "loss": 0.9861, "step": 1525 }, { "epoch": 0.19434229587532037, "grad_norm": 5.770104464373687, "learning_rate": 1.976464352887447e-05, "loss": 1.0394, "step": 1526 }, { "epoch": 0.194469649935527, "grad_norm": 7.618250626931756, "learning_rate": 1.9764198484595553e-05, "loss": 1.0921, "step": 1527 }, { "epoch": 0.19459700399573365, "grad_norm": 4.240896460573713, "learning_rate": 1.9763753024958724e-05, "loss": 0.9663, "step": 1528 }, { "epoch": 0.19472435805594027, "grad_norm": 4.724200807830366, "learning_rate": 1.9763307149982945e-05, "loss": 1.1107, "step": 1529 }, { "epoch": 0.1948517121161469, "grad_norm": 6.047163924379785, "learning_rate": 1.9762860859687173e-05, "loss": 0.9791, "step": 1530 }, { "epoch": 0.19497906617635352, "grad_norm": 5.820781427733166, "learning_rate": 1.9762414154090398e-05, "loss": 0.9669, "step": 1531 }, { "epoch": 0.19510642023656016, "grad_norm": 6.023807657745205, "learning_rate": 1.976196703321162e-05, "loss": 0.953, "step": 1532 }, { "epoch": 0.1952337742967668, "grad_norm": 6.153987255679557, "learning_rate": 1.9761519497069863e-05, "loss": 0.9781, "step": 1533 }, { "epoch": 0.19536112835697342, "grad_norm": 6.374491054190222, "learning_rate": 1.9761071545684158e-05, "loss": 0.9647, "step": 1534 }, { "epoch": 0.19548848241718006, "grad_norm": 4.914819316472806, "learning_rate": 1.9760623179073568e-05, "loss": 1.0278, "step": 1535 }, { "epoch": 0.1956158364773867, "grad_norm": 5.046922051650349, "learning_rate": 1.9760174397257158e-05, "loss": 0.9767, "step": 1536 }, { "epoch": 0.19574319053759331, "grad_norm": 6.035067442206626, "learning_rate": 1.9759725200254027e-05, "loss": 0.9667, "step": 1537 }, { "epoch": 0.19587054459779996, "grad_norm": 7.227809225227867, "learning_rate": 1.9759275588083273e-05, "loss": 0.8844, "step": 1538 }, { "epoch": 0.1959978986580066, "grad_norm": 5.3397633812304175, "learning_rate": 1.975882556076403e-05, "loss": 0.88, "step": 1539 }, { "epoch": 0.1961252527182132, "grad_norm": 5.160049056182221, "learning_rate": 1.975837511831544e-05, "loss": 0.9729, "step": 1540 }, { "epoch": 0.19625260677841985, "grad_norm": 5.5355201230080056, "learning_rate": 1.9757924260756665e-05, "loss": 0.983, "step": 1541 }, { "epoch": 0.1963799608386265, "grad_norm": 5.863808218740776, "learning_rate": 1.975747298810688e-05, "loss": 0.9425, "step": 1542 }, { "epoch": 0.1965073148988331, "grad_norm": 4.844791423453396, "learning_rate": 1.9757021300385288e-05, "loss": 0.9033, "step": 1543 }, { "epoch": 0.19663466895903975, "grad_norm": 5.3108868113088405, "learning_rate": 1.9756569197611098e-05, "loss": 1.0034, "step": 1544 }, { "epoch": 0.1967620230192464, "grad_norm": 5.669919856613489, "learning_rate": 1.975611667980354e-05, "loss": 1.0133, "step": 1545 }, { "epoch": 0.196889377079453, "grad_norm": 5.563726225720538, "learning_rate": 1.9755663746981873e-05, "loss": 1.0517, "step": 1546 }, { "epoch": 0.19701673113965965, "grad_norm": 4.65256759143498, "learning_rate": 1.9755210399165358e-05, "loss": 0.9638, "step": 1547 }, { "epoch": 0.1971440851998663, "grad_norm": 5.692109304765883, "learning_rate": 1.9754756636373277e-05, "loss": 1.0341, "step": 1548 }, { "epoch": 0.1972714392600729, "grad_norm": 5.938750125384786, "learning_rate": 1.975430245862494e-05, "loss": 1.0953, "step": 1549 }, { "epoch": 0.19739879332027954, "grad_norm": 4.856150646003446, "learning_rate": 1.9753847865939657e-05, "loss": 0.9406, "step": 1550 }, { "epoch": 0.19752614738048618, "grad_norm": 4.081784765659373, "learning_rate": 1.9753392858336776e-05, "loss": 1.0257, "step": 1551 }, { "epoch": 0.1976535014406928, "grad_norm": 5.313787865208813, "learning_rate": 1.975293743583565e-05, "loss": 1.0489, "step": 1552 }, { "epoch": 0.19778085550089944, "grad_norm": 5.256694158959123, "learning_rate": 1.9752481598455648e-05, "loss": 1.0289, "step": 1553 }, { "epoch": 0.19790820956110608, "grad_norm": 3.9844003713959366, "learning_rate": 1.9752025346216164e-05, "loss": 1.0425, "step": 1554 }, { "epoch": 0.1980355636213127, "grad_norm": 6.3238206329291895, "learning_rate": 1.975156867913661e-05, "loss": 0.9651, "step": 1555 }, { "epoch": 0.19816291768151933, "grad_norm": 4.594948125801704, "learning_rate": 1.97511115972364e-05, "loss": 1.0036, "step": 1556 }, { "epoch": 0.19829027174172598, "grad_norm": 5.107193694114138, "learning_rate": 1.9750654100534992e-05, "loss": 1.0419, "step": 1557 }, { "epoch": 0.1984176258019326, "grad_norm": 5.134129835650002, "learning_rate": 1.9750196189051837e-05, "loss": 0.9756, "step": 1558 }, { "epoch": 0.19854497986213923, "grad_norm": 4.923212553870282, "learning_rate": 1.974973786280642e-05, "loss": 0.9265, "step": 1559 }, { "epoch": 0.19867233392234587, "grad_norm": 3.993883702005666, "learning_rate": 1.9749279121818235e-05, "loss": 0.9821, "step": 1560 }, { "epoch": 0.1987996879825525, "grad_norm": 5.936873553507101, "learning_rate": 1.97488199661068e-05, "loss": 1.1268, "step": 1561 }, { "epoch": 0.19892704204275913, "grad_norm": 5.895892714705748, "learning_rate": 1.9748360395691644e-05, "loss": 1.0248, "step": 1562 }, { "epoch": 0.19905439610296577, "grad_norm": 6.590647469234013, "learning_rate": 1.9747900410592314e-05, "loss": 0.8957, "step": 1563 }, { "epoch": 0.19918175016317238, "grad_norm": 4.850838673233757, "learning_rate": 1.9747440010828384e-05, "loss": 0.9782, "step": 1564 }, { "epoch": 0.19930910422337902, "grad_norm": 4.491532730813909, "learning_rate": 1.974697919641943e-05, "loss": 1.0488, "step": 1565 }, { "epoch": 0.19943645828358567, "grad_norm": 6.2761674501680424, "learning_rate": 1.974651796738506e-05, "loss": 0.9976, "step": 1566 }, { "epoch": 0.19956381234379228, "grad_norm": 3.8229999723721324, "learning_rate": 1.9746056323744897e-05, "loss": 1.0592, "step": 1567 }, { "epoch": 0.19969116640399892, "grad_norm": 6.720524043585452, "learning_rate": 1.9745594265518574e-05, "loss": 0.9593, "step": 1568 }, { "epoch": 0.19981852046420556, "grad_norm": 4.296872336646902, "learning_rate": 1.9745131792725748e-05, "loss": 0.9543, "step": 1569 }, { "epoch": 0.19994587452441218, "grad_norm": 3.8372903524284334, "learning_rate": 1.974466890538609e-05, "loss": 0.9131, "step": 1570 }, { "epoch": 0.20007322858461882, "grad_norm": 5.553138656470616, "learning_rate": 1.9744205603519293e-05, "loss": 0.9076, "step": 1571 }, { "epoch": 0.20020058264482546, "grad_norm": 6.149281231816345, "learning_rate": 1.9743741887145067e-05, "loss": 1.1001, "step": 1572 }, { "epoch": 0.20032793670503207, "grad_norm": 4.25695904218789, "learning_rate": 1.9743277756283133e-05, "loss": 0.9958, "step": 1573 }, { "epoch": 0.20045529076523871, "grad_norm": 5.06832475301277, "learning_rate": 1.974281321095324e-05, "loss": 0.9478, "step": 1574 }, { "epoch": 0.20058264482544533, "grad_norm": 5.257481285421065, "learning_rate": 1.9742348251175146e-05, "loss": 0.9822, "step": 1575 }, { "epoch": 0.20070999888565197, "grad_norm": 5.073481575950944, "learning_rate": 1.974188287696863e-05, "loss": 0.9306, "step": 1576 }, { "epoch": 0.2008373529458586, "grad_norm": 5.268382632298335, "learning_rate": 1.974141708835349e-05, "loss": 1.0149, "step": 1577 }, { "epoch": 0.20096470700606522, "grad_norm": 9.723198856770377, "learning_rate": 1.9740950885349536e-05, "loss": 1.0157, "step": 1578 }, { "epoch": 0.20109206106627187, "grad_norm": 5.560161827806057, "learning_rate": 1.9740484267976608e-05, "loss": 0.9826, "step": 1579 }, { "epoch": 0.2012194151264785, "grad_norm": 5.714445234524424, "learning_rate": 1.974001723625455e-05, "loss": 0.9877, "step": 1580 }, { "epoch": 0.20134676918668512, "grad_norm": 4.86436044752818, "learning_rate": 1.9739549790203224e-05, "loss": 1.0302, "step": 1581 }, { "epoch": 0.20147412324689176, "grad_norm": 5.9878916952751995, "learning_rate": 1.973908192984252e-05, "loss": 0.9946, "step": 1582 }, { "epoch": 0.2016014773070984, "grad_norm": 6.879338230864136, "learning_rate": 1.9738613655192345e-05, "loss": 0.9916, "step": 1583 }, { "epoch": 0.20172883136730502, "grad_norm": 4.571181099017479, "learning_rate": 1.973814496627261e-05, "loss": 1.1194, "step": 1584 }, { "epoch": 0.20185618542751166, "grad_norm": 5.342571284702611, "learning_rate": 1.9737675863103257e-05, "loss": 0.9541, "step": 1585 }, { "epoch": 0.2019835394877183, "grad_norm": 5.99493798663698, "learning_rate": 1.9737206345704244e-05, "loss": 1.0448, "step": 1586 }, { "epoch": 0.2021108935479249, "grad_norm": 5.206679954039697, "learning_rate": 1.9736736414095538e-05, "loss": 0.9584, "step": 1587 }, { "epoch": 0.20223824760813155, "grad_norm": 6.131071545787127, "learning_rate": 1.973626606829713e-05, "loss": 0.9996, "step": 1588 }, { "epoch": 0.2023656016683382, "grad_norm": 5.296658401786897, "learning_rate": 1.9735795308329037e-05, "loss": 0.9626, "step": 1589 }, { "epoch": 0.2024929557285448, "grad_norm": 4.480860805236891, "learning_rate": 1.973532413421127e-05, "loss": 0.8541, "step": 1590 }, { "epoch": 0.20262030978875145, "grad_norm": 5.458814740488239, "learning_rate": 1.973485254596388e-05, "loss": 0.942, "step": 1591 }, { "epoch": 0.2027476638489581, "grad_norm": 5.60844851522458, "learning_rate": 1.9734380543606932e-05, "loss": 1.0469, "step": 1592 }, { "epoch": 0.2028750179091647, "grad_norm": 3.9807266586051577, "learning_rate": 1.9733908127160495e-05, "loss": 0.9929, "step": 1593 }, { "epoch": 0.20300237196937135, "grad_norm": 5.214944879929989, "learning_rate": 1.973343529664467e-05, "loss": 0.8852, "step": 1594 }, { "epoch": 0.203129726029578, "grad_norm": 5.241255425764933, "learning_rate": 1.9732962052079575e-05, "loss": 0.9598, "step": 1595 }, { "epoch": 0.2032570800897846, "grad_norm": 5.1585897050212255, "learning_rate": 1.9732488393485332e-05, "loss": 1.0306, "step": 1596 }, { "epoch": 0.20338443414999124, "grad_norm": 6.224674029413985, "learning_rate": 1.9732014320882096e-05, "loss": 0.9012, "step": 1597 }, { "epoch": 0.20351178821019789, "grad_norm": 4.540211622372655, "learning_rate": 1.9731539834290034e-05, "loss": 1.0735, "step": 1598 }, { "epoch": 0.2036391422704045, "grad_norm": 5.2255767211946855, "learning_rate": 1.9731064933729324e-05, "loss": 0.8904, "step": 1599 }, { "epoch": 0.20376649633061114, "grad_norm": 5.265525749018251, "learning_rate": 1.9730589619220177e-05, "loss": 0.9952, "step": 1600 }, { "epoch": 0.20389385039081778, "grad_norm": 4.648503766268739, "learning_rate": 1.9730113890782804e-05, "loss": 1.0357, "step": 1601 }, { "epoch": 0.2040212044510244, "grad_norm": 5.008336936877452, "learning_rate": 1.9729637748437448e-05, "loss": 0.9865, "step": 1602 }, { "epoch": 0.20414855851123104, "grad_norm": 4.832030263174414, "learning_rate": 1.972916119220436e-05, "loss": 1.0622, "step": 1603 }, { "epoch": 0.20427591257143768, "grad_norm": 8.765669824278218, "learning_rate": 1.9728684222103812e-05, "loss": 0.9047, "step": 1604 }, { "epoch": 0.2044032666316443, "grad_norm": 5.656930787651489, "learning_rate": 1.9728206838156098e-05, "loss": 0.9878, "step": 1605 }, { "epoch": 0.20453062069185093, "grad_norm": 3.652824427343459, "learning_rate": 1.9727729040381517e-05, "loss": 0.906, "step": 1606 }, { "epoch": 0.20465797475205758, "grad_norm": 4.8632365397041175, "learning_rate": 1.97272508288004e-05, "loss": 1.0788, "step": 1607 }, { "epoch": 0.2047853288122642, "grad_norm": 6.17770195662139, "learning_rate": 1.972677220343309e-05, "loss": 1.0285, "step": 1608 }, { "epoch": 0.20491268287247083, "grad_norm": 5.537076995055236, "learning_rate": 1.972629316429995e-05, "loss": 0.9512, "step": 1609 }, { "epoch": 0.20504003693267747, "grad_norm": 4.737506296762595, "learning_rate": 1.972581371142135e-05, "loss": 0.9401, "step": 1610 }, { "epoch": 0.20516739099288409, "grad_norm": 5.31851788920953, "learning_rate": 1.9725333844817688e-05, "loss": 1.0683, "step": 1611 }, { "epoch": 0.20529474505309073, "grad_norm": 5.594261103461401, "learning_rate": 1.972485356450938e-05, "loss": 0.8846, "step": 1612 }, { "epoch": 0.20542209911329737, "grad_norm": 5.099392439612177, "learning_rate": 1.972437287051685e-05, "loss": 1.0609, "step": 1613 }, { "epoch": 0.20554945317350398, "grad_norm": 6.0986471208048725, "learning_rate": 1.9723891762860557e-05, "loss": 1.0413, "step": 1614 }, { "epoch": 0.20567680723371062, "grad_norm": 5.832802375970654, "learning_rate": 1.9723410241560958e-05, "loss": 1.043, "step": 1615 }, { "epoch": 0.20580416129391726, "grad_norm": 5.085532270422708, "learning_rate": 1.972292830663854e-05, "loss": 0.9361, "step": 1616 }, { "epoch": 0.20593151535412388, "grad_norm": 7.483943281683596, "learning_rate": 1.9722445958113803e-05, "loss": 1.0771, "step": 1617 }, { "epoch": 0.20605886941433052, "grad_norm": 7.7429546203268815, "learning_rate": 1.9721963196007263e-05, "loss": 0.9862, "step": 1618 }, { "epoch": 0.20618622347453716, "grad_norm": 4.275373003118664, "learning_rate": 1.972148002033946e-05, "loss": 0.9736, "step": 1619 }, { "epoch": 0.20631357753474378, "grad_norm": 4.442061108412303, "learning_rate": 1.9720996431130946e-05, "loss": 0.962, "step": 1620 }, { "epoch": 0.20644093159495042, "grad_norm": 4.36612542464768, "learning_rate": 1.9720512428402293e-05, "loss": 0.9597, "step": 1621 }, { "epoch": 0.20656828565515703, "grad_norm": 5.255502269780313, "learning_rate": 1.9720028012174094e-05, "loss": 0.9574, "step": 1622 }, { "epoch": 0.20669563971536367, "grad_norm": 5.427818009431217, "learning_rate": 1.9719543182466944e-05, "loss": 1.0017, "step": 1623 }, { "epoch": 0.2068229937755703, "grad_norm": 5.819726533003532, "learning_rate": 1.9719057939301477e-05, "loss": 0.9151, "step": 1624 }, { "epoch": 0.20695034783577693, "grad_norm": 5.716644399764548, "learning_rate": 1.9718572282698335e-05, "loss": 1.0101, "step": 1625 }, { "epoch": 0.20707770189598357, "grad_norm": 4.933284650167188, "learning_rate": 1.971808621267817e-05, "loss": 0.9385, "step": 1626 }, { "epoch": 0.2072050559561902, "grad_norm": 4.628939626066382, "learning_rate": 1.9717599729261666e-05, "loss": 1.0506, "step": 1627 }, { "epoch": 0.20733241001639682, "grad_norm": 5.3615858672677215, "learning_rate": 1.971711283246951e-05, "loss": 1.0117, "step": 1628 }, { "epoch": 0.20745976407660346, "grad_norm": 6.214288232361231, "learning_rate": 1.971662552232242e-05, "loss": 0.9271, "step": 1629 }, { "epoch": 0.2075871181368101, "grad_norm": 6.244529771655492, "learning_rate": 1.971613779884113e-05, "loss": 1.1197, "step": 1630 }, { "epoch": 0.20771447219701672, "grad_norm": 6.029004879854435, "learning_rate": 1.9715649662046378e-05, "loss": 1.0605, "step": 1631 }, { "epoch": 0.20784182625722336, "grad_norm": 5.489495215773398, "learning_rate": 1.971516111195893e-05, "loss": 0.9659, "step": 1632 }, { "epoch": 0.20796918031743, "grad_norm": 5.247503867981655, "learning_rate": 1.971467214859957e-05, "loss": 0.8921, "step": 1633 }, { "epoch": 0.20809653437763662, "grad_norm": 7.338572393551121, "learning_rate": 1.97141827719891e-05, "loss": 1.0235, "step": 1634 }, { "epoch": 0.20822388843784326, "grad_norm": 6.658093694099257, "learning_rate": 1.971369298214834e-05, "loss": 1.0838, "step": 1635 }, { "epoch": 0.2083512424980499, "grad_norm": 4.663091357163081, "learning_rate": 1.9713202779098118e-05, "loss": 1.0795, "step": 1636 }, { "epoch": 0.2084785965582565, "grad_norm": 5.371071910466963, "learning_rate": 1.9712712162859287e-05, "loss": 1.1829, "step": 1637 }, { "epoch": 0.20860595061846315, "grad_norm": 5.334058751998837, "learning_rate": 1.9712221133452722e-05, "loss": 1.0557, "step": 1638 }, { "epoch": 0.2087333046786698, "grad_norm": 3.8505080445218827, "learning_rate": 1.971172969089931e-05, "loss": 0.9079, "step": 1639 }, { "epoch": 0.2088606587388764, "grad_norm": 5.877852700967215, "learning_rate": 1.9711237835219955e-05, "loss": 0.9663, "step": 1640 }, { "epoch": 0.20898801279908305, "grad_norm": 5.391358610350945, "learning_rate": 1.9710745566435578e-05, "loss": 0.993, "step": 1641 }, { "epoch": 0.2091153668592897, "grad_norm": 5.2566674900155075, "learning_rate": 1.9710252884567124e-05, "loss": 1.0123, "step": 1642 }, { "epoch": 0.2092427209194963, "grad_norm": 4.4334306997826625, "learning_rate": 1.970975978963555e-05, "loss": 0.928, "step": 1643 }, { "epoch": 0.20937007497970295, "grad_norm": 4.981977214882108, "learning_rate": 1.970926628166183e-05, "loss": 1.0223, "step": 1644 }, { "epoch": 0.2094974290399096, "grad_norm": 4.961837278454065, "learning_rate": 1.9708772360666958e-05, "loss": 0.9146, "step": 1645 }, { "epoch": 0.2096247831001162, "grad_norm": 5.469682363322358, "learning_rate": 1.970827802667194e-05, "loss": 0.9955, "step": 1646 }, { "epoch": 0.20975213716032284, "grad_norm": 7.40464849097527, "learning_rate": 1.9707783279697816e-05, "loss": 0.9915, "step": 1647 }, { "epoch": 0.20987949122052948, "grad_norm": 4.157092245708658, "learning_rate": 1.9707288119765625e-05, "loss": 1.0057, "step": 1648 }, { "epoch": 0.2100068452807361, "grad_norm": 5.203421186201981, "learning_rate": 1.9706792546896425e-05, "loss": 1.0078, "step": 1649 }, { "epoch": 0.21013419934094274, "grad_norm": 4.969316942031503, "learning_rate": 1.9706296561111308e-05, "loss": 1.0448, "step": 1650 }, { "epoch": 0.21026155340114938, "grad_norm": 4.850189456975334, "learning_rate": 1.9705800162431365e-05, "loss": 1.0313, "step": 1651 }, { "epoch": 0.210388907461356, "grad_norm": 6.328925489493655, "learning_rate": 1.9705303350877714e-05, "loss": 1.047, "step": 1652 }, { "epoch": 0.21051626152156264, "grad_norm": 5.55827205153297, "learning_rate": 1.970480612647149e-05, "loss": 1.0637, "step": 1653 }, { "epoch": 0.21064361558176928, "grad_norm": 6.3722659961571955, "learning_rate": 1.9704308489233846e-05, "loss": 1.047, "step": 1654 }, { "epoch": 0.2107709696419759, "grad_norm": 4.8103105593092375, "learning_rate": 1.9703810439185946e-05, "loss": 1.0598, "step": 1655 }, { "epoch": 0.21089832370218253, "grad_norm": 8.339356140604846, "learning_rate": 1.970331197634898e-05, "loss": 0.9091, "step": 1656 }, { "epoch": 0.21102567776238917, "grad_norm": 4.713283176254165, "learning_rate": 1.970281310074415e-05, "loss": 1.0329, "step": 1657 }, { "epoch": 0.2111530318225958, "grad_norm": 5.932060239417839, "learning_rate": 1.9702313812392683e-05, "loss": 1.0818, "step": 1658 }, { "epoch": 0.21128038588280243, "grad_norm": 5.183738355768398, "learning_rate": 1.9701814111315813e-05, "loss": 1.0107, "step": 1659 }, { "epoch": 0.21140773994300907, "grad_norm": 5.967140180385663, "learning_rate": 1.9701313997534798e-05, "loss": 0.9086, "step": 1660 }, { "epoch": 0.21153509400321568, "grad_norm": 4.887698336818495, "learning_rate": 1.9700813471070907e-05, "loss": 1.0905, "step": 1661 }, { "epoch": 0.21166244806342233, "grad_norm": 6.2653132715471616, "learning_rate": 1.9700312531945444e-05, "loss": 0.963, "step": 1662 }, { "epoch": 0.21178980212362897, "grad_norm": 6.325315553536174, "learning_rate": 1.9699811180179704e-05, "loss": 0.9581, "step": 1663 }, { "epoch": 0.21191715618383558, "grad_norm": 4.819017233161328, "learning_rate": 1.9699309415795027e-05, "loss": 1.023, "step": 1664 }, { "epoch": 0.21204451024404222, "grad_norm": 5.740437392739419, "learning_rate": 1.9698807238812748e-05, "loss": 0.9332, "step": 1665 }, { "epoch": 0.21217186430424884, "grad_norm": 5.980074858946459, "learning_rate": 1.9698304649254235e-05, "loss": 1.0194, "step": 1666 }, { "epoch": 0.21229921836445548, "grad_norm": 4.768733977244646, "learning_rate": 1.9697801647140865e-05, "loss": 1.0535, "step": 1667 }, { "epoch": 0.21242657242466212, "grad_norm": 5.7997365299760535, "learning_rate": 1.9697298232494037e-05, "loss": 0.9816, "step": 1668 }, { "epoch": 0.21255392648486873, "grad_norm": 5.853278729669982, "learning_rate": 1.969679440533516e-05, "loss": 0.9141, "step": 1669 }, { "epoch": 0.21268128054507537, "grad_norm": 5.938993888444609, "learning_rate": 1.9696290165685674e-05, "loss": 0.9569, "step": 1670 }, { "epoch": 0.21280863460528202, "grad_norm": 5.824831771058527, "learning_rate": 1.9695785513567024e-05, "loss": 1.076, "step": 1671 }, { "epoch": 0.21293598866548863, "grad_norm": 9.641584104438895, "learning_rate": 1.969528044900068e-05, "loss": 1.0234, "step": 1672 }, { "epoch": 0.21306334272569527, "grad_norm": 3.97168845182061, "learning_rate": 1.969477497200812e-05, "loss": 0.9538, "step": 1673 }, { "epoch": 0.2131906967859019, "grad_norm": 8.086281884272529, "learning_rate": 1.9694269082610856e-05, "loss": 0.9032, "step": 1674 }, { "epoch": 0.21331805084610853, "grad_norm": 6.182984833608822, "learning_rate": 1.9693762780830404e-05, "loss": 0.9367, "step": 1675 }, { "epoch": 0.21344540490631517, "grad_norm": 6.781903072851365, "learning_rate": 1.96932560666883e-05, "loss": 1.0375, "step": 1676 }, { "epoch": 0.2135727589665218, "grad_norm": 4.736966773952431, "learning_rate": 1.96927489402061e-05, "loss": 0.9725, "step": 1677 }, { "epoch": 0.21370011302672842, "grad_norm": 5.598186757942846, "learning_rate": 1.9692241401405376e-05, "loss": 0.9808, "step": 1678 }, { "epoch": 0.21382746708693506, "grad_norm": 5.052495985449508, "learning_rate": 1.9691733450307723e-05, "loss": 0.9624, "step": 1679 }, { "epoch": 0.2139548211471417, "grad_norm": 6.730913657850143, "learning_rate": 1.9691225086934743e-05, "loss": 0.9979, "step": 1680 }, { "epoch": 0.21408217520734832, "grad_norm": 6.456859330175002, "learning_rate": 1.969071631130806e-05, "loss": 1.0151, "step": 1681 }, { "epoch": 0.21420952926755496, "grad_norm": 5.2489839206361335, "learning_rate": 1.9690207123449322e-05, "loss": 1.0065, "step": 1682 }, { "epoch": 0.2143368833277616, "grad_norm": 5.993312287482855, "learning_rate": 1.968969752338019e-05, "loss": 1.0205, "step": 1683 }, { "epoch": 0.21446423738796822, "grad_norm": 6.105879854601475, "learning_rate": 1.968918751112233e-05, "loss": 0.9294, "step": 1684 }, { "epoch": 0.21459159144817486, "grad_norm": 4.22773700972384, "learning_rate": 1.968867708669745e-05, "loss": 0.9431, "step": 1685 }, { "epoch": 0.2147189455083815, "grad_norm": 4.66537789305537, "learning_rate": 1.968816625012726e-05, "loss": 0.9277, "step": 1686 }, { "epoch": 0.2148462995685881, "grad_norm": 5.7675083464920505, "learning_rate": 1.968765500143349e-05, "loss": 1.0131, "step": 1687 }, { "epoch": 0.21497365362879475, "grad_norm": 4.252982047072925, "learning_rate": 1.9687143340637885e-05, "loss": 0.9675, "step": 1688 }, { "epoch": 0.2151010076890014, "grad_norm": 4.236290755223603, "learning_rate": 1.9686631267762216e-05, "loss": 0.9622, "step": 1689 }, { "epoch": 0.215228361749208, "grad_norm": 6.703800763121683, "learning_rate": 1.968611878282826e-05, "loss": 1.0032, "step": 1690 }, { "epoch": 0.21535571580941465, "grad_norm": 3.940146147442758, "learning_rate": 1.968560588585782e-05, "loss": 0.9759, "step": 1691 }, { "epoch": 0.2154830698696213, "grad_norm": 4.799271273927135, "learning_rate": 1.9685092576872717e-05, "loss": 1.0655, "step": 1692 }, { "epoch": 0.2156104239298279, "grad_norm": 6.680157489861707, "learning_rate": 1.9684578855894783e-05, "loss": 0.9126, "step": 1693 }, { "epoch": 0.21573777799003455, "grad_norm": 4.623934442616402, "learning_rate": 1.968406472294587e-05, "loss": 0.8562, "step": 1694 }, { "epoch": 0.2158651320502412, "grad_norm": 6.912497891913811, "learning_rate": 1.9683550178047852e-05, "loss": 1.0111, "step": 1695 }, { "epoch": 0.2159924861104478, "grad_norm": 4.770563687140798, "learning_rate": 1.9683035221222617e-05, "loss": 0.8775, "step": 1696 }, { "epoch": 0.21611984017065444, "grad_norm": 6.298702154748113, "learning_rate": 1.9682519852492066e-05, "loss": 0.9678, "step": 1697 }, { "epoch": 0.21624719423086108, "grad_norm": 4.4266732538319795, "learning_rate": 1.9682004071878128e-05, "loss": 0.9295, "step": 1698 }, { "epoch": 0.2163745482910677, "grad_norm": 4.86071601836413, "learning_rate": 1.968148787940274e-05, "loss": 0.9146, "step": 1699 }, { "epoch": 0.21650190235127434, "grad_norm": 4.516499048667697, "learning_rate": 1.9680971275087862e-05, "loss": 1.0534, "step": 1700 }, { "epoch": 0.21662925641148098, "grad_norm": 3.7220862033214726, "learning_rate": 1.968045425895547e-05, "loss": 0.9642, "step": 1701 }, { "epoch": 0.2167566104716876, "grad_norm": 6.026022429395702, "learning_rate": 1.9679936831027558e-05, "loss": 1.0535, "step": 1702 }, { "epoch": 0.21688396453189424, "grad_norm": 6.490947875880115, "learning_rate": 1.9679418991326134e-05, "loss": 1.0937, "step": 1703 }, { "epoch": 0.21701131859210088, "grad_norm": 5.384847093145054, "learning_rate": 1.967890073987323e-05, "loss": 0.9663, "step": 1704 }, { "epoch": 0.2171386726523075, "grad_norm": 6.540822875724549, "learning_rate": 1.9678382076690883e-05, "loss": 0.9928, "step": 1705 }, { "epoch": 0.21726602671251413, "grad_norm": 5.064704509223324, "learning_rate": 1.9677863001801167e-05, "loss": 1.0063, "step": 1706 }, { "epoch": 0.21739338077272077, "grad_norm": 5.422335646351518, "learning_rate": 1.967734351522616e-05, "loss": 0.9905, "step": 1707 }, { "epoch": 0.2175207348329274, "grad_norm": 4.819619597306839, "learning_rate": 1.9676823616987958e-05, "loss": 1.0503, "step": 1708 }, { "epoch": 0.21764808889313403, "grad_norm": 4.844071309907478, "learning_rate": 1.967630330710868e-05, "loss": 1.0092, "step": 1709 }, { "epoch": 0.21777544295334067, "grad_norm": 6.182490314826213, "learning_rate": 1.967578258561045e-05, "loss": 0.9271, "step": 1710 }, { "epoch": 0.21790279701354728, "grad_norm": 4.720366952500596, "learning_rate": 1.9675261452515434e-05, "loss": 0.9913, "step": 1711 }, { "epoch": 0.21803015107375393, "grad_norm": 5.772697432269532, "learning_rate": 1.967473990784579e-05, "loss": 0.8759, "step": 1712 }, { "epoch": 0.21815750513396054, "grad_norm": 5.444777255796088, "learning_rate": 1.967421795162371e-05, "loss": 0.9836, "step": 1713 }, { "epoch": 0.21828485919416718, "grad_norm": 5.684695044720064, "learning_rate": 1.9673695583871392e-05, "loss": 0.9398, "step": 1714 }, { "epoch": 0.21841221325437382, "grad_norm": 5.116815311786866, "learning_rate": 1.9673172804611057e-05, "loss": 0.8838, "step": 1715 }, { "epoch": 0.21853956731458044, "grad_norm": 5.017741199544419, "learning_rate": 1.9672649613864946e-05, "loss": 0.988, "step": 1716 }, { "epoch": 0.21866692137478708, "grad_norm": 6.147091011630903, "learning_rate": 1.967212601165531e-05, "loss": 1.1042, "step": 1717 }, { "epoch": 0.21879427543499372, "grad_norm": 4.417358464217229, "learning_rate": 1.9671601998004436e-05, "loss": 0.9115, "step": 1718 }, { "epoch": 0.21892162949520033, "grad_norm": 6.183131669998234, "learning_rate": 1.96710775729346e-05, "loss": 1.053, "step": 1719 }, { "epoch": 0.21904898355540697, "grad_norm": 5.625086126727791, "learning_rate": 1.9670552736468117e-05, "loss": 1.0128, "step": 1720 }, { "epoch": 0.21917633761561361, "grad_norm": 4.117668804143339, "learning_rate": 1.9670027488627314e-05, "loss": 1.0047, "step": 1721 }, { "epoch": 0.21930369167582023, "grad_norm": 5.02721456437311, "learning_rate": 1.966950182943453e-05, "loss": 0.9749, "step": 1722 }, { "epoch": 0.21943104573602687, "grad_norm": 5.166146877763162, "learning_rate": 1.966897575891213e-05, "loss": 0.8736, "step": 1723 }, { "epoch": 0.2195583997962335, "grad_norm": 5.609421073703104, "learning_rate": 1.9668449277082492e-05, "loss": 0.8433, "step": 1724 }, { "epoch": 0.21968575385644012, "grad_norm": 5.286167721833049, "learning_rate": 1.966792238396801e-05, "loss": 0.8466, "step": 1725 }, { "epoch": 0.21981310791664677, "grad_norm": 4.961751171277027, "learning_rate": 1.9667395079591096e-05, "loss": 0.9572, "step": 1726 }, { "epoch": 0.2199404619768534, "grad_norm": 5.827625570298563, "learning_rate": 1.9666867363974187e-05, "loss": 1.0829, "step": 1727 }, { "epoch": 0.22006781603706002, "grad_norm": 6.267381835602661, "learning_rate": 1.9666339237139723e-05, "loss": 1.0272, "step": 1728 }, { "epoch": 0.22019517009726666, "grad_norm": 4.6393897470833165, "learning_rate": 1.966581069911018e-05, "loss": 0.941, "step": 1729 }, { "epoch": 0.2203225241574733, "grad_norm": 4.630764230227775, "learning_rate": 1.9665281749908034e-05, "loss": 1.0309, "step": 1730 }, { "epoch": 0.22044987821767992, "grad_norm": 5.536201975914104, "learning_rate": 1.966475238955579e-05, "loss": 1.0152, "step": 1731 }, { "epoch": 0.22057723227788656, "grad_norm": 4.906899767570099, "learning_rate": 1.9664222618075958e-05, "loss": 1.0782, "step": 1732 }, { "epoch": 0.2207045863380932, "grad_norm": 5.744214007840506, "learning_rate": 1.9663692435491084e-05, "loss": 0.9878, "step": 1733 }, { "epoch": 0.22083194039829981, "grad_norm": 4.846756918555281, "learning_rate": 1.966316184182372e-05, "loss": 1.0303, "step": 1734 }, { "epoch": 0.22095929445850646, "grad_norm": 3.883458903060236, "learning_rate": 1.966263083709643e-05, "loss": 1.0827, "step": 1735 }, { "epoch": 0.2210866485187131, "grad_norm": 5.049937733181349, "learning_rate": 1.966209942133181e-05, "loss": 1.0007, "step": 1736 }, { "epoch": 0.2212140025789197, "grad_norm": 3.77498605990836, "learning_rate": 1.966156759455246e-05, "loss": 0.9559, "step": 1737 }, { "epoch": 0.22134135663912635, "grad_norm": 6.192356246123623, "learning_rate": 1.9661035356781007e-05, "loss": 0.9042, "step": 1738 }, { "epoch": 0.221468710699333, "grad_norm": 7.090534035783035, "learning_rate": 1.9660502708040094e-05, "loss": 1.0616, "step": 1739 }, { "epoch": 0.2215960647595396, "grad_norm": 5.8361028726987545, "learning_rate": 1.965996964835237e-05, "loss": 1.0419, "step": 1740 }, { "epoch": 0.22172341881974625, "grad_norm": 5.605022984235532, "learning_rate": 1.9659436177740518e-05, "loss": 0.9657, "step": 1741 }, { "epoch": 0.2218507728799529, "grad_norm": 5.7205697588158575, "learning_rate": 1.965890229622723e-05, "loss": 0.9856, "step": 1742 }, { "epoch": 0.2219781269401595, "grad_norm": 5.185543196797189, "learning_rate": 1.9658368003835223e-05, "loss": 1.0353, "step": 1743 }, { "epoch": 0.22210548100036615, "grad_norm": 9.360388597426082, "learning_rate": 1.965783330058721e-05, "loss": 1.1027, "step": 1744 }, { "epoch": 0.2222328350605728, "grad_norm": 7.419286400959521, "learning_rate": 1.9657298186505952e-05, "loss": 1.0728, "step": 1745 }, { "epoch": 0.2223601891207794, "grad_norm": 6.056021463434238, "learning_rate": 1.96567626616142e-05, "loss": 1.0296, "step": 1746 }, { "epoch": 0.22248754318098604, "grad_norm": 4.617228809565627, "learning_rate": 1.9656226725934745e-05, "loss": 0.9012, "step": 1747 }, { "epoch": 0.22261489724119268, "grad_norm": 4.107763634438576, "learning_rate": 1.965569037949038e-05, "loss": 0.9253, "step": 1748 }, { "epoch": 0.2227422513013993, "grad_norm": 6.390962251377734, "learning_rate": 1.9655153622303918e-05, "loss": 1.0192, "step": 1749 }, { "epoch": 0.22286960536160594, "grad_norm": 4.793158368103816, "learning_rate": 1.9654616454398194e-05, "loss": 0.9457, "step": 1750 }, { "epoch": 0.22299695942181258, "grad_norm": 4.688639998095213, "learning_rate": 1.9654078875796064e-05, "loss": 0.9644, "step": 1751 }, { "epoch": 0.2231243134820192, "grad_norm": 5.905266573942803, "learning_rate": 1.9653540886520387e-05, "loss": 0.9638, "step": 1752 }, { "epoch": 0.22325166754222583, "grad_norm": 5.442243589670223, "learning_rate": 1.9653002486594057e-05, "loss": 1.0104, "step": 1753 }, { "epoch": 0.22337902160243248, "grad_norm": 4.397132693549584, "learning_rate": 1.965246367603997e-05, "loss": 0.9889, "step": 1754 }, { "epoch": 0.2235063756626391, "grad_norm": 5.403560785443919, "learning_rate": 1.965192445488105e-05, "loss": 0.9676, "step": 1755 }, { "epoch": 0.22363372972284573, "grad_norm": 6.487656831407575, "learning_rate": 1.9651384823140237e-05, "loss": 1.1338, "step": 1756 }, { "epoch": 0.22376108378305234, "grad_norm": 5.990041097203875, "learning_rate": 1.9650844780840475e-05, "loss": 1.0659, "step": 1757 }, { "epoch": 0.22388843784325899, "grad_norm": 6.69981491060953, "learning_rate": 1.9650304328004752e-05, "loss": 0.9826, "step": 1758 }, { "epoch": 0.22401579190346563, "grad_norm": 5.43961722710271, "learning_rate": 1.9649763464656052e-05, "loss": 0.9696, "step": 1759 }, { "epoch": 0.22414314596367224, "grad_norm": 5.3903222925085155, "learning_rate": 1.9649222190817382e-05, "loss": 1.0604, "step": 1760 }, { "epoch": 0.22427050002387888, "grad_norm": 6.023288987205302, "learning_rate": 1.9648680506511763e-05, "loss": 0.9395, "step": 1761 }, { "epoch": 0.22439785408408552, "grad_norm": 6.729129031364457, "learning_rate": 1.9648138411762245e-05, "loss": 1.0276, "step": 1762 }, { "epoch": 0.22452520814429214, "grad_norm": 5.303370631142946, "learning_rate": 1.9647595906591884e-05, "loss": 0.9666, "step": 1763 }, { "epoch": 0.22465256220449878, "grad_norm": 4.003173046434199, "learning_rate": 1.964705299102376e-05, "loss": 0.9826, "step": 1764 }, { "epoch": 0.22477991626470542, "grad_norm": 4.822631673720893, "learning_rate": 1.9646509665080967e-05, "loss": 0.9451, "step": 1765 }, { "epoch": 0.22490727032491203, "grad_norm": 11.400257950926209, "learning_rate": 1.9645965928786615e-05, "loss": 0.9215, "step": 1766 }, { "epoch": 0.22503462438511868, "grad_norm": 12.665581171471478, "learning_rate": 1.9645421782163838e-05, "loss": 0.8822, "step": 1767 }, { "epoch": 0.22516197844532532, "grad_norm": 6.241716923360686, "learning_rate": 1.964487722523578e-05, "loss": 0.9472, "step": 1768 }, { "epoch": 0.22528933250553193, "grad_norm": 4.949288889351066, "learning_rate": 1.9644332258025604e-05, "loss": 0.8934, "step": 1769 }, { "epoch": 0.22541668656573857, "grad_norm": 4.900761541238467, "learning_rate": 1.96437868805565e-05, "loss": 1.0991, "step": 1770 }, { "epoch": 0.2255440406259452, "grad_norm": 6.7477896568198945, "learning_rate": 1.9643241092851664e-05, "loss": 0.9397, "step": 1771 }, { "epoch": 0.22567139468615183, "grad_norm": 4.4663847952956806, "learning_rate": 1.964269489493431e-05, "loss": 0.9228, "step": 1772 }, { "epoch": 0.22579874874635847, "grad_norm": 4.780268150692939, "learning_rate": 1.9642148286827674e-05, "loss": 1.0703, "step": 1773 }, { "epoch": 0.2259261028065651, "grad_norm": 3.7715650378964707, "learning_rate": 1.964160126855501e-05, "loss": 0.9436, "step": 1774 }, { "epoch": 0.22605345686677172, "grad_norm": 4.992705937599941, "learning_rate": 1.964105384013959e-05, "loss": 1.0159, "step": 1775 }, { "epoch": 0.22618081092697837, "grad_norm": 4.893049230375066, "learning_rate": 1.9640506001604693e-05, "loss": 0.9875, "step": 1776 }, { "epoch": 0.226308164987185, "grad_norm": 5.730724700849259, "learning_rate": 1.9639957752973632e-05, "loss": 0.9165, "step": 1777 }, { "epoch": 0.22643551904739162, "grad_norm": 5.960767591062262, "learning_rate": 1.963940909426972e-05, "loss": 1.0084, "step": 1778 }, { "epoch": 0.22656287310759826, "grad_norm": 5.127943728705067, "learning_rate": 1.9638860025516305e-05, "loss": 0.9076, "step": 1779 }, { "epoch": 0.2266902271678049, "grad_norm": 6.308130546075289, "learning_rate": 1.963831054673674e-05, "loss": 0.9977, "step": 1780 }, { "epoch": 0.22681758122801152, "grad_norm": 7.182944803137629, "learning_rate": 1.96377606579544e-05, "loss": 1.0313, "step": 1781 }, { "epoch": 0.22694493528821816, "grad_norm": 4.015812375932955, "learning_rate": 1.9637210359192673e-05, "loss": 0.9307, "step": 1782 }, { "epoch": 0.2270722893484248, "grad_norm": 4.165843704331801, "learning_rate": 1.9636659650474973e-05, "loss": 0.9104, "step": 1783 }, { "epoch": 0.2271996434086314, "grad_norm": 5.612109524539742, "learning_rate": 1.9636108531824725e-05, "loss": 1.0234, "step": 1784 }, { "epoch": 0.22732699746883805, "grad_norm": 6.010012537582954, "learning_rate": 1.9635557003265374e-05, "loss": 0.8986, "step": 1785 }, { "epoch": 0.2274543515290447, "grad_norm": 6.456680316007987, "learning_rate": 1.9635005064820377e-05, "loss": 0.9593, "step": 1786 }, { "epoch": 0.2275817055892513, "grad_norm": 7.727566645127223, "learning_rate": 1.9634452716513215e-05, "loss": 1.0032, "step": 1787 }, { "epoch": 0.22770905964945795, "grad_norm": 5.337484691160166, "learning_rate": 1.9633899958367384e-05, "loss": 0.9968, "step": 1788 }, { "epoch": 0.2278364137096646, "grad_norm": 6.736345679884345, "learning_rate": 1.9633346790406402e-05, "loss": 0.9217, "step": 1789 }, { "epoch": 0.2279637677698712, "grad_norm": 5.726731364218154, "learning_rate": 1.9632793212653795e-05, "loss": 0.9961, "step": 1790 }, { "epoch": 0.22809112183007785, "grad_norm": 4.315122829558817, "learning_rate": 1.9632239225133116e-05, "loss": 0.9914, "step": 1791 }, { "epoch": 0.2282184758902845, "grad_norm": 5.034835959132728, "learning_rate": 1.9631684827867926e-05, "loss": 1.0849, "step": 1792 }, { "epoch": 0.2283458299504911, "grad_norm": 13.966661267215047, "learning_rate": 1.9631130020881806e-05, "loss": 1.0368, "step": 1793 }, { "epoch": 0.22847318401069774, "grad_norm": 5.465619221025032, "learning_rate": 1.963057480419837e-05, "loss": 0.9769, "step": 1794 }, { "epoch": 0.22860053807090439, "grad_norm": 4.252969714040979, "learning_rate": 1.9630019177841224e-05, "loss": 1.0516, "step": 1795 }, { "epoch": 0.228727892131111, "grad_norm": 5.119678432678006, "learning_rate": 1.9629463141834008e-05, "loss": 0.9679, "step": 1796 }, { "epoch": 0.22885524619131764, "grad_norm": 6.089616361413847, "learning_rate": 1.9628906696200375e-05, "loss": 1.0558, "step": 1797 }, { "epoch": 0.22898260025152428, "grad_norm": 3.8992771016674848, "learning_rate": 1.9628349840963997e-05, "loss": 1.0169, "step": 1798 }, { "epoch": 0.2291099543117309, "grad_norm": 4.4152406993673186, "learning_rate": 1.9627792576148558e-05, "loss": 0.9963, "step": 1799 }, { "epoch": 0.22923730837193754, "grad_norm": 4.569440187853551, "learning_rate": 1.9627234901777768e-05, "loss": 1.0469, "step": 1800 }, { "epoch": 0.22936466243214418, "grad_norm": 4.675970929345057, "learning_rate": 1.9626676817875343e-05, "loss": 0.9363, "step": 1801 }, { "epoch": 0.2294920164923508, "grad_norm": 6.711610771396723, "learning_rate": 1.9626118324465035e-05, "loss": 0.9359, "step": 1802 }, { "epoch": 0.22961937055255743, "grad_norm": 5.1523400480938175, "learning_rate": 1.9625559421570587e-05, "loss": 0.9964, "step": 1803 }, { "epoch": 0.22974672461276405, "grad_norm": 4.124765562851582, "learning_rate": 1.9625000109215787e-05, "loss": 0.9782, "step": 1804 }, { "epoch": 0.2298740786729707, "grad_norm": 5.117340877651308, "learning_rate": 1.962444038742442e-05, "loss": 0.9557, "step": 1805 }, { "epoch": 0.23000143273317733, "grad_norm": 5.100902941188911, "learning_rate": 1.96238802562203e-05, "loss": 1.0152, "step": 1806 }, { "epoch": 0.23012878679338394, "grad_norm": 7.678748631254456, "learning_rate": 1.962331971562725e-05, "loss": 0.9388, "step": 1807 }, { "epoch": 0.23025614085359059, "grad_norm": 4.230898623055076, "learning_rate": 1.9622758765669117e-05, "loss": 1.0215, "step": 1808 }, { "epoch": 0.23038349491379723, "grad_norm": 5.386940755826735, "learning_rate": 1.9622197406369764e-05, "loss": 1.0412, "step": 1809 }, { "epoch": 0.23051084897400384, "grad_norm": 4.193235881463476, "learning_rate": 1.962163563775307e-05, "loss": 0.9781, "step": 1810 }, { "epoch": 0.23063820303421048, "grad_norm": 8.132097636089389, "learning_rate": 1.962107345984293e-05, "loss": 0.9979, "step": 1811 }, { "epoch": 0.23076555709441712, "grad_norm": 5.556965678651519, "learning_rate": 1.962051087266326e-05, "loss": 0.9099, "step": 1812 }, { "epoch": 0.23089291115462374, "grad_norm": 6.530311918651575, "learning_rate": 1.9619947876237996e-05, "loss": 0.9762, "step": 1813 }, { "epoch": 0.23102026521483038, "grad_norm": 5.109577139959268, "learning_rate": 1.9619384470591082e-05, "loss": 1.0315, "step": 1814 }, { "epoch": 0.23114761927503702, "grad_norm": 5.855383406024149, "learning_rate": 1.9618820655746488e-05, "loss": 0.9857, "step": 1815 }, { "epoch": 0.23127497333524363, "grad_norm": 4.980952029248604, "learning_rate": 1.961825643172819e-05, "loss": 1.132, "step": 1816 }, { "epoch": 0.23140232739545027, "grad_norm": 3.5464613080863323, "learning_rate": 1.96176917985602e-05, "loss": 0.855, "step": 1817 }, { "epoch": 0.23152968145565692, "grad_norm": 5.782907939000674, "learning_rate": 1.9617126756266533e-05, "loss": 1.062, "step": 1818 }, { "epoch": 0.23165703551586353, "grad_norm": 5.962578743788539, "learning_rate": 1.961656130487122e-05, "loss": 0.9667, "step": 1819 }, { "epoch": 0.23178438957607017, "grad_norm": 6.864858464548359, "learning_rate": 1.9615995444398326e-05, "loss": 1.0125, "step": 1820 }, { "epoch": 0.2319117436362768, "grad_norm": 5.593194518399508, "learning_rate": 1.9615429174871915e-05, "loss": 0.9814, "step": 1821 }, { "epoch": 0.23203909769648343, "grad_norm": 4.107295797592486, "learning_rate": 1.961486249631607e-05, "loss": 0.9323, "step": 1822 }, { "epoch": 0.23216645175669007, "grad_norm": 6.355699531477541, "learning_rate": 1.9614295408754908e-05, "loss": 0.9149, "step": 1823 }, { "epoch": 0.2322938058168967, "grad_norm": 6.169294604965106, "learning_rate": 1.961372791221254e-05, "loss": 1.0927, "step": 1824 }, { "epoch": 0.23242115987710332, "grad_norm": 5.650601933916082, "learning_rate": 1.961316000671312e-05, "loss": 0.9797, "step": 1825 }, { "epoch": 0.23254851393730996, "grad_norm": 5.9618915879234455, "learning_rate": 1.9612591692280798e-05, "loss": 0.9699, "step": 1826 }, { "epoch": 0.2326758679975166, "grad_norm": 5.440465710283945, "learning_rate": 1.961202296893975e-05, "loss": 1.0496, "step": 1827 }, { "epoch": 0.23280322205772322, "grad_norm": 4.8719408266955835, "learning_rate": 1.961145383671417e-05, "loss": 1.0317, "step": 1828 }, { "epoch": 0.23293057611792986, "grad_norm": 6.786863473513039, "learning_rate": 1.961088429562827e-05, "loss": 1.0223, "step": 1829 }, { "epoch": 0.2330579301781365, "grad_norm": 5.715195181543214, "learning_rate": 1.9610314345706275e-05, "loss": 1.091, "step": 1830 }, { "epoch": 0.23318528423834312, "grad_norm": 5.537056154671483, "learning_rate": 1.9609743986972427e-05, "loss": 1.0621, "step": 1831 }, { "epoch": 0.23331263829854976, "grad_norm": 5.431294901742344, "learning_rate": 1.9609173219450998e-05, "loss": 0.9379, "step": 1832 }, { "epoch": 0.2334399923587564, "grad_norm": 4.635917562217512, "learning_rate": 1.960860204316626e-05, "loss": 1.034, "step": 1833 }, { "epoch": 0.233567346418963, "grad_norm": 5.937687921059879, "learning_rate": 1.960803045814251e-05, "loss": 0.9789, "step": 1834 }, { "epoch": 0.23369470047916965, "grad_norm": 4.805508377539422, "learning_rate": 1.9607458464404065e-05, "loss": 0.9514, "step": 1835 }, { "epoch": 0.2338220545393763, "grad_norm": 5.356028944192273, "learning_rate": 1.9606886061975258e-05, "loss": 0.8895, "step": 1836 }, { "epoch": 0.2339494085995829, "grad_norm": 4.521239278589203, "learning_rate": 1.960631325088044e-05, "loss": 1.1037, "step": 1837 }, { "epoch": 0.23407676265978955, "grad_norm": 4.408012308259316, "learning_rate": 1.9605740031143972e-05, "loss": 0.9899, "step": 1838 }, { "epoch": 0.2342041167199962, "grad_norm": 6.913754351801961, "learning_rate": 1.9605166402790242e-05, "loss": 0.8878, "step": 1839 }, { "epoch": 0.2343314707802028, "grad_norm": 5.3691903587638405, "learning_rate": 1.960459236584365e-05, "loss": 0.9946, "step": 1840 }, { "epoch": 0.23445882484040945, "grad_norm": 4.323790435116697, "learning_rate": 1.9604017920328613e-05, "loss": 0.944, "step": 1841 }, { "epoch": 0.2345861789006161, "grad_norm": 5.879725077942632, "learning_rate": 1.9603443066269575e-05, "loss": 0.9666, "step": 1842 }, { "epoch": 0.2347135329608227, "grad_norm": 6.673614632504906, "learning_rate": 1.960286780369098e-05, "loss": 0.9177, "step": 1843 }, { "epoch": 0.23484088702102934, "grad_norm": 5.059346665131627, "learning_rate": 1.96022921326173e-05, "loss": 0.9912, "step": 1844 }, { "epoch": 0.23496824108123598, "grad_norm": 5.094966438939124, "learning_rate": 1.9601716053073034e-05, "loss": 0.9824, "step": 1845 }, { "epoch": 0.2350955951414426, "grad_norm": 4.978471566473348, "learning_rate": 1.9601139565082677e-05, "loss": 1.0614, "step": 1846 }, { "epoch": 0.23522294920164924, "grad_norm": 6.046248529200191, "learning_rate": 1.9600562668670756e-05, "loss": 1.0039, "step": 1847 }, { "epoch": 0.23535030326185585, "grad_norm": 4.59692065788742, "learning_rate": 1.959998536386181e-05, "loss": 0.997, "step": 1848 }, { "epoch": 0.2354776573220625, "grad_norm": 4.304352956742825, "learning_rate": 1.9599407650680397e-05, "loss": 1.0137, "step": 1849 }, { "epoch": 0.23560501138226914, "grad_norm": 5.236432118220893, "learning_rate": 1.9598829529151096e-05, "loss": 0.9861, "step": 1850 }, { "epoch": 0.23573236544247575, "grad_norm": 4.803484113145805, "learning_rate": 1.9598250999298495e-05, "loss": 0.9934, "step": 1851 }, { "epoch": 0.2358597195026824, "grad_norm": 4.582977535424973, "learning_rate": 1.9597672061147207e-05, "loss": 0.9304, "step": 1852 }, { "epoch": 0.23598707356288903, "grad_norm": 4.685459150452763, "learning_rate": 1.9597092714721858e-05, "loss": 0.9713, "step": 1853 }, { "epoch": 0.23611442762309565, "grad_norm": 4.903927781934567, "learning_rate": 1.9596512960047092e-05, "loss": 0.914, "step": 1854 }, { "epoch": 0.2362417816833023, "grad_norm": 4.935888099725095, "learning_rate": 1.9595932797147573e-05, "loss": 0.9847, "step": 1855 }, { "epoch": 0.23636913574350893, "grad_norm": 4.720197645025626, "learning_rate": 1.959535222604798e-05, "loss": 1.0138, "step": 1856 }, { "epoch": 0.23649648980371554, "grad_norm": 7.660251365899786, "learning_rate": 1.959477124677301e-05, "loss": 0.928, "step": 1857 }, { "epoch": 0.23662384386392218, "grad_norm": 6.028727107533511, "learning_rate": 1.9594189859347376e-05, "loss": 1.0456, "step": 1858 }, { "epoch": 0.23675119792412883, "grad_norm": 5.285424382864863, "learning_rate": 1.9593608063795808e-05, "loss": 1.0047, "step": 1859 }, { "epoch": 0.23687855198433544, "grad_norm": 4.413545885489942, "learning_rate": 1.9593025860143058e-05, "loss": 0.9942, "step": 1860 }, { "epoch": 0.23700590604454208, "grad_norm": 6.422270899540137, "learning_rate": 1.9592443248413896e-05, "loss": 0.9535, "step": 1861 }, { "epoch": 0.23713326010474872, "grad_norm": 4.863549110971738, "learning_rate": 1.9591860228633093e-05, "loss": 0.8156, "step": 1862 }, { "epoch": 0.23726061416495534, "grad_norm": 5.639933789453035, "learning_rate": 1.959127680082546e-05, "loss": 0.9758, "step": 1863 }, { "epoch": 0.23738796822516198, "grad_norm": 5.272758021907773, "learning_rate": 1.9590692965015818e-05, "loss": 0.9138, "step": 1864 }, { "epoch": 0.23751532228536862, "grad_norm": 5.844045050205219, "learning_rate": 1.9590108721228994e-05, "loss": 1.0062, "step": 1865 }, { "epoch": 0.23764267634557523, "grad_norm": 5.212810494302638, "learning_rate": 1.958952406948985e-05, "loss": 1.0041, "step": 1866 }, { "epoch": 0.23777003040578187, "grad_norm": 4.192451169042947, "learning_rate": 1.9588939009823246e-05, "loss": 1.0265, "step": 1867 }, { "epoch": 0.23789738446598852, "grad_norm": 7.087079512352341, "learning_rate": 1.9588353542254076e-05, "loss": 1.1435, "step": 1868 }, { "epoch": 0.23802473852619513, "grad_norm": 4.905834010280595, "learning_rate": 1.9587767666807245e-05, "loss": 1.0076, "step": 1869 }, { "epoch": 0.23815209258640177, "grad_norm": 5.806849861160403, "learning_rate": 1.9587181383507678e-05, "loss": 1.0408, "step": 1870 }, { "epoch": 0.2382794466466084, "grad_norm": 4.947380706135362, "learning_rate": 1.958659469238031e-05, "loss": 0.9148, "step": 1871 }, { "epoch": 0.23840680070681503, "grad_norm": 4.263103256718209, "learning_rate": 1.9586007593450098e-05, "loss": 0.8835, "step": 1872 }, { "epoch": 0.23853415476702167, "grad_norm": 4.573667680364965, "learning_rate": 1.958542008674202e-05, "loss": 1.1058, "step": 1873 }, { "epoch": 0.2386615088272283, "grad_norm": 4.741512948188774, "learning_rate": 1.9584832172281064e-05, "loss": 1.0385, "step": 1874 }, { "epoch": 0.23878886288743492, "grad_norm": 5.806566388694644, "learning_rate": 1.9584243850092246e-05, "loss": 0.9899, "step": 1875 }, { "epoch": 0.23891621694764156, "grad_norm": 7.7527082079816125, "learning_rate": 1.9583655120200586e-05, "loss": 1.1014, "step": 1876 }, { "epoch": 0.2390435710078482, "grad_norm": 5.3338465245025795, "learning_rate": 1.9583065982631128e-05, "loss": 1.0165, "step": 1877 }, { "epoch": 0.23917092506805482, "grad_norm": 4.761253878751353, "learning_rate": 1.9582476437408937e-05, "loss": 1.1034, "step": 1878 }, { "epoch": 0.23929827912826146, "grad_norm": 6.169684144379374, "learning_rate": 1.958188648455909e-05, "loss": 0.9707, "step": 1879 }, { "epoch": 0.2394256331884681, "grad_norm": 5.183876702549582, "learning_rate": 1.9581296124106682e-05, "loss": 0.9681, "step": 1880 }, { "epoch": 0.23955298724867471, "grad_norm": 3.8059335661646267, "learning_rate": 1.9580705356076826e-05, "loss": 0.8875, "step": 1881 }, { "epoch": 0.23968034130888136, "grad_norm": 4.421657597350017, "learning_rate": 1.9580114180494655e-05, "loss": 0.86, "step": 1882 }, { "epoch": 0.239807695369088, "grad_norm": 6.634587673780053, "learning_rate": 1.9579522597385315e-05, "loss": 1.0404, "step": 1883 }, { "epoch": 0.2399350494292946, "grad_norm": 4.316824251536347, "learning_rate": 1.9578930606773975e-05, "loss": 0.9788, "step": 1884 }, { "epoch": 0.24006240348950125, "grad_norm": 4.444924175014784, "learning_rate": 1.957833820868581e-05, "loss": 1.0692, "step": 1885 }, { "epoch": 0.2401897575497079, "grad_norm": 5.789228552780425, "learning_rate": 1.9577745403146026e-05, "loss": 1.043, "step": 1886 }, { "epoch": 0.2403171116099145, "grad_norm": 5.8358917438801985, "learning_rate": 1.9577152190179837e-05, "loss": 0.9655, "step": 1887 }, { "epoch": 0.24044446567012115, "grad_norm": 3.9383483457232096, "learning_rate": 1.9576558569812484e-05, "loss": 0.9138, "step": 1888 }, { "epoch": 0.2405718197303278, "grad_norm": 4.634923238602164, "learning_rate": 1.957596454206921e-05, "loss": 1.0484, "step": 1889 }, { "epoch": 0.2406991737905344, "grad_norm": 6.206385603515261, "learning_rate": 1.9575370106975288e-05, "loss": 0.9826, "step": 1890 }, { "epoch": 0.24082652785074105, "grad_norm": 4.170206829403069, "learning_rate": 1.9574775264556005e-05, "loss": 0.9325, "step": 1891 }, { "epoch": 0.2409538819109477, "grad_norm": 5.2007134608348675, "learning_rate": 1.9574180014836668e-05, "loss": 0.9554, "step": 1892 }, { "epoch": 0.2410812359711543, "grad_norm": 6.073407615154317, "learning_rate": 1.9573584357842592e-05, "loss": 1.0859, "step": 1893 }, { "epoch": 0.24120859003136094, "grad_norm": 5.51343611815531, "learning_rate": 1.9572988293599124e-05, "loss": 1.0487, "step": 1894 }, { "epoch": 0.24133594409156756, "grad_norm": 5.406242238298257, "learning_rate": 1.9572391822131606e-05, "loss": 1.0153, "step": 1895 }, { "epoch": 0.2414632981517742, "grad_norm": 3.6163605966280175, "learning_rate": 1.9571794943465424e-05, "loss": 0.9876, "step": 1896 }, { "epoch": 0.24159065221198084, "grad_norm": 7.0188141436885605, "learning_rate": 1.9571197657625967e-05, "loss": 0.9274, "step": 1897 }, { "epoch": 0.24171800627218745, "grad_norm": 5.502167881358521, "learning_rate": 1.957059996463864e-05, "loss": 1.1126, "step": 1898 }, { "epoch": 0.2418453603323941, "grad_norm": 5.245404730091922, "learning_rate": 1.9570001864528863e-05, "loss": 0.9728, "step": 1899 }, { "epoch": 0.24197271439260074, "grad_norm": 6.017843417046496, "learning_rate": 1.956940335732209e-05, "loss": 0.9613, "step": 1900 }, { "epoch": 0.24210006845280735, "grad_norm": 5.594564708859503, "learning_rate": 1.9568804443043774e-05, "loss": 1.0798, "step": 1901 }, { "epoch": 0.242227422513014, "grad_norm": 5.132520043156988, "learning_rate": 1.956820512171939e-05, "loss": 0.9427, "step": 1902 }, { "epoch": 0.24235477657322063, "grad_norm": 6.049832352766647, "learning_rate": 1.956760539337444e-05, "loss": 1.0261, "step": 1903 }, { "epoch": 0.24248213063342725, "grad_norm": 5.616791054605702, "learning_rate": 1.9567005258034423e-05, "loss": 0.9481, "step": 1904 }, { "epoch": 0.2426094846936339, "grad_norm": 6.4642734241444035, "learning_rate": 1.9566404715724884e-05, "loss": 0.9793, "step": 1905 }, { "epoch": 0.24273683875384053, "grad_norm": 6.313155640649643, "learning_rate": 1.9565803766471355e-05, "loss": 1.035, "step": 1906 }, { "epoch": 0.24286419281404714, "grad_norm": 7.304047484077455, "learning_rate": 1.9565202410299415e-05, "loss": 1.0329, "step": 1907 }, { "epoch": 0.24299154687425378, "grad_norm": 6.860041766248072, "learning_rate": 1.9564600647234628e-05, "loss": 0.9587, "step": 1908 }, { "epoch": 0.24311890093446042, "grad_norm": 6.482658729007516, "learning_rate": 1.9563998477302604e-05, "loss": 0.9059, "step": 1909 }, { "epoch": 0.24324625499466704, "grad_norm": 6.916423778022661, "learning_rate": 1.9563395900528956e-05, "loss": 0.8524, "step": 1910 }, { "epoch": 0.24337360905487368, "grad_norm": 4.723823699658888, "learning_rate": 1.956279291693931e-05, "loss": 1.0333, "step": 1911 }, { "epoch": 0.24350096311508032, "grad_norm": 8.42233790326866, "learning_rate": 1.9562189526559333e-05, "loss": 0.9848, "step": 1912 }, { "epoch": 0.24362831717528693, "grad_norm": 4.949374635284291, "learning_rate": 1.9561585729414675e-05, "loss": 0.9388, "step": 1913 }, { "epoch": 0.24375567123549358, "grad_norm": 5.121845297875669, "learning_rate": 1.9560981525531027e-05, "loss": 1.0829, "step": 1914 }, { "epoch": 0.24388302529570022, "grad_norm": 4.631129559020309, "learning_rate": 1.9560376914934098e-05, "loss": 0.8969, "step": 1915 }, { "epoch": 0.24401037935590683, "grad_norm": 5.316361403994358, "learning_rate": 1.9559771897649592e-05, "loss": 1.0067, "step": 1916 }, { "epoch": 0.24413773341611347, "grad_norm": 6.211563635400579, "learning_rate": 1.9559166473703265e-05, "loss": 0.8899, "step": 1917 }, { "epoch": 0.24426508747632011, "grad_norm": 6.407445284356433, "learning_rate": 1.9558560643120855e-05, "loss": 0.951, "step": 1918 }, { "epoch": 0.24439244153652673, "grad_norm": 5.549064784254947, "learning_rate": 1.9557954405928142e-05, "loss": 0.967, "step": 1919 }, { "epoch": 0.24451979559673337, "grad_norm": 5.021470320988695, "learning_rate": 1.955734776215091e-05, "loss": 1.0486, "step": 1920 }, { "epoch": 0.24464714965694, "grad_norm": 6.492243026330688, "learning_rate": 1.955674071181497e-05, "loss": 0.9918, "step": 1921 }, { "epoch": 0.24477450371714662, "grad_norm": 6.736913640652619, "learning_rate": 1.955613325494614e-05, "loss": 0.9308, "step": 1922 }, { "epoch": 0.24490185777735327, "grad_norm": 5.582164850898567, "learning_rate": 1.955552539157026e-05, "loss": 0.9773, "step": 1923 }, { "epoch": 0.2450292118375599, "grad_norm": 4.496928968496731, "learning_rate": 1.9554917121713198e-05, "loss": 1.0006, "step": 1924 }, { "epoch": 0.24515656589776652, "grad_norm": 4.202497811302691, "learning_rate": 1.955430844540082e-05, "loss": 0.9419, "step": 1925 }, { "epoch": 0.24528391995797316, "grad_norm": 7.195137013501995, "learning_rate": 1.9553699362659016e-05, "loss": 0.9848, "step": 1926 }, { "epoch": 0.2454112740181798, "grad_norm": 5.762608511807093, "learning_rate": 1.9553089873513702e-05, "loss": 0.9671, "step": 1927 }, { "epoch": 0.24553862807838642, "grad_norm": 6.201908174791442, "learning_rate": 1.9552479977990802e-05, "loss": 0.9007, "step": 1928 }, { "epoch": 0.24566598213859306, "grad_norm": 6.22489587168181, "learning_rate": 1.955186967611626e-05, "loss": 0.9826, "step": 1929 }, { "epoch": 0.2457933361987997, "grad_norm": 7.069124878674373, "learning_rate": 1.955125896791604e-05, "loss": 1.1846, "step": 1930 }, { "epoch": 0.2459206902590063, "grad_norm": 5.212998927546745, "learning_rate": 1.955064785341612e-05, "loss": 0.8786, "step": 1931 }, { "epoch": 0.24604804431921296, "grad_norm": 4.894650103929042, "learning_rate": 1.9550036332642496e-05, "loss": 0.9747, "step": 1932 }, { "epoch": 0.2461753983794196, "grad_norm": 6.111738523392531, "learning_rate": 1.954942440562118e-05, "loss": 0.8886, "step": 1933 }, { "epoch": 0.2463027524396262, "grad_norm": 5.085283416081644, "learning_rate": 1.9548812072378208e-05, "loss": 1.0079, "step": 1934 }, { "epoch": 0.24643010649983285, "grad_norm": 7.121943838967712, "learning_rate": 1.954819933293962e-05, "loss": 1.0411, "step": 1935 }, { "epoch": 0.2465574605600395, "grad_norm": 4.765921361494564, "learning_rate": 1.954758618733148e-05, "loss": 0.978, "step": 1936 }, { "epoch": 0.2466848146202461, "grad_norm": 5.157511608796732, "learning_rate": 1.954697263557988e-05, "loss": 0.9591, "step": 1937 }, { "epoch": 0.24681216868045275, "grad_norm": 4.500125035562163, "learning_rate": 1.9546358677710917e-05, "loss": 0.9751, "step": 1938 }, { "epoch": 0.24693952274065936, "grad_norm": 5.388541794880216, "learning_rate": 1.9545744313750706e-05, "loss": 0.9856, "step": 1939 }, { "epoch": 0.247066876800866, "grad_norm": 5.033008339265417, "learning_rate": 1.954512954372538e-05, "loss": 0.9379, "step": 1940 }, { "epoch": 0.24719423086107264, "grad_norm": 5.048770702042888, "learning_rate": 1.954451436766109e-05, "loss": 1.0365, "step": 1941 }, { "epoch": 0.24732158492127926, "grad_norm": 5.618927623754484, "learning_rate": 1.954389878558401e-05, "loss": 0.9645, "step": 1942 }, { "epoch": 0.2474489389814859, "grad_norm": 6.407757539845989, "learning_rate": 1.954328279752032e-05, "loss": 1.1237, "step": 1943 }, { "epoch": 0.24757629304169254, "grad_norm": 6.006274121858267, "learning_rate": 1.9542666403496232e-05, "loss": 1.0167, "step": 1944 }, { "epoch": 0.24770364710189915, "grad_norm": 5.895080985693145, "learning_rate": 1.954204960353796e-05, "loss": 1.0035, "step": 1945 }, { "epoch": 0.2478310011621058, "grad_norm": 4.438704206380312, "learning_rate": 1.954143239767174e-05, "loss": 0.972, "step": 1946 }, { "epoch": 0.24795835522231244, "grad_norm": 4.267205808272698, "learning_rate": 1.9540814785923832e-05, "loss": 1.0307, "step": 1947 }, { "epoch": 0.24808570928251905, "grad_norm": 5.4795965260572705, "learning_rate": 1.954019676832051e-05, "loss": 0.9458, "step": 1948 }, { "epoch": 0.2482130633427257, "grad_norm": 5.286442839000025, "learning_rate": 1.9539578344888057e-05, "loss": 0.9832, "step": 1949 }, { "epoch": 0.24834041740293233, "grad_norm": 5.362946768691932, "learning_rate": 1.9538959515652786e-05, "loss": 1.0164, "step": 1950 }, { "epoch": 0.24846777146313895, "grad_norm": 5.207804457197322, "learning_rate": 1.9538340280641018e-05, "loss": 0.9356, "step": 1951 }, { "epoch": 0.2485951255233456, "grad_norm": 6.356636376204565, "learning_rate": 1.9537720639879096e-05, "loss": 1.0118, "step": 1952 }, { "epoch": 0.24872247958355223, "grad_norm": 4.992868296448806, "learning_rate": 1.953710059339338e-05, "loss": 1.0178, "step": 1953 }, { "epoch": 0.24884983364375884, "grad_norm": 5.213643939691587, "learning_rate": 1.9536480141210242e-05, "loss": 0.9866, "step": 1954 }, { "epoch": 0.24897718770396549, "grad_norm": 4.929728899266538, "learning_rate": 1.953585928335608e-05, "loss": 1.0418, "step": 1955 }, { "epoch": 0.24910454176417213, "grad_norm": 6.206567841810802, "learning_rate": 1.95352380198573e-05, "loss": 1.0212, "step": 1956 }, { "epoch": 0.24923189582437874, "grad_norm": 4.618441080749805, "learning_rate": 1.9534616350740336e-05, "loss": 1.0703, "step": 1957 }, { "epoch": 0.24935924988458538, "grad_norm": 9.527918699338851, "learning_rate": 1.953399427603163e-05, "loss": 1.0498, "step": 1958 }, { "epoch": 0.24948660394479202, "grad_norm": 4.475756507734566, "learning_rate": 1.953337179575764e-05, "loss": 0.9581, "step": 1959 }, { "epoch": 0.24961395800499864, "grad_norm": 5.777107362895541, "learning_rate": 1.953274890994485e-05, "loss": 1.068, "step": 1960 }, { "epoch": 0.24974131206520528, "grad_norm": 5.472723688857767, "learning_rate": 1.953212561861976e-05, "loss": 1.0794, "step": 1961 }, { "epoch": 0.24986866612541192, "grad_norm": 4.63711363630185, "learning_rate": 1.9531501921808877e-05, "loss": 0.9387, "step": 1962 }, { "epoch": 0.24999602018561853, "grad_norm": 5.787593845735321, "learning_rate": 1.9530877819538736e-05, "loss": 0.9188, "step": 1963 }, { "epoch": 0.25012337424582515, "grad_norm": 5.269182491991631, "learning_rate": 1.9530253311835884e-05, "loss": 0.9789, "step": 1964 }, { "epoch": 0.2502507283060318, "grad_norm": 4.672931462385088, "learning_rate": 1.9529628398726892e-05, "loss": 1.0085, "step": 1965 }, { "epoch": 0.25037808236623843, "grad_norm": 5.314662358638423, "learning_rate": 1.9529003080238337e-05, "loss": 0.9308, "step": 1966 }, { "epoch": 0.25050543642644507, "grad_norm": 4.658522083550714, "learning_rate": 1.9528377356396825e-05, "loss": 0.9868, "step": 1967 }, { "epoch": 0.2506327904866517, "grad_norm": 4.732259850713794, "learning_rate": 1.9527751227228964e-05, "loss": 0.9688, "step": 1968 }, { "epoch": 0.25076014454685835, "grad_norm": 4.549413437815234, "learning_rate": 1.95271246927614e-05, "loss": 0.9979, "step": 1969 }, { "epoch": 0.25088749860706494, "grad_norm": 5.563897118149959, "learning_rate": 1.9526497753020776e-05, "loss": 0.9984, "step": 1970 }, { "epoch": 0.2510148526672716, "grad_norm": 4.3159957207206885, "learning_rate": 1.952587040803377e-05, "loss": 1.0247, "step": 1971 }, { "epoch": 0.2511422067274782, "grad_norm": 3.989694432769271, "learning_rate": 1.9525242657827063e-05, "loss": 0.981, "step": 1972 }, { "epoch": 0.25126956078768486, "grad_norm": 4.9109895980746545, "learning_rate": 1.9524614502427358e-05, "loss": 1.0609, "step": 1973 }, { "epoch": 0.2513969148478915, "grad_norm": 6.58293704357173, "learning_rate": 1.9523985941861376e-05, "loss": 0.9695, "step": 1974 }, { "epoch": 0.25152426890809815, "grad_norm": 5.057107950233266, "learning_rate": 1.952335697615586e-05, "loss": 0.9499, "step": 1975 }, { "epoch": 0.25165162296830473, "grad_norm": 5.667578717304745, "learning_rate": 1.952272760533756e-05, "loss": 1.0221, "step": 1976 }, { "epoch": 0.2517789770285114, "grad_norm": 7.079981410977373, "learning_rate": 1.9522097829433252e-05, "loss": 0.9485, "step": 1977 }, { "epoch": 0.251906331088718, "grad_norm": 5.338151642733225, "learning_rate": 1.9521467648469728e-05, "loss": 0.9187, "step": 1978 }, { "epoch": 0.25203368514892466, "grad_norm": 5.3896529496096806, "learning_rate": 1.9520837062473788e-05, "loss": 0.9913, "step": 1979 }, { "epoch": 0.2521610392091313, "grad_norm": 6.123413503278127, "learning_rate": 1.9520206071472264e-05, "loss": 0.9032, "step": 1980 }, { "epoch": 0.25228839326933794, "grad_norm": 5.591531153636707, "learning_rate": 1.9519574675491995e-05, "loss": 0.9196, "step": 1981 }, { "epoch": 0.2524157473295445, "grad_norm": 7.5871264295038205, "learning_rate": 1.951894287455984e-05, "loss": 0.9486, "step": 1982 }, { "epoch": 0.25254310138975117, "grad_norm": 5.71256743209739, "learning_rate": 1.951831066870267e-05, "loss": 0.9122, "step": 1983 }, { "epoch": 0.2526704554499578, "grad_norm": 5.781656446861163, "learning_rate": 1.9517678057947385e-05, "loss": 0.8943, "step": 1984 }, { "epoch": 0.25279780951016445, "grad_norm": 5.738200478605293, "learning_rate": 1.9517045042320893e-05, "loss": 0.8755, "step": 1985 }, { "epoch": 0.2529251635703711, "grad_norm": 6.186663946830765, "learning_rate": 1.951641162185012e-05, "loss": 1.0455, "step": 1986 }, { "epoch": 0.25305251763057773, "grad_norm": 4.660019136315867, "learning_rate": 1.9515777796562016e-05, "loss": 0.9795, "step": 1987 }, { "epoch": 0.2531798716907843, "grad_norm": 4.994360222611413, "learning_rate": 1.951514356648354e-05, "loss": 0.9546, "step": 1988 }, { "epoch": 0.25330722575099096, "grad_norm": 5.91407604543528, "learning_rate": 1.951450893164167e-05, "loss": 0.99, "step": 1989 }, { "epoch": 0.2534345798111976, "grad_norm": 5.697067429917377, "learning_rate": 1.9513873892063403e-05, "loss": 0.9586, "step": 1990 }, { "epoch": 0.25356193387140424, "grad_norm": 5.019744037318567, "learning_rate": 1.9513238447775757e-05, "loss": 0.9406, "step": 1991 }, { "epoch": 0.2536892879316109, "grad_norm": 5.232808621741993, "learning_rate": 1.951260259880576e-05, "loss": 0.9392, "step": 1992 }, { "epoch": 0.2538166419918175, "grad_norm": 6.66558555738008, "learning_rate": 1.9511966345180457e-05, "loss": 0.9115, "step": 1993 }, { "epoch": 0.2539439960520241, "grad_norm": 4.94620991499485, "learning_rate": 1.9511329686926922e-05, "loss": 1.0263, "step": 1994 }, { "epoch": 0.25407135011223075, "grad_norm": 5.9253145158562175, "learning_rate": 1.951069262407223e-05, "loss": 1.0708, "step": 1995 }, { "epoch": 0.2541987041724374, "grad_norm": 4.381045061615913, "learning_rate": 1.9510055156643485e-05, "loss": 0.8709, "step": 1996 }, { "epoch": 0.25432605823264404, "grad_norm": 7.938926268266305, "learning_rate": 1.95094172846678e-05, "loss": 1.0671, "step": 1997 }, { "epoch": 0.2544534122928507, "grad_norm": 7.953716143689362, "learning_rate": 1.9508779008172314e-05, "loss": 0.9639, "step": 1998 }, { "epoch": 0.2545807663530573, "grad_norm": 7.371930016817276, "learning_rate": 1.950814032718418e-05, "loss": 0.9721, "step": 1999 }, { "epoch": 0.2547081204132639, "grad_norm": 5.248665594633744, "learning_rate": 1.9507501241730557e-05, "loss": 0.9776, "step": 2000 } ], "logging_steps": 1.0, "max_steps": 15704, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.315219424473042e+22, "train_batch_size": 2, "trial_name": null, "trial_params": null }