{ "best_metric": 0.8285140562248996, "best_model_checkpoint": "/scratch/camembertv2/runs/results/xnli/camembertv2-base-bf16-p2-17000/max_seq_length-160-gradient_accumulation_steps-4-precision-fp32-learning_rate-1e-05-epochs-10-lr_scheduler-cosine-warmup_steps-0.1/SEED-666/checkpoint-61360", "epoch": 10.0, "eval_steps": 500, "global_step": 122720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008148631029986962, "grad_norm": 12.58836841583252, "learning_rate": 8.148631029986963e-08, "loss": 1.1012, "step": 100 }, { "epoch": 0.016297262059973925, "grad_norm": 1.359410285949707, "learning_rate": 1.6297262059973925e-07, "loss": 1.1011, "step": 200 }, { "epoch": 0.024445893089960886, "grad_norm": 1.128892183303833, "learning_rate": 2.4445893089960885e-07, "loss": 1.0978, "step": 300 }, { "epoch": 0.03259452411994785, "grad_norm": 1.3794234991073608, "learning_rate": 3.259452411994785e-07, "loss": 1.0999, "step": 400 }, { "epoch": 0.04074315514993481, "grad_norm": 1.3247599601745605, "learning_rate": 4.0743155149934816e-07, "loss": 1.0984, "step": 500 }, { "epoch": 0.04889178617992177, "grad_norm": 0.9611015319824219, "learning_rate": 4.889178617992177e-07, "loss": 1.1001, "step": 600 }, { "epoch": 0.05704041720990873, "grad_norm": 0.9682479500770569, "learning_rate": 5.704041720990874e-07, "loss": 1.0985, "step": 700 }, { "epoch": 0.0651890482398957, "grad_norm": 1.950333833694458, "learning_rate": 6.51890482398957e-07, "loss": 1.0989, "step": 800 }, { "epoch": 0.07333767926988266, "grad_norm": 1.4916733503341675, "learning_rate": 7.333767926988267e-07, "loss": 1.0964, "step": 900 }, { "epoch": 0.08148631029986962, "grad_norm": 1.1135200262069702, "learning_rate": 8.148631029986963e-07, "loss": 1.096, "step": 1000 }, { "epoch": 0.08963494132985658, "grad_norm": 1.773497462272644, "learning_rate": 8.963494132985659e-07, "loss": 1.094, "step": 1100 }, { "epoch": 0.09778357235984354, "grad_norm": 1.5511926412582397, "learning_rate": 9.778357235984354e-07, "loss": 1.093, "step": 1200 }, { "epoch": 0.1059322033898305, "grad_norm": 1.389298915863037, "learning_rate": 1.059322033898305e-06, "loss": 1.0871, "step": 1300 }, { "epoch": 0.11408083441981746, "grad_norm": 2.486689329147339, "learning_rate": 1.1408083441981747e-06, "loss": 1.0751, "step": 1400 }, { "epoch": 0.12222946544980444, "grad_norm": 2.697650194168091, "learning_rate": 1.2222946544980446e-06, "loss": 1.0505, "step": 1500 }, { "epoch": 0.1303780964797914, "grad_norm": 3.557525157928467, "learning_rate": 1.303780964797914e-06, "loss": 1.0393, "step": 1600 }, { "epoch": 0.13852672750977835, "grad_norm": 4.691379070281982, "learning_rate": 1.3852672750977837e-06, "loss": 1.0147, "step": 1700 }, { "epoch": 0.14667535853976532, "grad_norm": 5.234630107879639, "learning_rate": 1.4667535853976533e-06, "loss": 0.9971, "step": 1800 }, { "epoch": 0.15482398956975227, "grad_norm": 6.027713298797607, "learning_rate": 1.5482398956975228e-06, "loss": 1.0007, "step": 1900 }, { "epoch": 0.16297262059973924, "grad_norm": 13.33498477935791, "learning_rate": 1.6297262059973926e-06, "loss": 0.984, "step": 2000 }, { "epoch": 0.17112125162972622, "grad_norm": 9.432430267333984, "learning_rate": 1.7112125162972623e-06, "loss": 0.9633, "step": 2100 }, { "epoch": 0.17926988265971316, "grad_norm": 7.303864479064941, "learning_rate": 1.7926988265971317e-06, "loss": 0.9463, "step": 2200 }, { "epoch": 0.18741851368970014, "grad_norm": 6.125274181365967, "learning_rate": 1.8741851368970016e-06, "loss": 0.9336, "step": 2300 }, { "epoch": 0.19556714471968709, "grad_norm": 6.614850044250488, "learning_rate": 1.955671447196871e-06, "loss": 0.9388, "step": 2400 }, { "epoch": 0.20371577574967406, "grad_norm": 7.883510589599609, "learning_rate": 2.037157757496741e-06, "loss": 0.9122, "step": 2500 }, { "epoch": 0.211864406779661, "grad_norm": 6.615538597106934, "learning_rate": 2.11864406779661e-06, "loss": 0.8907, "step": 2600 }, { "epoch": 0.22001303780964798, "grad_norm": 6.040781021118164, "learning_rate": 2.20013037809648e-06, "loss": 0.8725, "step": 2700 }, { "epoch": 0.22816166883963493, "grad_norm": 9.688776016235352, "learning_rate": 2.2816166883963494e-06, "loss": 0.8674, "step": 2800 }, { "epoch": 0.2363102998696219, "grad_norm": 15.747467994689941, "learning_rate": 2.363102998696219e-06, "loss": 0.8199, "step": 2900 }, { "epoch": 0.24445893089960888, "grad_norm": 9.381732940673828, "learning_rate": 2.444589308996089e-06, "loss": 0.831, "step": 3000 }, { "epoch": 0.2526075619295958, "grad_norm": 8.603889465332031, "learning_rate": 2.5260756192959584e-06, "loss": 0.811, "step": 3100 }, { "epoch": 0.2607561929595828, "grad_norm": 11.614546775817871, "learning_rate": 2.607561929595828e-06, "loss": 0.789, "step": 3200 }, { "epoch": 0.2689048239895698, "grad_norm": 7.733945846557617, "learning_rate": 2.689048239895698e-06, "loss": 0.7947, "step": 3300 }, { "epoch": 0.2770534550195567, "grad_norm": 14.573506355285645, "learning_rate": 2.7705345501955674e-06, "loss": 0.7913, "step": 3400 }, { "epoch": 0.28520208604954367, "grad_norm": 11.938140869140625, "learning_rate": 2.852020860495437e-06, "loss": 0.793, "step": 3500 }, { "epoch": 0.29335071707953064, "grad_norm": 9.235187530517578, "learning_rate": 2.9335071707953067e-06, "loss": 0.7538, "step": 3600 }, { "epoch": 0.3014993481095176, "grad_norm": 9.092159271240234, "learning_rate": 3.0149934810951763e-06, "loss": 0.7547, "step": 3700 }, { "epoch": 0.30964797913950454, "grad_norm": 11.72921371459961, "learning_rate": 3.0964797913950456e-06, "loss": 0.7461, "step": 3800 }, { "epoch": 0.3177966101694915, "grad_norm": 15.118708610534668, "learning_rate": 3.1779661016949152e-06, "loss": 0.7171, "step": 3900 }, { "epoch": 0.3259452411994785, "grad_norm": 17.719839096069336, "learning_rate": 3.2594524119947853e-06, "loss": 0.7027, "step": 4000 }, { "epoch": 0.33409387222946546, "grad_norm": 10.063789367675781, "learning_rate": 3.340938722294655e-06, "loss": 0.7229, "step": 4100 }, { "epoch": 0.34224250325945244, "grad_norm": 8.052227020263672, "learning_rate": 3.4224250325945246e-06, "loss": 0.7218, "step": 4200 }, { "epoch": 0.35039113428943935, "grad_norm": 9.68342399597168, "learning_rate": 3.503911342894394e-06, "loss": 0.6873, "step": 4300 }, { "epoch": 0.35853976531942633, "grad_norm": 9.140670776367188, "learning_rate": 3.5853976531942635e-06, "loss": 0.702, "step": 4400 }, { "epoch": 0.3666883963494133, "grad_norm": 8.805059432983398, "learning_rate": 3.666883963494133e-06, "loss": 0.7245, "step": 4500 }, { "epoch": 0.3748370273794003, "grad_norm": 7.228201389312744, "learning_rate": 3.748370273794003e-06, "loss": 0.6651, "step": 4600 }, { "epoch": 0.3829856584093872, "grad_norm": 8.284133911132812, "learning_rate": 3.829856584093872e-06, "loss": 0.6956, "step": 4700 }, { "epoch": 0.39113428943937417, "grad_norm": 8.938249588012695, "learning_rate": 3.911342894393742e-06, "loss": 0.6777, "step": 4800 }, { "epoch": 0.39928292046936115, "grad_norm": 10.810254096984863, "learning_rate": 3.992829204693612e-06, "loss": 0.6803, "step": 4900 }, { "epoch": 0.4074315514993481, "grad_norm": 11.629922866821289, "learning_rate": 4.074315514993482e-06, "loss": 0.6659, "step": 5000 }, { "epoch": 0.4155801825293351, "grad_norm": 7.82265043258667, "learning_rate": 4.1558018252933515e-06, "loss": 0.6842, "step": 5100 }, { "epoch": 0.423728813559322, "grad_norm": 9.290712356567383, "learning_rate": 4.23728813559322e-06, "loss": 0.6711, "step": 5200 }, { "epoch": 0.431877444589309, "grad_norm": 10.643411636352539, "learning_rate": 4.31877444589309e-06, "loss": 0.6521, "step": 5300 }, { "epoch": 0.44002607561929596, "grad_norm": 8.533503532409668, "learning_rate": 4.40026075619296e-06, "loss": 0.6613, "step": 5400 }, { "epoch": 0.44817470664928294, "grad_norm": 12.260805130004883, "learning_rate": 4.48174706649283e-06, "loss": 0.6512, "step": 5500 }, { "epoch": 0.45632333767926986, "grad_norm": 7.977556228637695, "learning_rate": 4.563233376792699e-06, "loss": 0.6499, "step": 5600 }, { "epoch": 0.46447196870925683, "grad_norm": 7.418649673461914, "learning_rate": 4.6447196870925686e-06, "loss": 0.6591, "step": 5700 }, { "epoch": 0.4726205997392438, "grad_norm": 10.594202995300293, "learning_rate": 4.726205997392438e-06, "loss": 0.6497, "step": 5800 }, { "epoch": 0.4807692307692308, "grad_norm": 11.133523941040039, "learning_rate": 4.807692307692308e-06, "loss": 0.6538, "step": 5900 }, { "epoch": 0.48891786179921776, "grad_norm": 12.108560562133789, "learning_rate": 4.889178617992178e-06, "loss": 0.6195, "step": 6000 }, { "epoch": 0.4970664928292047, "grad_norm": 9.70545482635498, "learning_rate": 4.970664928292047e-06, "loss": 0.6351, "step": 6100 }, { "epoch": 0.5052151238591917, "grad_norm": 12.699902534484863, "learning_rate": 5.052151238591917e-06, "loss": 0.6557, "step": 6200 }, { "epoch": 0.5133637548891786, "grad_norm": 10.324420928955078, "learning_rate": 5.1336375488917865e-06, "loss": 0.6415, "step": 6300 }, { "epoch": 0.5215123859191656, "grad_norm": 10.3858642578125, "learning_rate": 5.215123859191656e-06, "loss": 0.624, "step": 6400 }, { "epoch": 0.5296610169491526, "grad_norm": 13.573092460632324, "learning_rate": 5.296610169491526e-06, "loss": 0.6622, "step": 6500 }, { "epoch": 0.5378096479791395, "grad_norm": 8.366503715515137, "learning_rate": 5.378096479791396e-06, "loss": 0.6166, "step": 6600 }, { "epoch": 0.5459582790091264, "grad_norm": 6.413454532623291, "learning_rate": 5.459582790091264e-06, "loss": 0.6315, "step": 6700 }, { "epoch": 0.5541069100391134, "grad_norm": 7.670026779174805, "learning_rate": 5.541069100391135e-06, "loss": 0.612, "step": 6800 }, { "epoch": 0.5622555410691004, "grad_norm": 10.53145694732666, "learning_rate": 5.622555410691004e-06, "loss": 0.6167, "step": 6900 }, { "epoch": 0.5704041720990873, "grad_norm": 6.5404462814331055, "learning_rate": 5.704041720990874e-06, "loss": 0.6226, "step": 7000 }, { "epoch": 0.5785528031290743, "grad_norm": 9.084834098815918, "learning_rate": 5.785528031290744e-06, "loss": 0.6214, "step": 7100 }, { "epoch": 0.5867014341590613, "grad_norm": 9.231087684631348, "learning_rate": 5.867014341590613e-06, "loss": 0.6245, "step": 7200 }, { "epoch": 0.5948500651890483, "grad_norm": 8.526376724243164, "learning_rate": 5.948500651890483e-06, "loss": 0.6205, "step": 7300 }, { "epoch": 0.6029986962190352, "grad_norm": 9.337794303894043, "learning_rate": 6.029986962190353e-06, "loss": 0.6156, "step": 7400 }, { "epoch": 0.6111473272490222, "grad_norm": 8.846671104431152, "learning_rate": 6.111473272490222e-06, "loss": 0.6142, "step": 7500 }, { "epoch": 0.6192959582790091, "grad_norm": 8.68179988861084, "learning_rate": 6.192959582790091e-06, "loss": 0.6218, "step": 7600 }, { "epoch": 0.627444589308996, "grad_norm": 9.76940631866455, "learning_rate": 6.274445893089961e-06, "loss": 0.587, "step": 7700 }, { "epoch": 0.635593220338983, "grad_norm": 7.811220169067383, "learning_rate": 6.3559322033898304e-06, "loss": 0.6002, "step": 7800 }, { "epoch": 0.64374185136897, "grad_norm": 8.950928688049316, "learning_rate": 6.4374185136897e-06, "loss": 0.6032, "step": 7900 }, { "epoch": 0.651890482398957, "grad_norm": 6.704097270965576, "learning_rate": 6.518904823989571e-06, "loss": 0.5993, "step": 8000 }, { "epoch": 0.660039113428944, "grad_norm": 11.18411922454834, "learning_rate": 6.60039113428944e-06, "loss": 0.6035, "step": 8100 }, { "epoch": 0.6681877444589309, "grad_norm": 8.417338371276855, "learning_rate": 6.68187744458931e-06, "loss": 0.624, "step": 8200 }, { "epoch": 0.6763363754889179, "grad_norm": 9.916496276855469, "learning_rate": 6.7633637548891795e-06, "loss": 0.6275, "step": 8300 }, { "epoch": 0.6844850065189049, "grad_norm": 8.701171875, "learning_rate": 6.844850065189049e-06, "loss": 0.5773, "step": 8400 }, { "epoch": 0.6926336375488917, "grad_norm": 10.245955467224121, "learning_rate": 6.926336375488918e-06, "loss": 0.6139, "step": 8500 }, { "epoch": 0.7007822685788787, "grad_norm": 6.190640926361084, "learning_rate": 7.007822685788788e-06, "loss": 0.5833, "step": 8600 }, { "epoch": 0.7089308996088657, "grad_norm": 10.875850677490234, "learning_rate": 7.089308996088657e-06, "loss": 0.6, "step": 8700 }, { "epoch": 0.7170795306388527, "grad_norm": 8.644452095031738, "learning_rate": 7.170795306388527e-06, "loss": 0.6097, "step": 8800 }, { "epoch": 0.7252281616688396, "grad_norm": 8.089356422424316, "learning_rate": 7.252281616688397e-06, "loss": 0.583, "step": 8900 }, { "epoch": 0.7333767926988266, "grad_norm": 12.513883590698242, "learning_rate": 7.333767926988266e-06, "loss": 0.5669, "step": 9000 }, { "epoch": 0.7415254237288136, "grad_norm": 9.404706001281738, "learning_rate": 7.415254237288137e-06, "loss": 0.5833, "step": 9100 }, { "epoch": 0.7496740547588006, "grad_norm": 6.789037227630615, "learning_rate": 7.496740547588006e-06, "loss": 0.5985, "step": 9200 }, { "epoch": 0.7578226857887875, "grad_norm": 7.355409145355225, "learning_rate": 7.578226857887876e-06, "loss": 0.5686, "step": 9300 }, { "epoch": 0.7659713168187744, "grad_norm": 7.175694465637207, "learning_rate": 7.659713168187744e-06, "loss": 0.5991, "step": 9400 }, { "epoch": 0.7741199478487614, "grad_norm": 6.2600274085998535, "learning_rate": 7.741199478487615e-06, "loss": 0.5803, "step": 9500 }, { "epoch": 0.7822685788787483, "grad_norm": 11.514883995056152, "learning_rate": 7.822685788787483e-06, "loss": 0.5802, "step": 9600 }, { "epoch": 0.7904172099087353, "grad_norm": 6.594653129577637, "learning_rate": 7.904172099087354e-06, "loss": 0.5772, "step": 9700 }, { "epoch": 0.7985658409387223, "grad_norm": 10.59202766418457, "learning_rate": 7.985658409387224e-06, "loss": 0.5848, "step": 9800 }, { "epoch": 0.8067144719687093, "grad_norm": 7.8735151290893555, "learning_rate": 8.067144719687093e-06, "loss": 0.5813, "step": 9900 }, { "epoch": 0.8148631029986962, "grad_norm": 9.064979553222656, "learning_rate": 8.148631029986964e-06, "loss": 0.5792, "step": 10000 }, { "epoch": 0.8230117340286832, "grad_norm": 10.0288667678833, "learning_rate": 8.230117340286832e-06, "loss": 0.5622, "step": 10100 }, { "epoch": 0.8311603650586702, "grad_norm": 8.7724609375, "learning_rate": 8.311603650586703e-06, "loss": 0.5767, "step": 10200 }, { "epoch": 0.8393089960886571, "grad_norm": 8.127886772155762, "learning_rate": 8.393089960886572e-06, "loss": 0.5721, "step": 10300 }, { "epoch": 0.847457627118644, "grad_norm": 7.77069616317749, "learning_rate": 8.47457627118644e-06, "loss": 0.5925, "step": 10400 }, { "epoch": 0.855606258148631, "grad_norm": 7.864415645599365, "learning_rate": 8.556062581486311e-06, "loss": 0.5805, "step": 10500 }, { "epoch": 0.863754889178618, "grad_norm": 7.0319952964782715, "learning_rate": 8.63754889178618e-06, "loss": 0.577, "step": 10600 }, { "epoch": 0.871903520208605, "grad_norm": 7.513912677764893, "learning_rate": 8.71903520208605e-06, "loss": 0.5978, "step": 10700 }, { "epoch": 0.8800521512385919, "grad_norm": 8.28197193145752, "learning_rate": 8.80052151238592e-06, "loss": 0.5912, "step": 10800 }, { "epoch": 0.8882007822685789, "grad_norm": 7.632150650024414, "learning_rate": 8.88200782268579e-06, "loss": 0.5706, "step": 10900 }, { "epoch": 0.8963494132985659, "grad_norm": 7.691524028778076, "learning_rate": 8.96349413298566e-06, "loss": 0.5612, "step": 11000 }, { "epoch": 0.9044980443285529, "grad_norm": 8.549062728881836, "learning_rate": 9.044980443285529e-06, "loss": 0.5494, "step": 11100 }, { "epoch": 0.9126466753585397, "grad_norm": 10.64492416381836, "learning_rate": 9.126466753585398e-06, "loss": 0.5629, "step": 11200 }, { "epoch": 0.9207953063885267, "grad_norm": 7.610856056213379, "learning_rate": 9.207953063885268e-06, "loss": 0.5627, "step": 11300 }, { "epoch": 0.9289439374185137, "grad_norm": 10.41044807434082, "learning_rate": 9.289439374185137e-06, "loss": 0.5756, "step": 11400 }, { "epoch": 0.9370925684485006, "grad_norm": 6.464520454406738, "learning_rate": 9.370925684485008e-06, "loss": 0.5817, "step": 11500 }, { "epoch": 0.9452411994784876, "grad_norm": 12.031845092773438, "learning_rate": 9.452411994784876e-06, "loss": 0.5761, "step": 11600 }, { "epoch": 0.9533898305084746, "grad_norm": 8.345417022705078, "learning_rate": 9.533898305084747e-06, "loss": 0.5789, "step": 11700 }, { "epoch": 0.9615384615384616, "grad_norm": 8.58055305480957, "learning_rate": 9.615384615384616e-06, "loss": 0.5745, "step": 11800 }, { "epoch": 0.9696870925684485, "grad_norm": 5.948461532592773, "learning_rate": 9.696870925684486e-06, "loss": 0.5695, "step": 11900 }, { "epoch": 0.9778357235984355, "grad_norm": 8.523883819580078, "learning_rate": 9.778357235984357e-06, "loss": 0.575, "step": 12000 }, { "epoch": 0.9859843546284224, "grad_norm": 8.530996322631836, "learning_rate": 9.859843546284224e-06, "loss": 0.5496, "step": 12100 }, { "epoch": 0.9941329856584094, "grad_norm": 8.197943687438965, "learning_rate": 9.941329856584094e-06, "loss": 0.5929, "step": 12200 }, { "epoch": 1.0, "eval_accuracy": 0.8028112449799196, "eval_loss": 0.5100582242012024, "eval_runtime": 7.5718, "eval_samples_per_second": 328.853, "eval_steps_per_second": 41.206, "step": 12272 }, { "epoch": 1.0022816166883963, "grad_norm": 8.692102432250977, "learning_rate": 9.999998414230423e-06, "loss": 0.5456, "step": 12300 }, { "epoch": 1.0104302477183833, "grad_norm": 6.663279056549072, "learning_rate": 9.999966860686959e-06, "loss": 0.546, "step": 12400 }, { "epoch": 1.0185788787483703, "grad_norm": 7.53484582901001, "learning_rate": 9.999894854131206e-06, "loss": 0.5182, "step": 12500 }, { "epoch": 1.0267275097783573, "grad_norm": 6.181861877441406, "learning_rate": 9.999782395145752e-06, "loss": 0.5093, "step": 12600 }, { "epoch": 1.0348761408083442, "grad_norm": 9.323958396911621, "learning_rate": 9.999629484640457e-06, "loss": 0.528, "step": 12700 }, { "epoch": 1.0430247718383312, "grad_norm": 10.739737510681152, "learning_rate": 9.999436123852473e-06, "loss": 0.5423, "step": 12800 }, { "epoch": 1.0511734028683182, "grad_norm": 7.653073787689209, "learning_rate": 9.99920231434621e-06, "loss": 0.5215, "step": 12900 }, { "epoch": 1.0593220338983051, "grad_norm": 6.83660888671875, "learning_rate": 9.998928058013346e-06, "loss": 0.5134, "step": 13000 }, { "epoch": 1.0674706649282921, "grad_norm": 10.44430923461914, "learning_rate": 9.99861335707279e-06, "loss": 0.5466, "step": 13100 }, { "epoch": 1.075619295958279, "grad_norm": 7.2710280418396, "learning_rate": 9.998258214070683e-06, "loss": 0.5364, "step": 13200 }, { "epoch": 1.083767926988266, "grad_norm": 5.829804420471191, "learning_rate": 9.997862631880362e-06, "loss": 0.5146, "step": 13300 }, { "epoch": 1.0919165580182528, "grad_norm": 8.52145767211914, "learning_rate": 9.997426613702348e-06, "loss": 0.5105, "step": 13400 }, { "epoch": 1.1000651890482398, "grad_norm": 6.255794525146484, "learning_rate": 9.996950163064313e-06, "loss": 0.532, "step": 13500 }, { "epoch": 1.1082138200782268, "grad_norm": 8.463394165039062, "learning_rate": 9.996433283821057e-06, "loss": 0.5265, "step": 13600 }, { "epoch": 1.1163624511082137, "grad_norm": 9.939913749694824, "learning_rate": 9.995875980154468e-06, "loss": 0.5297, "step": 13700 }, { "epoch": 1.1245110821382007, "grad_norm": 10.322543144226074, "learning_rate": 9.995278256573504e-06, "loss": 0.5413, "step": 13800 }, { "epoch": 1.1326597131681877, "grad_norm": 7.6662445068359375, "learning_rate": 9.994640117914139e-06, "loss": 0.5197, "step": 13900 }, { "epoch": 1.1408083441981747, "grad_norm": 12.555916786193848, "learning_rate": 9.99396156933933e-06, "loss": 0.5472, "step": 14000 }, { "epoch": 1.1489569752281616, "grad_norm": 12.246332168579102, "learning_rate": 9.993242616338983e-06, "loss": 0.5296, "step": 14100 }, { "epoch": 1.1571056062581486, "grad_norm": 11.406452178955078, "learning_rate": 9.992483264729902e-06, "loss": 0.5266, "step": 14200 }, { "epoch": 1.1652542372881356, "grad_norm": 7.620953559875488, "learning_rate": 9.991683520655735e-06, "loss": 0.5267, "step": 14300 }, { "epoch": 1.1734028683181226, "grad_norm": 7.820069789886475, "learning_rate": 9.990843390586938e-06, "loss": 0.5384, "step": 14400 }, { "epoch": 1.1815514993481095, "grad_norm": 8.187140464782715, "learning_rate": 9.989962881320714e-06, "loss": 0.5071, "step": 14500 }, { "epoch": 1.1897001303780965, "grad_norm": 8.322758674621582, "learning_rate": 9.989041999980964e-06, "loss": 0.5342, "step": 14600 }, { "epoch": 1.1978487614080835, "grad_norm": 9.802703857421875, "learning_rate": 9.988080754018218e-06, "loss": 0.5205, "step": 14700 }, { "epoch": 1.2059973924380705, "grad_norm": 9.249838829040527, "learning_rate": 9.987079151209588e-06, "loss": 0.5069, "step": 14800 }, { "epoch": 1.2141460234680574, "grad_norm": 4.855494022369385, "learning_rate": 9.986037199658698e-06, "loss": 0.5107, "step": 14900 }, { "epoch": 1.2222946544980444, "grad_norm": 9.250731468200684, "learning_rate": 9.984954907795619e-06, "loss": 0.5093, "step": 15000 }, { "epoch": 1.2304432855280312, "grad_norm": 5.86234712600708, "learning_rate": 9.983832284376804e-06, "loss": 0.5539, "step": 15100 }, { "epoch": 1.2385919165580184, "grad_norm": 13.074224472045898, "learning_rate": 9.982669338485012e-06, "loss": 0.5248, "step": 15200 }, { "epoch": 1.2467405475880051, "grad_norm": 12.13022518157959, "learning_rate": 9.981466079529236e-06, "loss": 0.5415, "step": 15300 }, { "epoch": 1.254889178617992, "grad_norm": 9.259481430053711, "learning_rate": 9.980222517244633e-06, "loss": 0.5224, "step": 15400 }, { "epoch": 1.263037809647979, "grad_norm": 7.281178951263428, "learning_rate": 9.978938661692439e-06, "loss": 0.5363, "step": 15500 }, { "epoch": 1.271186440677966, "grad_norm": 12.429268836975098, "learning_rate": 9.977614523259884e-06, "loss": 0.5257, "step": 15600 }, { "epoch": 1.279335071707953, "grad_norm": 8.357499122619629, "learning_rate": 9.97625011266012e-06, "loss": 0.5151, "step": 15700 }, { "epoch": 1.28748370273794, "grad_norm": 7.741194725036621, "learning_rate": 9.974845440932121e-06, "loss": 0.4973, "step": 15800 }, { "epoch": 1.295632333767927, "grad_norm": 12.34659481048584, "learning_rate": 9.973400519440605e-06, "loss": 0.5275, "step": 15900 }, { "epoch": 1.303780964797914, "grad_norm": 7.972919940948486, "learning_rate": 9.971915359875935e-06, "loss": 0.5196, "step": 16000 }, { "epoch": 1.311929595827901, "grad_norm": 6.398066520690918, "learning_rate": 9.970389974254025e-06, "loss": 0.5239, "step": 16100 }, { "epoch": 1.320078226857888, "grad_norm": 9.441793441772461, "learning_rate": 9.968824374916245e-06, "loss": 0.5141, "step": 16200 }, { "epoch": 1.3282268578878749, "grad_norm": 8.154695510864258, "learning_rate": 9.967218574529323e-06, "loss": 0.5179, "step": 16300 }, { "epoch": 1.3363754889178618, "grad_norm": 9.219006538391113, "learning_rate": 9.965572586085235e-06, "loss": 0.4859, "step": 16400 }, { "epoch": 1.3445241199478488, "grad_norm": 7.020698070526123, "learning_rate": 9.96388642290111e-06, "loss": 0.5128, "step": 16500 }, { "epoch": 1.3526727509778358, "grad_norm": 7.134260654449463, "learning_rate": 9.96216009861911e-06, "loss": 0.5067, "step": 16600 }, { "epoch": 1.3608213820078228, "grad_norm": 6.663614273071289, "learning_rate": 9.96039362720634e-06, "loss": 0.5352, "step": 16700 }, { "epoch": 1.3689700130378095, "grad_norm": 7.817680358886719, "learning_rate": 9.958587022954704e-06, "loss": 0.5143, "step": 16800 }, { "epoch": 1.3771186440677967, "grad_norm": 8.092264175415039, "learning_rate": 9.956740300480818e-06, "loss": 0.5111, "step": 16900 }, { "epoch": 1.3852672750977835, "grad_norm": 7.305174350738525, "learning_rate": 9.954853474725878e-06, "loss": 0.5432, "step": 17000 }, { "epoch": 1.3934159061277707, "grad_norm": 7.337920188903809, "learning_rate": 9.952926560955547e-06, "loss": 0.5279, "step": 17100 }, { "epoch": 1.4015645371577574, "grad_norm": 8.824036598205566, "learning_rate": 9.950959574759815e-06, "loss": 0.5073, "step": 17200 }, { "epoch": 1.4097131681877444, "grad_norm": 5.825498580932617, "learning_rate": 9.948952532052895e-06, "loss": 0.5208, "step": 17300 }, { "epoch": 1.4178617992177314, "grad_norm": 6.746844291687012, "learning_rate": 9.946905449073077e-06, "loss": 0.5245, "step": 17400 }, { "epoch": 1.4260104302477183, "grad_norm": 9.570401191711426, "learning_rate": 9.944818342382607e-06, "loss": 0.5056, "step": 17500 }, { "epoch": 1.4341590612777053, "grad_norm": 8.143331527709961, "learning_rate": 9.942691228867548e-06, "loss": 0.5066, "step": 17600 }, { "epoch": 1.4423076923076923, "grad_norm": 8.18307113647461, "learning_rate": 9.940524125737641e-06, "loss": 0.4933, "step": 17700 }, { "epoch": 1.4504563233376793, "grad_norm": 9.306159019470215, "learning_rate": 9.938317050526173e-06, "loss": 0.5092, "step": 17800 }, { "epoch": 1.4586049543676662, "grad_norm": 7.026943206787109, "learning_rate": 9.936070021089834e-06, "loss": 0.5071, "step": 17900 }, { "epoch": 1.4667535853976532, "grad_norm": 8.45121955871582, "learning_rate": 9.933783055608562e-06, "loss": 0.5193, "step": 18000 }, { "epoch": 1.4749022164276402, "grad_norm": 5.932709217071533, "learning_rate": 9.93145617258541e-06, "loss": 0.5311, "step": 18100 }, { "epoch": 1.4830508474576272, "grad_norm": 8.077872276306152, "learning_rate": 9.929089390846389e-06, "loss": 0.4887, "step": 18200 }, { "epoch": 1.4911994784876141, "grad_norm": 10.298677444458008, "learning_rate": 9.926682729540313e-06, "loss": 0.5006, "step": 18300 }, { "epoch": 1.4993481095176011, "grad_norm": 7.896773815155029, "learning_rate": 9.924236208138656e-06, "loss": 0.4828, "step": 18400 }, { "epoch": 1.5074967405475879, "grad_norm": 10.591178894042969, "learning_rate": 9.921749846435375e-06, "loss": 0.4936, "step": 18500 }, { "epoch": 1.515645371577575, "grad_norm": 8.356033325195312, "learning_rate": 9.919223664546774e-06, "loss": 0.5271, "step": 18600 }, { "epoch": 1.5237940026075618, "grad_norm": 9.826644897460938, "learning_rate": 9.916657682911317e-06, "loss": 0.5115, "step": 18700 }, { "epoch": 1.531942633637549, "grad_norm": 7.742495536804199, "learning_rate": 9.914051922289482e-06, "loss": 0.5037, "step": 18800 }, { "epoch": 1.5400912646675358, "grad_norm": 6.355010032653809, "learning_rate": 9.91140640376358e-06, "loss": 0.5047, "step": 18900 }, { "epoch": 1.548239895697523, "grad_norm": 11.718524932861328, "learning_rate": 9.908721148737591e-06, "loss": 0.5074, "step": 19000 }, { "epoch": 1.5563885267275097, "grad_norm": 6.173713207244873, "learning_rate": 9.905996178936991e-06, "loss": 0.5367, "step": 19100 }, { "epoch": 1.5645371577574967, "grad_norm": 10.962457656860352, "learning_rate": 9.903231516408576e-06, "loss": 0.4991, "step": 19200 }, { "epoch": 1.5726857887874837, "grad_norm": 6.949578285217285, "learning_rate": 9.900427183520276e-06, "loss": 0.4935, "step": 19300 }, { "epoch": 1.5808344198174706, "grad_norm": 6.240306854248047, "learning_rate": 9.897583202960985e-06, "loss": 0.5136, "step": 19400 }, { "epoch": 1.5889830508474576, "grad_norm": 6.609454154968262, "learning_rate": 9.89469959774037e-06, "loss": 0.4972, "step": 19500 }, { "epoch": 1.5971316818774446, "grad_norm": 8.191039085388184, "learning_rate": 9.891776391188694e-06, "loss": 0.5202, "step": 19600 }, { "epoch": 1.6052803129074316, "grad_norm": 7.624372959136963, "learning_rate": 9.888813606956612e-06, "loss": 0.515, "step": 19700 }, { "epoch": 1.6134289439374185, "grad_norm": 8.45014476776123, "learning_rate": 9.885811269014992e-06, "loss": 0.517, "step": 19800 }, { "epoch": 1.6215775749674055, "grad_norm": 6.690873146057129, "learning_rate": 9.882769401654719e-06, "loss": 0.5153, "step": 19900 }, { "epoch": 1.6297262059973925, "grad_norm": 6.8720808029174805, "learning_rate": 9.879688029486496e-06, "loss": 0.5288, "step": 20000 }, { "epoch": 1.6378748370273795, "grad_norm": 9.76561164855957, "learning_rate": 9.876567177440645e-06, "loss": 0.509, "step": 20100 }, { "epoch": 1.6460234680573662, "grad_norm": 12.810523986816406, "learning_rate": 9.873406870766906e-06, "loss": 0.5144, "step": 20200 }, { "epoch": 1.6541720990873534, "grad_norm": 6.44625997543335, "learning_rate": 9.870207135034235e-06, "loss": 0.5237, "step": 20300 }, { "epoch": 1.6623207301173402, "grad_norm": 9.6302490234375, "learning_rate": 9.86696799613059e-06, "loss": 0.5094, "step": 20400 }, { "epoch": 1.6704693611473274, "grad_norm": 10.308381080627441, "learning_rate": 9.863689480262734e-06, "loss": 0.498, "step": 20500 }, { "epoch": 1.6786179921773141, "grad_norm": 11.594625473022461, "learning_rate": 9.860371613956008e-06, "loss": 0.5224, "step": 20600 }, { "epoch": 1.6867666232073013, "grad_norm": 7.823093414306641, "learning_rate": 9.85701442405413e-06, "loss": 0.515, "step": 20700 }, { "epoch": 1.694915254237288, "grad_norm": 6.978199481964111, "learning_rate": 9.853617937718966e-06, "loss": 0.5103, "step": 20800 }, { "epoch": 1.7030638852672753, "grad_norm": 9.50684928894043, "learning_rate": 9.850182182430322e-06, "loss": 0.4876, "step": 20900 }, { "epoch": 1.711212516297262, "grad_norm": 9.167742729187012, "learning_rate": 9.84670718598571e-06, "loss": 0.521, "step": 21000 }, { "epoch": 1.719361147327249, "grad_norm": 9.103960990905762, "learning_rate": 9.843192976500131e-06, "loss": 0.4987, "step": 21100 }, { "epoch": 1.727509778357236, "grad_norm": 7.777735233306885, "learning_rate": 9.83963958240585e-06, "loss": 0.4838, "step": 21200 }, { "epoch": 1.735658409387223, "grad_norm": 3.7518503665924072, "learning_rate": 9.83604703245215e-06, "loss": 0.5019, "step": 21300 }, { "epoch": 1.74380704041721, "grad_norm": 8.239873886108398, "learning_rate": 9.832415355705118e-06, "loss": 0.5119, "step": 21400 }, { "epoch": 1.7519556714471969, "grad_norm": 7.265876293182373, "learning_rate": 9.828744581547407e-06, "loss": 0.4681, "step": 21500 }, { "epoch": 1.7601043024771839, "grad_norm": 9.064807891845703, "learning_rate": 9.825034739677984e-06, "loss": 0.4737, "step": 21600 }, { "epoch": 1.7682529335071708, "grad_norm": 6.92955207824707, "learning_rate": 9.821285860111903e-06, "loss": 0.4968, "step": 21700 }, { "epoch": 1.7764015645371578, "grad_norm": 10.282632827758789, "learning_rate": 9.817497973180062e-06, "loss": 0.4986, "step": 21800 }, { "epoch": 1.7845501955671446, "grad_norm": 5.6930084228515625, "learning_rate": 9.813671109528949e-06, "loss": 0.5135, "step": 21900 }, { "epoch": 1.7926988265971318, "grad_norm": 6.911000728607178, "learning_rate": 9.809805300120403e-06, "loss": 0.5046, "step": 22000 }, { "epoch": 1.8008474576271185, "grad_norm": 6.411030292510986, "learning_rate": 9.805900576231358e-06, "loss": 0.4926, "step": 22100 }, { "epoch": 1.8089960886571057, "grad_norm": 6.620294570922852, "learning_rate": 9.801956969453592e-06, "loss": 0.4788, "step": 22200 }, { "epoch": 1.8171447196870925, "grad_norm": 6.77543830871582, "learning_rate": 9.797974511693471e-06, "loss": 0.4896, "step": 22300 }, { "epoch": 1.8252933507170797, "grad_norm": 7.471630573272705, "learning_rate": 9.793953235171694e-06, "loss": 0.4979, "step": 22400 }, { "epoch": 1.8334419817470664, "grad_norm": 6.550878524780273, "learning_rate": 9.789893172423021e-06, "loss": 0.5081, "step": 22500 }, { "epoch": 1.8415906127770536, "grad_norm": 9.887825965881348, "learning_rate": 9.78579435629603e-06, "loss": 0.5089, "step": 22600 }, { "epoch": 1.8497392438070404, "grad_norm": 7.088003158569336, "learning_rate": 9.781656819952826e-06, "loss": 0.4811, "step": 22700 }, { "epoch": 1.8578878748370273, "grad_norm": 6.524052619934082, "learning_rate": 9.777480596868796e-06, "loss": 0.5018, "step": 22800 }, { "epoch": 1.8660365058670143, "grad_norm": 7.965360164642334, "learning_rate": 9.773265720832324e-06, "loss": 0.5144, "step": 22900 }, { "epoch": 1.8741851368970013, "grad_norm": 7.510045051574707, "learning_rate": 9.769012225944521e-06, "loss": 0.5002, "step": 23000 }, { "epoch": 1.8823337679269883, "grad_norm": 11.717968940734863, "learning_rate": 9.764720146618955e-06, "loss": 0.5003, "step": 23100 }, { "epoch": 1.8904823989569752, "grad_norm": 5.974288463592529, "learning_rate": 9.760389517581362e-06, "loss": 0.4912, "step": 23200 }, { "epoch": 1.8986310299869622, "grad_norm": 5.159633159637451, "learning_rate": 9.75602037386937e-06, "loss": 0.4861, "step": 23300 }, { "epoch": 1.9067796610169492, "grad_norm": 6.651115417480469, "learning_rate": 9.75161275083222e-06, "loss": 0.5153, "step": 23400 }, { "epoch": 1.9149282920469362, "grad_norm": 7.513479709625244, "learning_rate": 9.747166684130474e-06, "loss": 0.4931, "step": 23500 }, { "epoch": 1.9230769230769231, "grad_norm": 8.77505874633789, "learning_rate": 9.742682209735727e-06, "loss": 0.501, "step": 23600 }, { "epoch": 1.93122555410691, "grad_norm": 6.932135581970215, "learning_rate": 9.738159363930324e-06, "loss": 0.52, "step": 23700 }, { "epoch": 1.9393741851368969, "grad_norm": 10.359477996826172, "learning_rate": 9.73359818330705e-06, "loss": 0.4877, "step": 23800 }, { "epoch": 1.947522816166884, "grad_norm": 8.781031608581543, "learning_rate": 9.72899870476885e-06, "loss": 0.4891, "step": 23900 }, { "epoch": 1.9556714471968708, "grad_norm": 8.263874053955078, "learning_rate": 9.724360965528523e-06, "loss": 0.5061, "step": 24000 }, { "epoch": 1.963820078226858, "grad_norm": 7.766465663909912, "learning_rate": 9.719685003108423e-06, "loss": 0.4902, "step": 24100 }, { "epoch": 1.9719687092568448, "grad_norm": 4.978456974029541, "learning_rate": 9.714970855340152e-06, "loss": 0.4873, "step": 24200 }, { "epoch": 1.980117340286832, "grad_norm": 7.918380260467529, "learning_rate": 9.71021856036426e-06, "loss": 0.4941, "step": 24300 }, { "epoch": 1.9882659713168187, "grad_norm": 8.015583038330078, "learning_rate": 9.705428156629933e-06, "loss": 0.4833, "step": 24400 }, { "epoch": 1.996414602346806, "grad_norm": 7.768013954162598, "learning_rate": 9.700599682894675e-06, "loss": 0.4932, "step": 24500 }, { "epoch": 2.0, "eval_accuracy": 0.8008032128514057, "eval_loss": 0.5220404267311096, "eval_runtime": 6.9482, "eval_samples_per_second": 358.366, "eval_steps_per_second": 44.904, "step": 24544 }, { "epoch": 2.0045632333767927, "grad_norm": 7.617489337921143, "learning_rate": 9.695733178224009e-06, "loss": 0.4491, "step": 24600 }, { "epoch": 2.01271186440678, "grad_norm": 8.741541862487793, "learning_rate": 9.690828681991153e-06, "loss": 0.4068, "step": 24700 }, { "epoch": 2.0208604954367666, "grad_norm": 11.999881744384766, "learning_rate": 9.685886233876696e-06, "loss": 0.4138, "step": 24800 }, { "epoch": 2.029009126466754, "grad_norm": 9.766683578491211, "learning_rate": 9.680905873868287e-06, "loss": 0.3986, "step": 24900 }, { "epoch": 2.0371577574967406, "grad_norm": 6.533343315124512, "learning_rate": 9.675887642260306e-06, "loss": 0.4024, "step": 25000 }, { "epoch": 2.0453063885267273, "grad_norm": 9.137768745422363, "learning_rate": 9.670831579653539e-06, "loss": 0.4436, "step": 25100 }, { "epoch": 2.0534550195567145, "grad_norm": 9.635496139526367, "learning_rate": 9.665737726954852e-06, "loss": 0.4019, "step": 25200 }, { "epoch": 2.0616036505867013, "grad_norm": 7.93952751159668, "learning_rate": 9.66060612537685e-06, "loss": 0.4221, "step": 25300 }, { "epoch": 2.0697522816166884, "grad_norm": 9.508652687072754, "learning_rate": 9.65543681643756e-06, "loss": 0.4221, "step": 25400 }, { "epoch": 2.077900912646675, "grad_norm": 9.136526107788086, "learning_rate": 9.650229841960084e-06, "loss": 0.4239, "step": 25500 }, { "epoch": 2.0860495436766624, "grad_norm": 11.71844482421875, "learning_rate": 9.644985244072258e-06, "loss": 0.4047, "step": 25600 }, { "epoch": 2.094198174706649, "grad_norm": 4.190426826477051, "learning_rate": 9.639703065206323e-06, "loss": 0.4209, "step": 25700 }, { "epoch": 2.1023468057366363, "grad_norm": 11.736051559448242, "learning_rate": 9.63438334809857e-06, "loss": 0.4086, "step": 25800 }, { "epoch": 2.110495436766623, "grad_norm": 7.024579048156738, "learning_rate": 9.629026135789002e-06, "loss": 0.4346, "step": 25900 }, { "epoch": 2.1186440677966103, "grad_norm": 10.942073822021484, "learning_rate": 9.62363147162098e-06, "loss": 0.4242, "step": 26000 }, { "epoch": 2.126792698826597, "grad_norm": 12.155450820922852, "learning_rate": 9.618199399240876e-06, "loss": 0.4706, "step": 26100 }, { "epoch": 2.1349413298565842, "grad_norm": 6.733283519744873, "learning_rate": 9.612729962597721e-06, "loss": 0.4406, "step": 26200 }, { "epoch": 2.143089960886571, "grad_norm": 7.309271335601807, "learning_rate": 9.607223205942845e-06, "loss": 0.4169, "step": 26300 }, { "epoch": 2.151238591916558, "grad_norm": 7.154285907745361, "learning_rate": 9.601679173829522e-06, "loss": 0.4406, "step": 26400 }, { "epoch": 2.159387222946545, "grad_norm": 8.043559074401855, "learning_rate": 9.596097911112609e-06, "loss": 0.4264, "step": 26500 }, { "epoch": 2.167535853976532, "grad_norm": 9.203978538513184, "learning_rate": 9.590479462948185e-06, "loss": 0.4173, "step": 26600 }, { "epoch": 2.175684485006519, "grad_norm": 7.716718673706055, "learning_rate": 9.58482387479318e-06, "loss": 0.412, "step": 26700 }, { "epoch": 2.1838331160365057, "grad_norm": 10.910019874572754, "learning_rate": 9.57913119240501e-06, "loss": 0.3844, "step": 26800 }, { "epoch": 2.191981747066493, "grad_norm": 7.980166435241699, "learning_rate": 9.573401461841218e-06, "loss": 0.4441, "step": 26900 }, { "epoch": 2.2001303780964796, "grad_norm": 7.328435897827148, "learning_rate": 9.567634729459076e-06, "loss": 0.4118, "step": 27000 }, { "epoch": 2.208279009126467, "grad_norm": 7.026157379150391, "learning_rate": 9.561831041915238e-06, "loss": 0.4258, "step": 27100 }, { "epoch": 2.2164276401564535, "grad_norm": 10.100348472595215, "learning_rate": 9.555990446165339e-06, "loss": 0.4368, "step": 27200 }, { "epoch": 2.2245762711864407, "grad_norm": 11.21714973449707, "learning_rate": 9.550112989463633e-06, "loss": 0.4253, "step": 27300 }, { "epoch": 2.2327249022164275, "grad_norm": 7.6962127685546875, "learning_rate": 9.5441987193626e-06, "loss": 0.4273, "step": 27400 }, { "epoch": 2.2408735332464147, "grad_norm": 13.219654083251953, "learning_rate": 9.538247683712567e-06, "loss": 0.4369, "step": 27500 }, { "epoch": 2.2490221642764014, "grad_norm": 5.536248683929443, "learning_rate": 9.532259930661315e-06, "loss": 0.4493, "step": 27600 }, { "epoch": 2.2571707953063886, "grad_norm": 7.112065315246582, "learning_rate": 9.526235508653694e-06, "loss": 0.4325, "step": 27700 }, { "epoch": 2.2653194263363754, "grad_norm": 6.064886093139648, "learning_rate": 9.520174466431235e-06, "loss": 0.4353, "step": 27800 }, { "epoch": 2.2734680573663626, "grad_norm": 7.9532318115234375, "learning_rate": 9.51407685303174e-06, "loss": 0.4358, "step": 27900 }, { "epoch": 2.2816166883963493, "grad_norm": 8.64626693725586, "learning_rate": 9.507942717788907e-06, "loss": 0.4489, "step": 28000 }, { "epoch": 2.2897653194263365, "grad_norm": 9.648942947387695, "learning_rate": 9.50177211033191e-06, "loss": 0.4498, "step": 28100 }, { "epoch": 2.2979139504563233, "grad_norm": 7.498199939727783, "learning_rate": 9.495565080585017e-06, "loss": 0.4086, "step": 28200 }, { "epoch": 2.3060625814863105, "grad_norm": 8.632119178771973, "learning_rate": 9.489321678767167e-06, "loss": 0.4207, "step": 28300 }, { "epoch": 2.3142112125162972, "grad_norm": 8.807448387145996, "learning_rate": 9.48304195539158e-06, "loss": 0.428, "step": 28400 }, { "epoch": 2.322359843546284, "grad_norm": 7.809271812438965, "learning_rate": 9.476725961265332e-06, "loss": 0.4546, "step": 28500 }, { "epoch": 2.330508474576271, "grad_norm": 8.758193969726562, "learning_rate": 9.470373747488966e-06, "loss": 0.432, "step": 28600 }, { "epoch": 2.3386571056062584, "grad_norm": 8.046852111816406, "learning_rate": 9.463985365456057e-06, "loss": 0.4169, "step": 28700 }, { "epoch": 2.346805736636245, "grad_norm": 12.665115356445312, "learning_rate": 9.457560866852805e-06, "loss": 0.4242, "step": 28800 }, { "epoch": 2.354954367666232, "grad_norm": 10.333826065063477, "learning_rate": 9.45110030365762e-06, "loss": 0.4603, "step": 28900 }, { "epoch": 2.363102998696219, "grad_norm": 8.857953071594238, "learning_rate": 9.444603728140698e-06, "loss": 0.454, "step": 29000 }, { "epoch": 2.371251629726206, "grad_norm": 9.135393142700195, "learning_rate": 9.438071192863596e-06, "loss": 0.4574, "step": 29100 }, { "epoch": 2.379400260756193, "grad_norm": 6.3214921951293945, "learning_rate": 9.43150275067881e-06, "loss": 0.4299, "step": 29200 }, { "epoch": 2.38754889178618, "grad_norm": 7.322382926940918, "learning_rate": 9.42489845472935e-06, "loss": 0.4265, "step": 29300 }, { "epoch": 2.395697522816167, "grad_norm": 11.1491060256958, "learning_rate": 9.418258358448298e-06, "loss": 0.4233, "step": 29400 }, { "epoch": 2.4038461538461537, "grad_norm": 7.171163082122803, "learning_rate": 9.411582515558391e-06, "loss": 0.4271, "step": 29500 }, { "epoch": 2.411994784876141, "grad_norm": 5.758033275604248, "learning_rate": 9.404870980071579e-06, "loss": 0.4463, "step": 29600 }, { "epoch": 2.4201434159061277, "grad_norm": 11.001411437988281, "learning_rate": 9.398123806288588e-06, "loss": 0.42, "step": 29700 }, { "epoch": 2.428292046936115, "grad_norm": 6.28535795211792, "learning_rate": 9.39134104879848e-06, "loss": 0.4188, "step": 29800 }, { "epoch": 2.4364406779661016, "grad_norm": 6.2432861328125, "learning_rate": 9.38452276247821e-06, "loss": 0.4242, "step": 29900 }, { "epoch": 2.444589308996089, "grad_norm": 9.474976539611816, "learning_rate": 9.377669002492193e-06, "loss": 0.43, "step": 30000 }, { "epoch": 2.4527379400260756, "grad_norm": 7.984436988830566, "learning_rate": 9.37077982429184e-06, "loss": 0.4328, "step": 30100 }, { "epoch": 2.4608865710560623, "grad_norm": 8.237207412719727, "learning_rate": 9.363855283615124e-06, "loss": 0.4166, "step": 30200 }, { "epoch": 2.4690352020860495, "grad_norm": 7.6592936515808105, "learning_rate": 9.356895436486122e-06, "loss": 0.4253, "step": 30300 }, { "epoch": 2.4771838331160367, "grad_norm": 5.206706523895264, "learning_rate": 9.349900339214564e-06, "loss": 0.4414, "step": 30400 }, { "epoch": 2.4853324641460235, "grad_norm": 10.161866188049316, "learning_rate": 9.342870048395376e-06, "loss": 0.415, "step": 30500 }, { "epoch": 2.4934810951760102, "grad_norm": 4.225031852722168, "learning_rate": 9.335804620908222e-06, "loss": 0.4243, "step": 30600 }, { "epoch": 2.5016297262059974, "grad_norm": 7.489659786224365, "learning_rate": 9.328704113917046e-06, "loss": 0.4417, "step": 30700 }, { "epoch": 2.509778357235984, "grad_norm": 8.180109977722168, "learning_rate": 9.32156858486961e-06, "loss": 0.4217, "step": 30800 }, { "epoch": 2.5179269882659714, "grad_norm": 9.16032886505127, "learning_rate": 9.314398091497024e-06, "loss": 0.4297, "step": 30900 }, { "epoch": 2.526075619295958, "grad_norm": 8.16234302520752, "learning_rate": 9.307192691813285e-06, "loss": 0.4319, "step": 31000 }, { "epoch": 2.5342242503259453, "grad_norm": 10.111699104309082, "learning_rate": 9.299952444114802e-06, "loss": 0.4186, "step": 31100 }, { "epoch": 2.542372881355932, "grad_norm": 6.305666923522949, "learning_rate": 9.29267740697993e-06, "loss": 0.4382, "step": 31200 }, { "epoch": 2.5505215123859193, "grad_norm": 9.985565185546875, "learning_rate": 9.285367639268492e-06, "loss": 0.4272, "step": 31300 }, { "epoch": 2.558670143415906, "grad_norm": 10.670126914978027, "learning_rate": 9.278023200121305e-06, "loss": 0.4228, "step": 31400 }, { "epoch": 2.5668187744458932, "grad_norm": 7.42661714553833, "learning_rate": 9.2706441489597e-06, "loss": 0.4314, "step": 31500 }, { "epoch": 2.57496740547588, "grad_norm": 6.457535266876221, "learning_rate": 9.263230545485044e-06, "loss": 0.4401, "step": 31600 }, { "epoch": 2.583116036505867, "grad_norm": 11.822875022888184, "learning_rate": 9.25578244967825e-06, "loss": 0.3865, "step": 31700 }, { "epoch": 2.591264667535854, "grad_norm": 12.4473295211792, "learning_rate": 9.2482999217993e-06, "loss": 0.4272, "step": 31800 }, { "epoch": 2.5994132985658407, "grad_norm": 5.283376693725586, "learning_rate": 9.240783022386757e-06, "loss": 0.4084, "step": 31900 }, { "epoch": 2.607561929595828, "grad_norm": 8.190621376037598, "learning_rate": 9.233231812257266e-06, "loss": 0.4257, "step": 32000 }, { "epoch": 2.615710560625815, "grad_norm": 6.570192813873291, "learning_rate": 9.225646352505071e-06, "loss": 0.4464, "step": 32100 }, { "epoch": 2.623859191655802, "grad_norm": 10.470175743103027, "learning_rate": 9.218026704501519e-06, "loss": 0.4245, "step": 32200 }, { "epoch": 2.6320078226857886, "grad_norm": 7.662964820861816, "learning_rate": 9.210372929894561e-06, "loss": 0.4265, "step": 32300 }, { "epoch": 2.640156453715776, "grad_norm": 7.74278450012207, "learning_rate": 9.202685090608256e-06, "loss": 0.4293, "step": 32400 }, { "epoch": 2.648305084745763, "grad_norm": 6.661880970001221, "learning_rate": 9.194963248842266e-06, "loss": 0.4592, "step": 32500 }, { "epoch": 2.6564537157757497, "grad_norm": 8.020112991333008, "learning_rate": 9.18720746707136e-06, "loss": 0.4229, "step": 32600 }, { "epoch": 2.6646023468057365, "grad_norm": 5.921052932739258, "learning_rate": 9.179417808044897e-06, "loss": 0.4141, "step": 32700 }, { "epoch": 2.6727509778357237, "grad_norm": 10.444842338562012, "learning_rate": 9.17159433478633e-06, "loss": 0.4437, "step": 32800 }, { "epoch": 2.6808996088657104, "grad_norm": 7.524814605712891, "learning_rate": 9.163737110592697e-06, "loss": 0.4128, "step": 32900 }, { "epoch": 2.6890482398956976, "grad_norm": 10.936373710632324, "learning_rate": 9.155846199034086e-06, "loss": 0.4273, "step": 33000 }, { "epoch": 2.6971968709256844, "grad_norm": 7.02941370010376, "learning_rate": 9.147921663953157e-06, "loss": 0.4433, "step": 33100 }, { "epoch": 2.7053455019556716, "grad_norm": 10.595579147338867, "learning_rate": 9.139963569464593e-06, "loss": 0.4264, "step": 33200 }, { "epoch": 2.7134941329856583, "grad_norm": 5.312283992767334, "learning_rate": 9.131971979954603e-06, "loss": 0.4149, "step": 33300 }, { "epoch": 2.7216427640156455, "grad_norm": 7.464469909667969, "learning_rate": 9.123946960080387e-06, "loss": 0.4368, "step": 33400 }, { "epoch": 2.7297913950456323, "grad_norm": 7.507636547088623, "learning_rate": 9.115888574769623e-06, "loss": 0.4344, "step": 33500 }, { "epoch": 2.737940026075619, "grad_norm": 7.984206676483154, "learning_rate": 9.107796889219933e-06, "loss": 0.4165, "step": 33600 }, { "epoch": 2.7460886571056062, "grad_norm": 9.600481986999512, "learning_rate": 9.099671968898362e-06, "loss": 0.4212, "step": 33700 }, { "epoch": 2.7542372881355934, "grad_norm": 6.417558670043945, "learning_rate": 9.091513879540845e-06, "loss": 0.41, "step": 33800 }, { "epoch": 2.76238591916558, "grad_norm": 7.52598762512207, "learning_rate": 9.08332268715168e-06, "loss": 0.4443, "step": 33900 }, { "epoch": 2.770534550195567, "grad_norm": 8.766283988952637, "learning_rate": 9.075098458002988e-06, "loss": 0.4552, "step": 34000 }, { "epoch": 2.778683181225554, "grad_norm": 7.127804756164551, "learning_rate": 9.066841258634177e-06, "loss": 0.426, "step": 34100 }, { "epoch": 2.7868318122555413, "grad_norm": 8.190874099731445, "learning_rate": 9.058551155851405e-06, "loss": 0.4374, "step": 34200 }, { "epoch": 2.794980443285528, "grad_norm": 7.887624740600586, "learning_rate": 9.050228216727046e-06, "loss": 0.437, "step": 34300 }, { "epoch": 2.803129074315515, "grad_norm": 10.439249038696289, "learning_rate": 9.041872508599136e-06, "loss": 0.4165, "step": 34400 }, { "epoch": 2.811277705345502, "grad_norm": 9.891864776611328, "learning_rate": 9.033484099070839e-06, "loss": 0.4336, "step": 34500 }, { "epoch": 2.819426336375489, "grad_norm": 10.03987979888916, "learning_rate": 9.025063056009886e-06, "loss": 0.4365, "step": 34600 }, { "epoch": 2.827574967405476, "grad_norm": 6.188653469085693, "learning_rate": 9.016609447548046e-06, "loss": 0.41, "step": 34700 }, { "epoch": 2.8357235984354627, "grad_norm": 11.486917495727539, "learning_rate": 9.008123342080553e-06, "loss": 0.4343, "step": 34800 }, { "epoch": 2.84387222946545, "grad_norm": 9.972556114196777, "learning_rate": 8.99960480826557e-06, "loss": 0.4282, "step": 34900 }, { "epoch": 2.8520208604954367, "grad_norm": 7.771157741546631, "learning_rate": 8.991053915023625e-06, "loss": 0.4086, "step": 35000 }, { "epoch": 2.860169491525424, "grad_norm": 5.989213943481445, "learning_rate": 8.982470731537054e-06, "loss": 0.4647, "step": 35100 }, { "epoch": 2.8683181225554106, "grad_norm": 7.19948148727417, "learning_rate": 8.973855327249442e-06, "loss": 0.4086, "step": 35200 }, { "epoch": 2.8764667535853974, "grad_norm": 7.22706937789917, "learning_rate": 8.965207771865061e-06, "loss": 0.4225, "step": 35300 }, { "epoch": 2.8846153846153846, "grad_norm": 11.344962120056152, "learning_rate": 8.95652813534831e-06, "loss": 0.4275, "step": 35400 }, { "epoch": 2.8927640156453718, "grad_norm": 10.637499809265137, "learning_rate": 8.947816487923143e-06, "loss": 0.4347, "step": 35500 }, { "epoch": 2.9009126466753585, "grad_norm": 7.946286678314209, "learning_rate": 8.939072900072501e-06, "loss": 0.4218, "step": 35600 }, { "epoch": 2.9090612777053453, "grad_norm": 6.058999061584473, "learning_rate": 8.930297442537747e-06, "loss": 0.4212, "step": 35700 }, { "epoch": 2.9172099087353325, "grad_norm": 10.35421371459961, "learning_rate": 8.921490186318092e-06, "loss": 0.4028, "step": 35800 }, { "epoch": 2.9253585397653197, "grad_norm": 8.85345458984375, "learning_rate": 8.912651202670013e-06, "loss": 0.4455, "step": 35900 }, { "epoch": 2.9335071707953064, "grad_norm": 7.476600646972656, "learning_rate": 8.90378056310669e-06, "loss": 0.4212, "step": 36000 }, { "epoch": 2.941655801825293, "grad_norm": 8.27695369720459, "learning_rate": 8.894878339397416e-06, "loss": 0.4186, "step": 36100 }, { "epoch": 2.9498044328552804, "grad_norm": 8.344620704650879, "learning_rate": 8.885944603567023e-06, "loss": 0.4242, "step": 36200 }, { "epoch": 2.957953063885267, "grad_norm": 8.976387023925781, "learning_rate": 8.876979427895291e-06, "loss": 0.4359, "step": 36300 }, { "epoch": 2.9661016949152543, "grad_norm": 10.581543922424316, "learning_rate": 8.867982884916377e-06, "loss": 0.4171, "step": 36400 }, { "epoch": 2.974250325945241, "grad_norm": 6.423446178436279, "learning_rate": 8.858955047418217e-06, "loss": 0.4248, "step": 36500 }, { "epoch": 2.9823989569752283, "grad_norm": 6.647116184234619, "learning_rate": 8.849895988441933e-06, "loss": 0.4272, "step": 36600 }, { "epoch": 2.990547588005215, "grad_norm": 11.199699401855469, "learning_rate": 8.840805781281261e-06, "loss": 0.4336, "step": 36700 }, { "epoch": 2.9986962190352022, "grad_norm": 6.946083068847656, "learning_rate": 8.831684499481941e-06, "loss": 0.4278, "step": 36800 }, { "epoch": 3.0, "eval_accuracy": 0.8088353413654619, "eval_loss": 0.5133101940155029, "eval_runtime": 6.8742, "eval_samples_per_second": 362.222, "eval_steps_per_second": 45.387, "step": 36816 }, { "epoch": 3.006844850065189, "grad_norm": 8.117693901062012, "learning_rate": 8.822532216841124e-06, "loss": 0.3563, "step": 36900 }, { "epoch": 3.014993481095176, "grad_norm": 8.939483642578125, "learning_rate": 8.813349007406785e-06, "loss": 0.3693, "step": 37000 }, { "epoch": 3.023142112125163, "grad_norm": 5.619213104248047, "learning_rate": 8.80413494547711e-06, "loss": 0.359, "step": 37100 }, { "epoch": 3.03129074315515, "grad_norm": 7.458463191986084, "learning_rate": 8.794890105599905e-06, "loss": 0.3631, "step": 37200 }, { "epoch": 3.039439374185137, "grad_norm": 8.206454277038574, "learning_rate": 8.785614562571991e-06, "loss": 0.3513, "step": 37300 }, { "epoch": 3.047588005215124, "grad_norm": 8.663100242614746, "learning_rate": 8.776308391438597e-06, "loss": 0.3348, "step": 37400 }, { "epoch": 3.055736636245111, "grad_norm": 8.638208389282227, "learning_rate": 8.766971667492754e-06, "loss": 0.3618, "step": 37500 }, { "epoch": 3.0638852672750976, "grad_norm": 8.416321754455566, "learning_rate": 8.757604466274683e-06, "loss": 0.3671, "step": 37600 }, { "epoch": 3.0720338983050848, "grad_norm": 10.002084732055664, "learning_rate": 8.748206863571188e-06, "loss": 0.3462, "step": 37700 }, { "epoch": 3.0801825293350715, "grad_norm": 8.242202758789062, "learning_rate": 8.73877893541504e-06, "loss": 0.3524, "step": 37800 }, { "epoch": 3.0883311603650587, "grad_norm": 9.762850761413574, "learning_rate": 8.729320758084363e-06, "loss": 0.3844, "step": 37900 }, { "epoch": 3.0964797913950455, "grad_norm": 13.008197784423828, "learning_rate": 8.719832408102017e-06, "loss": 0.3489, "step": 38000 }, { "epoch": 3.1046284224250327, "grad_norm": 9.61468505859375, "learning_rate": 8.71031396223498e-06, "loss": 0.3386, "step": 38100 }, { "epoch": 3.1127770534550194, "grad_norm": 9.158555030822754, "learning_rate": 8.700765497493723e-06, "loss": 0.3542, "step": 38200 }, { "epoch": 3.1209256844850066, "grad_norm": 11.94726276397705, "learning_rate": 8.69118709113159e-06, "loss": 0.3591, "step": 38300 }, { "epoch": 3.1290743155149934, "grad_norm": 9.813300132751465, "learning_rate": 8.681578820644173e-06, "loss": 0.3625, "step": 38400 }, { "epoch": 3.1372229465449806, "grad_norm": 8.50658130645752, "learning_rate": 8.671940763768682e-06, "loss": 0.3789, "step": 38500 }, { "epoch": 3.1453715775749673, "grad_norm": 6.037990570068359, "learning_rate": 8.662272998483323e-06, "loss": 0.3635, "step": 38600 }, { "epoch": 3.1535202086049545, "grad_norm": 11.817001342773438, "learning_rate": 8.65257560300666e-06, "loss": 0.3526, "step": 38700 }, { "epoch": 3.1616688396349413, "grad_norm": 4.690389156341553, "learning_rate": 8.642848655796985e-06, "loss": 0.3634, "step": 38800 }, { "epoch": 3.1698174706649285, "grad_norm": 12.257222175598145, "learning_rate": 8.633092235551679e-06, "loss": 0.3626, "step": 38900 }, { "epoch": 3.1779661016949152, "grad_norm": 7.710871696472168, "learning_rate": 8.623306421206588e-06, "loss": 0.3571, "step": 39000 }, { "epoch": 3.1861147327249024, "grad_norm": 6.811945915222168, "learning_rate": 8.613491291935365e-06, "loss": 0.351, "step": 39100 }, { "epoch": 3.194263363754889, "grad_norm": 19.7229061126709, "learning_rate": 8.60364692714885e-06, "loss": 0.3348, "step": 39200 }, { "epoch": 3.2024119947848764, "grad_norm": 9.32421875, "learning_rate": 8.59377340649441e-06, "loss": 0.3437, "step": 39300 }, { "epoch": 3.210560625814863, "grad_norm": 9.309675216674805, "learning_rate": 8.583870809855306e-06, "loss": 0.3687, "step": 39400 }, { "epoch": 3.21870925684485, "grad_norm": 5.458558559417725, "learning_rate": 8.573939217350043e-06, "loss": 0.3584, "step": 39500 }, { "epoch": 3.226857887874837, "grad_norm": 8.717120170593262, "learning_rate": 8.563978709331717e-06, "loss": 0.3473, "step": 39600 }, { "epoch": 3.235006518904824, "grad_norm": 6.542947769165039, "learning_rate": 8.553989366387376e-06, "loss": 0.3806, "step": 39700 }, { "epoch": 3.243155149934811, "grad_norm": 11.504007339477539, "learning_rate": 8.543971269337355e-06, "loss": 0.3606, "step": 39800 }, { "epoch": 3.2513037809647978, "grad_norm": 9.393417358398438, "learning_rate": 8.533924499234633e-06, "loss": 0.3532, "step": 39900 }, { "epoch": 3.259452411994785, "grad_norm": 8.129273414611816, "learning_rate": 8.523849137364175e-06, "loss": 0.3473, "step": 40000 }, { "epoch": 3.2676010430247717, "grad_norm": 12.241875648498535, "learning_rate": 8.513745265242263e-06, "loss": 0.3576, "step": 40100 }, { "epoch": 3.275749674054759, "grad_norm": 9.895030975341797, "learning_rate": 8.503612964615858e-06, "loss": 0.3458, "step": 40200 }, { "epoch": 3.2838983050847457, "grad_norm": 5.42219877243042, "learning_rate": 8.493452317461914e-06, "loss": 0.3772, "step": 40300 }, { "epoch": 3.292046936114733, "grad_norm": 8.165868759155273, "learning_rate": 8.483263405986735e-06, "loss": 0.3561, "step": 40400 }, { "epoch": 3.3001955671447196, "grad_norm": 13.24457836151123, "learning_rate": 8.4730463126253e-06, "loss": 0.3587, "step": 40500 }, { "epoch": 3.308344198174707, "grad_norm": 12.287585258483887, "learning_rate": 8.462801120040595e-06, "loss": 0.3432, "step": 40600 }, { "epoch": 3.3164928292046936, "grad_norm": 8.932402610778809, "learning_rate": 8.452527911122953e-06, "loss": 0.3696, "step": 40700 }, { "epoch": 3.3246414602346808, "grad_norm": 8.847443580627441, "learning_rate": 8.442226768989373e-06, "loss": 0.362, "step": 40800 }, { "epoch": 3.3327900912646675, "grad_norm": 13.20019245147705, "learning_rate": 8.431897776982851e-06, "loss": 0.3543, "step": 40900 }, { "epoch": 3.3409387222946547, "grad_norm": 8.375232696533203, "learning_rate": 8.421541018671712e-06, "loss": 0.3741, "step": 41000 }, { "epoch": 3.3490873533246415, "grad_norm": 7.601521968841553, "learning_rate": 8.411156577848927e-06, "loss": 0.3518, "step": 41100 }, { "epoch": 3.3572359843546282, "grad_norm": 5.853700637817383, "learning_rate": 8.400744538531431e-06, "loss": 0.3556, "step": 41200 }, { "epoch": 3.3653846153846154, "grad_norm": 15.7562837600708, "learning_rate": 8.390304984959455e-06, "loss": 0.3591, "step": 41300 }, { "epoch": 3.373533246414602, "grad_norm": 7.048288822174072, "learning_rate": 8.379838001595837e-06, "loss": 0.3774, "step": 41400 }, { "epoch": 3.3816818774445894, "grad_norm": 8.532382011413574, "learning_rate": 8.369343673125339e-06, "loss": 0.3482, "step": 41500 }, { "epoch": 3.389830508474576, "grad_norm": 5.468735218048096, "learning_rate": 8.358822084453964e-06, "loss": 0.3637, "step": 41600 }, { "epoch": 3.3979791395045633, "grad_norm": 7.324248313903809, "learning_rate": 8.348273320708269e-06, "loss": 0.365, "step": 41700 }, { "epoch": 3.40612777053455, "grad_norm": 8.06946849822998, "learning_rate": 8.33769746723467e-06, "loss": 0.3661, "step": 41800 }, { "epoch": 3.4142764015645373, "grad_norm": 11.85434341430664, "learning_rate": 8.32709460959876e-06, "loss": 0.3542, "step": 41900 }, { "epoch": 3.422425032594524, "grad_norm": 8.629081726074219, "learning_rate": 8.316464833584618e-06, "loss": 0.3476, "step": 42000 }, { "epoch": 3.430573663624511, "grad_norm": 7.888760566711426, "learning_rate": 8.305808225194103e-06, "loss": 0.3752, "step": 42100 }, { "epoch": 3.438722294654498, "grad_norm": 8.756083488464355, "learning_rate": 8.295124870646168e-06, "loss": 0.359, "step": 42200 }, { "epoch": 3.446870925684485, "grad_norm": 8.682005882263184, "learning_rate": 8.284414856376161e-06, "loss": 0.3607, "step": 42300 }, { "epoch": 3.455019556714472, "grad_norm": 14.85304069519043, "learning_rate": 8.273678269035126e-06, "loss": 0.3417, "step": 42400 }, { "epoch": 3.463168187744459, "grad_norm": 10.479057312011719, "learning_rate": 8.262915195489097e-06, "loss": 0.3571, "step": 42500 }, { "epoch": 3.471316818774446, "grad_norm": 9.107665061950684, "learning_rate": 8.2521257228184e-06, "loss": 0.3655, "step": 42600 }, { "epoch": 3.479465449804433, "grad_norm": 10.01933765411377, "learning_rate": 8.241309938316947e-06, "loss": 0.363, "step": 42700 }, { "epoch": 3.48761408083442, "grad_norm": 7.9999189376831055, "learning_rate": 8.230467929491533e-06, "loss": 0.3753, "step": 42800 }, { "epoch": 3.4957627118644066, "grad_norm": 9.211396217346191, "learning_rate": 8.219599784061124e-06, "loss": 0.3389, "step": 42900 }, { "epoch": 3.5039113428943938, "grad_norm": 9.140076637268066, "learning_rate": 8.20870558995614e-06, "loss": 0.3683, "step": 43000 }, { "epoch": 3.512059973924381, "grad_norm": 9.534440040588379, "learning_rate": 8.197785435317766e-06, "loss": 0.3585, "step": 43100 }, { "epoch": 3.5202086049543677, "grad_norm": 10.818157196044922, "learning_rate": 8.186839408497213e-06, "loss": 0.3546, "step": 43200 }, { "epoch": 3.5283572359843545, "grad_norm": 11.54218578338623, "learning_rate": 8.175867598055021e-06, "loss": 0.3818, "step": 43300 }, { "epoch": 3.5365058670143417, "grad_norm": 10.037505149841309, "learning_rate": 8.164870092760336e-06, "loss": 0.347, "step": 43400 }, { "epoch": 3.5446544980443284, "grad_norm": 11.143013000488281, "learning_rate": 8.153846981590191e-06, "loss": 0.3633, "step": 43500 }, { "epoch": 3.5528031290743156, "grad_norm": 9.558606147766113, "learning_rate": 8.142798353728786e-06, "loss": 0.373, "step": 43600 }, { "epoch": 3.5609517601043024, "grad_norm": 13.201570510864258, "learning_rate": 8.131724298566767e-06, "loss": 0.3611, "step": 43700 }, { "epoch": 3.5691003911342896, "grad_norm": 10.490971565246582, "learning_rate": 8.120624905700511e-06, "loss": 0.3292, "step": 43800 }, { "epoch": 3.5772490221642763, "grad_norm": 3.778831958770752, "learning_rate": 8.109500264931387e-06, "loss": 0.3731, "step": 43900 }, { "epoch": 3.5853976531942635, "grad_norm": 10.723892211914062, "learning_rate": 8.098350466265034e-06, "loss": 0.3783, "step": 44000 }, { "epoch": 3.5935462842242503, "grad_norm": 9.849285125732422, "learning_rate": 8.087175599910642e-06, "loss": 0.337, "step": 44100 }, { "epoch": 3.601694915254237, "grad_norm": 11.700067520141602, "learning_rate": 8.07597575628021e-06, "loss": 0.3639, "step": 44200 }, { "epoch": 3.609843546284224, "grad_norm": 37.506065368652344, "learning_rate": 8.064751025987822e-06, "loss": 0.3644, "step": 44300 }, { "epoch": 3.6179921773142114, "grad_norm": 9.770977973937988, "learning_rate": 8.053501499848907e-06, "loss": 0.3838, "step": 44400 }, { "epoch": 3.626140808344198, "grad_norm": 14.631871223449707, "learning_rate": 8.042227268879516e-06, "loss": 0.3732, "step": 44500 }, { "epoch": 3.634289439374185, "grad_norm": 7.656193256378174, "learning_rate": 8.030928424295572e-06, "loss": 0.358, "step": 44600 }, { "epoch": 3.642438070404172, "grad_norm": 9.974722862243652, "learning_rate": 8.019605057512144e-06, "loss": 0.3588, "step": 44700 }, { "epoch": 3.6505867014341593, "grad_norm": 12.311222076416016, "learning_rate": 8.008257260142693e-06, "loss": 0.362, "step": 44800 }, { "epoch": 3.658735332464146, "grad_norm": 11.374334335327148, "learning_rate": 7.99688512399835e-06, "loss": 0.385, "step": 44900 }, { "epoch": 3.666883963494133, "grad_norm": 7.951153755187988, "learning_rate": 7.985488741087153e-06, "loss": 0.352, "step": 45000 }, { "epoch": 3.67503259452412, "grad_norm": 5.6287384033203125, "learning_rate": 7.97406820361332e-06, "loss": 0.3763, "step": 45100 }, { "epoch": 3.6831812255541068, "grad_norm": 9.33438777923584, "learning_rate": 7.962623603976491e-06, "loss": 0.3852, "step": 45200 }, { "epoch": 3.691329856584094, "grad_norm": 12.365875244140625, "learning_rate": 7.951155034770983e-06, "loss": 0.3775, "step": 45300 }, { "epoch": 3.6994784876140807, "grad_norm": 9.91942024230957, "learning_rate": 7.93966258878505e-06, "loss": 0.3678, "step": 45400 }, { "epoch": 3.707627118644068, "grad_norm": 9.160215377807617, "learning_rate": 7.928146359000117e-06, "loss": 0.36, "step": 45500 }, { "epoch": 3.7157757496740547, "grad_norm": 11.565260887145996, "learning_rate": 7.91660643859004e-06, "loss": 0.3531, "step": 45600 }, { "epoch": 3.723924380704042, "grad_norm": 4.027003765106201, "learning_rate": 7.905042920920344e-06, "loss": 0.3722, "step": 45700 }, { "epoch": 3.7320730117340286, "grad_norm": 13.809627532958984, "learning_rate": 7.893455899547476e-06, "loss": 0.3524, "step": 45800 }, { "epoch": 3.740221642764016, "grad_norm": 13.452054023742676, "learning_rate": 7.881845468218039e-06, "loss": 0.375, "step": 45900 }, { "epoch": 3.7483702737940026, "grad_norm": 9.63260269165039, "learning_rate": 7.87021172086804e-06, "loss": 0.3636, "step": 46000 }, { "epoch": 3.7565189048239898, "grad_norm": 8.539379119873047, "learning_rate": 7.85855475162213e-06, "loss": 0.3687, "step": 46100 }, { "epoch": 3.7646675358539765, "grad_norm": 7.635307788848877, "learning_rate": 7.846874654792835e-06, "loss": 0.3709, "step": 46200 }, { "epoch": 3.7728161668839633, "grad_norm": 8.707938194274902, "learning_rate": 7.835171524879805e-06, "loss": 0.3466, "step": 46300 }, { "epoch": 3.7809647979139505, "grad_norm": 6.248547077178955, "learning_rate": 7.823445456569036e-06, "loss": 0.3706, "step": 46400 }, { "epoch": 3.7891134289439377, "grad_norm": 11.434155464172363, "learning_rate": 7.811696544732115e-06, "loss": 0.3907, "step": 46500 }, { "epoch": 3.7972620599739244, "grad_norm": 5.250894546508789, "learning_rate": 7.799924884425447e-06, "loss": 0.377, "step": 46600 }, { "epoch": 3.805410691003911, "grad_norm": 6.875328063964844, "learning_rate": 7.788130570889488e-06, "loss": 0.3569, "step": 46700 }, { "epoch": 3.8135593220338984, "grad_norm": 8.773159980773926, "learning_rate": 7.776313699547971e-06, "loss": 0.3635, "step": 46800 }, { "epoch": 3.821707953063885, "grad_norm": 4.8134002685546875, "learning_rate": 7.764474366007138e-06, "loss": 0.345, "step": 46900 }, { "epoch": 3.8298565840938723, "grad_norm": 6.085391998291016, "learning_rate": 7.752612666054963e-06, "loss": 0.3699, "step": 47000 }, { "epoch": 3.838005215123859, "grad_norm": 8.958887100219727, "learning_rate": 7.740728695660389e-06, "loss": 0.3407, "step": 47100 }, { "epoch": 3.8461538461538463, "grad_norm": 6.2288994789123535, "learning_rate": 7.728822550972523e-06, "loss": 0.3633, "step": 47200 }, { "epoch": 3.854302477183833, "grad_norm": 9.540541648864746, "learning_rate": 7.716894328319893e-06, "loss": 0.3476, "step": 47300 }, { "epoch": 3.86245110821382, "grad_norm": 5.929731369018555, "learning_rate": 7.704944124209645e-06, "loss": 0.3929, "step": 47400 }, { "epoch": 3.870599739243807, "grad_norm": 7.797017574310303, "learning_rate": 7.692972035326772e-06, "loss": 0.3728, "step": 47500 }, { "epoch": 3.878748370273794, "grad_norm": 14.781734466552734, "learning_rate": 7.680978158533324e-06, "loss": 0.3546, "step": 47600 }, { "epoch": 3.886897001303781, "grad_norm": 10.41878890991211, "learning_rate": 7.668962590867636e-06, "loss": 0.3603, "step": 47700 }, { "epoch": 3.895045632333768, "grad_norm": 8.300308227539062, "learning_rate": 7.656925429543531e-06, "loss": 0.3546, "step": 47800 }, { "epoch": 3.903194263363755, "grad_norm": 9.709467887878418, "learning_rate": 7.644866771949544e-06, "loss": 0.3575, "step": 47900 }, { "epoch": 3.9113428943937416, "grad_norm": 7.606164455413818, "learning_rate": 7.632786715648128e-06, "loss": 0.3658, "step": 48000 }, { "epoch": 3.919491525423729, "grad_norm": 11.461851119995117, "learning_rate": 7.62068535837486e-06, "loss": 0.3653, "step": 48100 }, { "epoch": 3.927640156453716, "grad_norm": 11.35883617401123, "learning_rate": 7.608562798037662e-06, "loss": 0.3672, "step": 48200 }, { "epoch": 3.9357887874837028, "grad_norm": 9.994701385498047, "learning_rate": 7.596419132715997e-06, "loss": 0.3601, "step": 48300 }, { "epoch": 3.9439374185136895, "grad_norm": 12.242551803588867, "learning_rate": 7.584254460660092e-06, "loss": 0.3552, "step": 48400 }, { "epoch": 3.9520860495436767, "grad_norm": 11.628976821899414, "learning_rate": 7.572068880290118e-06, "loss": 0.3644, "step": 48500 }, { "epoch": 3.960234680573664, "grad_norm": 9.713350296020508, "learning_rate": 7.559862490195418e-06, "loss": 0.3463, "step": 48600 }, { "epoch": 3.9683833116036507, "grad_norm": 5.648345470428467, "learning_rate": 7.547635389133694e-06, "loss": 0.3483, "step": 48700 }, { "epoch": 3.9765319426336374, "grad_norm": 15.131999015808105, "learning_rate": 7.535387676030222e-06, "loss": 0.366, "step": 48800 }, { "epoch": 3.9846805736636246, "grad_norm": 8.72270393371582, "learning_rate": 7.523119449977028e-06, "loss": 0.3567, "step": 48900 }, { "epoch": 3.9928292046936114, "grad_norm": 10.733074188232422, "learning_rate": 7.510830810232112e-06, "loss": 0.37, "step": 49000 }, { "epoch": 4.0, "eval_accuracy": 0.8188755020080322, "eval_loss": 0.529120922088623, "eval_runtime": 6.8942, "eval_samples_per_second": 361.175, "eval_steps_per_second": 45.256, "step": 49088 }, { "epoch": 4.0009778357235986, "grad_norm": 7.13838529586792, "learning_rate": 7.498521856218637e-06, "loss": 0.355, "step": 49100 }, { "epoch": 4.009126466753585, "grad_norm": 5.439541816711426, "learning_rate": 7.486192687524112e-06, "loss": 0.3005, "step": 49200 }, { "epoch": 4.017275097783572, "grad_norm": 17.687950134277344, "learning_rate": 7.4738434038996e-06, "loss": 0.2864, "step": 49300 }, { "epoch": 4.02542372881356, "grad_norm": 11.162871360778809, "learning_rate": 7.461474105258911e-06, "loss": 0.3025, "step": 49400 }, { "epoch": 4.0335723598435465, "grad_norm": 9.104811668395996, "learning_rate": 7.449084891677785e-06, "loss": 0.2846, "step": 49500 }, { "epoch": 4.041720990873533, "grad_norm": 11.716981887817383, "learning_rate": 7.436675863393086e-06, "loss": 0.2984, "step": 49600 }, { "epoch": 4.04986962190352, "grad_norm": 6.521731376647949, "learning_rate": 7.424247120801997e-06, "loss": 0.2979, "step": 49700 }, { "epoch": 4.058018252933508, "grad_norm": 6.5696539878845215, "learning_rate": 7.4117987644611985e-06, "loss": 0.2898, "step": 49800 }, { "epoch": 4.066166883963494, "grad_norm": 9.98416805267334, "learning_rate": 7.399330895086061e-06, "loss": 0.3115, "step": 49900 }, { "epoch": 4.074315514993481, "grad_norm": 6.788928985595703, "learning_rate": 7.386843613549828e-06, "loss": 0.3158, "step": 50000 }, { "epoch": 4.082464146023468, "grad_norm": 9.002969741821289, "learning_rate": 7.374337020882798e-06, "loss": 0.2964, "step": 50100 }, { "epoch": 4.090612777053455, "grad_norm": 8.216889381408691, "learning_rate": 7.3618112182715115e-06, "loss": 0.3194, "step": 50200 }, { "epoch": 4.098761408083442, "grad_norm": 17.576051712036133, "learning_rate": 7.349266307057932e-06, "loss": 0.3093, "step": 50300 }, { "epoch": 4.106910039113429, "grad_norm": 14.113720893859863, "learning_rate": 7.336702388738619e-06, "loss": 0.2656, "step": 50400 }, { "epoch": 4.115058670143416, "grad_norm": 13.906309127807617, "learning_rate": 7.324119564963915e-06, "loss": 0.2977, "step": 50500 }, { "epoch": 4.1232073011734025, "grad_norm": 9.152776718139648, "learning_rate": 7.311517937537122e-06, "loss": 0.3067, "step": 50600 }, { "epoch": 4.13135593220339, "grad_norm": 10.242730140686035, "learning_rate": 7.29889760841367e-06, "loss": 0.301, "step": 50700 }, { "epoch": 4.139504563233377, "grad_norm": 11.567678451538086, "learning_rate": 7.2862586797003046e-06, "loss": 0.2997, "step": 50800 }, { "epoch": 4.147653194263364, "grad_norm": 6.842143535614014, "learning_rate": 7.27360125365425e-06, "loss": 0.3004, "step": 50900 }, { "epoch": 4.15580182529335, "grad_norm": 12.490499496459961, "learning_rate": 7.260925432682386e-06, "loss": 0.2959, "step": 51000 }, { "epoch": 4.163950456323338, "grad_norm": 7.078547477722168, "learning_rate": 7.248231319340422e-06, "loss": 0.2966, "step": 51100 }, { "epoch": 4.172099087353325, "grad_norm": 17.07299041748047, "learning_rate": 7.235519016332064e-06, "loss": 0.3241, "step": 51200 }, { "epoch": 4.1802477183833116, "grad_norm": 14.579496383666992, "learning_rate": 7.222788626508184e-06, "loss": 0.294, "step": 51300 }, { "epoch": 4.188396349413298, "grad_norm": 16.198028564453125, "learning_rate": 7.210040252865984e-06, "loss": 0.3049, "step": 51400 }, { "epoch": 4.196544980443286, "grad_norm": 12.001542091369629, "learning_rate": 7.197273998548174e-06, "loss": 0.2932, "step": 51500 }, { "epoch": 4.204693611473273, "grad_norm": 8.593428611755371, "learning_rate": 7.184489966842128e-06, "loss": 0.3147, "step": 51600 }, { "epoch": 4.2128422425032594, "grad_norm": 19.119985580444336, "learning_rate": 7.1716882611790475e-06, "loss": 0.2929, "step": 51700 }, { "epoch": 4.220990873533246, "grad_norm": 12.756973266601562, "learning_rate": 7.1588689851331305e-06, "loss": 0.2973, "step": 51800 }, { "epoch": 4.229139504563233, "grad_norm": 11.550286293029785, "learning_rate": 7.146032242420732e-06, "loss": 0.2996, "step": 51900 }, { "epoch": 4.237288135593221, "grad_norm": 8.533171653747559, "learning_rate": 7.133178136899522e-06, "loss": 0.3094, "step": 52000 }, { "epoch": 4.245436766623207, "grad_norm": 11.978692054748535, "learning_rate": 7.120306772567647e-06, "loss": 0.3013, "step": 52100 }, { "epoch": 4.253585397653194, "grad_norm": 10.963492393493652, "learning_rate": 7.107418253562889e-06, "loss": 0.3081, "step": 52200 }, { "epoch": 4.261734028683181, "grad_norm": 11.645411491394043, "learning_rate": 7.0945126841618225e-06, "loss": 0.2867, "step": 52300 }, { "epoch": 4.2698826597131685, "grad_norm": 11.48385238647461, "learning_rate": 7.081590168778973e-06, "loss": 0.3088, "step": 52400 }, { "epoch": 4.278031290743155, "grad_norm": 10.083149909973145, "learning_rate": 7.068650811965967e-06, "loss": 0.2954, "step": 52500 }, { "epoch": 4.286179921773142, "grad_norm": 10.841811180114746, "learning_rate": 7.055694718410688e-06, "loss": 0.2944, "step": 52600 }, { "epoch": 4.294328552803129, "grad_norm": 12.332331657409668, "learning_rate": 7.042721992936438e-06, "loss": 0.2857, "step": 52700 }, { "epoch": 4.302477183833116, "grad_norm": 13.689620971679688, "learning_rate": 7.029732740501073e-06, "loss": 0.3024, "step": 52800 }, { "epoch": 4.310625814863103, "grad_norm": 13.064624786376953, "learning_rate": 7.016727066196168e-06, "loss": 0.2917, "step": 52900 }, { "epoch": 4.31877444589309, "grad_norm": 8.214381217956543, "learning_rate": 7.003705075246163e-06, "loss": 0.3173, "step": 53000 }, { "epoch": 4.326923076923077, "grad_norm": 14.797425270080566, "learning_rate": 6.990666873007506e-06, "loss": 0.2734, "step": 53100 }, { "epoch": 4.335071707953064, "grad_norm": 10.985969543457031, "learning_rate": 6.977612564967808e-06, "loss": 0.2958, "step": 53200 }, { "epoch": 4.343220338983051, "grad_norm": 12.808884620666504, "learning_rate": 6.964542256744986e-06, "loss": 0.3169, "step": 53300 }, { "epoch": 4.351368970013038, "grad_norm": 21.643781661987305, "learning_rate": 6.9514560540864095e-06, "loss": 0.3154, "step": 53400 }, { "epoch": 4.3595176010430245, "grad_norm": 7.609200477600098, "learning_rate": 6.938354062868041e-06, "loss": 0.2985, "step": 53500 }, { "epoch": 4.367666232073011, "grad_norm": 13.469466209411621, "learning_rate": 6.925236389093588e-06, "loss": 0.3063, "step": 53600 }, { "epoch": 4.375814863102999, "grad_norm": 12.873883247375488, "learning_rate": 6.912103138893636e-06, "loss": 0.2903, "step": 53700 }, { "epoch": 4.383963494132986, "grad_norm": 8.953607559204102, "learning_rate": 6.898954418524797e-06, "loss": 0.2897, "step": 53800 }, { "epoch": 4.3921121251629724, "grad_norm": 21.484949111938477, "learning_rate": 6.885790334368844e-06, "loss": 0.2989, "step": 53900 }, { "epoch": 4.400260756192959, "grad_norm": 8.624776840209961, "learning_rate": 6.872610992931857e-06, "loss": 0.2811, "step": 54000 }, { "epoch": 4.408409387222947, "grad_norm": 13.120560646057129, "learning_rate": 6.859416500843351e-06, "loss": 0.3003, "step": 54100 }, { "epoch": 4.416558018252934, "grad_norm": 8.616204261779785, "learning_rate": 6.846206964855426e-06, "loss": 0.3191, "step": 54200 }, { "epoch": 4.42470664928292, "grad_norm": 7.0158233642578125, "learning_rate": 6.832982491841894e-06, "loss": 0.31, "step": 54300 }, { "epoch": 4.432855280312907, "grad_norm": 9.716617584228516, "learning_rate": 6.819743188797419e-06, "loss": 0.2949, "step": 54400 }, { "epoch": 4.441003911342895, "grad_norm": 10.602276802062988, "learning_rate": 6.806489162836649e-06, "loss": 0.3037, "step": 54500 }, { "epoch": 4.4491525423728815, "grad_norm": 8.699592590332031, "learning_rate": 6.793220521193347e-06, "loss": 0.313, "step": 54600 }, { "epoch": 4.457301173402868, "grad_norm": 8.307058334350586, "learning_rate": 6.779937371219532e-06, "loss": 0.2924, "step": 54700 }, { "epoch": 4.465449804432855, "grad_norm": 10.045998573303223, "learning_rate": 6.766639820384602e-06, "loss": 0.3124, "step": 54800 }, { "epoch": 4.473598435462843, "grad_norm": 15.478697776794434, "learning_rate": 6.753327976274467e-06, "loss": 0.2892, "step": 54900 }, { "epoch": 4.481747066492829, "grad_norm": 12.46609878540039, "learning_rate": 6.740001946590675e-06, "loss": 0.2809, "step": 55000 }, { "epoch": 4.489895697522816, "grad_norm": 11.292198181152344, "learning_rate": 6.726661839149556e-06, "loss": 0.2915, "step": 55100 }, { "epoch": 4.498044328552803, "grad_norm": 15.23190689086914, "learning_rate": 6.71330776188133e-06, "loss": 0.306, "step": 55200 }, { "epoch": 4.5061929595827905, "grad_norm": 11.232503890991211, "learning_rate": 6.69993982282924e-06, "loss": 0.2979, "step": 55300 }, { "epoch": 4.514341590612777, "grad_norm": 11.436495780944824, "learning_rate": 6.686558130148687e-06, "loss": 0.2976, "step": 55400 }, { "epoch": 4.522490221642764, "grad_norm": 11.90659236907959, "learning_rate": 6.673162792106341e-06, "loss": 0.3106, "step": 55500 }, { "epoch": 4.530638852672751, "grad_norm": 9.979248046875, "learning_rate": 6.6597539170792795e-06, "loss": 0.2948, "step": 55600 }, { "epoch": 4.5387874837027375, "grad_norm": 19.104442596435547, "learning_rate": 6.646331613554094e-06, "loss": 0.3248, "step": 55700 }, { "epoch": 4.546936114732725, "grad_norm": 9.139418601989746, "learning_rate": 6.632895990126028e-06, "loss": 0.2996, "step": 55800 }, { "epoch": 4.555084745762712, "grad_norm": 9.373650550842285, "learning_rate": 6.619447155498091e-06, "loss": 0.3127, "step": 55900 }, { "epoch": 4.563233376792699, "grad_norm": 12.213810920715332, "learning_rate": 6.605985218480179e-06, "loss": 0.3113, "step": 56000 }, { "epoch": 4.5713820078226854, "grad_norm": 9.15962028503418, "learning_rate": 6.5925102879881915e-06, "loss": 0.311, "step": 56100 }, { "epoch": 4.579530638852673, "grad_norm": 11.712223052978516, "learning_rate": 6.579022473043159e-06, "loss": 0.3074, "step": 56200 }, { "epoch": 4.58767926988266, "grad_norm": 9.559146881103516, "learning_rate": 6.565521882770355e-06, "loss": 0.3065, "step": 56300 }, { "epoch": 4.595827900912647, "grad_norm": 8.07590389251709, "learning_rate": 6.552008626398409e-06, "loss": 0.3195, "step": 56400 }, { "epoch": 4.603976531942633, "grad_norm": 13.063721656799316, "learning_rate": 6.5384828132584335e-06, "loss": 0.2778, "step": 56500 }, { "epoch": 4.612125162972621, "grad_norm": 13.26430892944336, "learning_rate": 6.524944552783129e-06, "loss": 0.3081, "step": 56600 }, { "epoch": 4.620273794002608, "grad_norm": 14.221997261047363, "learning_rate": 6.511393954505906e-06, "loss": 0.3072, "step": 56700 }, { "epoch": 4.6284224250325945, "grad_norm": 10.34438705444336, "learning_rate": 6.497831128059993e-06, "loss": 0.3078, "step": 56800 }, { "epoch": 4.636571056062581, "grad_norm": 15.65034294128418, "learning_rate": 6.4842561831775575e-06, "loss": 0.3035, "step": 56900 }, { "epoch": 4.644719687092568, "grad_norm": 10.238895416259766, "learning_rate": 6.470669229688809e-06, "loss": 0.2962, "step": 57000 }, { "epoch": 4.652868318122556, "grad_norm": 16.671092987060547, "learning_rate": 6.457070377521111e-06, "loss": 0.307, "step": 57100 }, { "epoch": 4.661016949152542, "grad_norm": 11.118473052978516, "learning_rate": 6.443459736698106e-06, "loss": 0.3079, "step": 57200 }, { "epoch": 4.669165580182529, "grad_norm": 7.511115550994873, "learning_rate": 6.429837417338804e-06, "loss": 0.2959, "step": 57300 }, { "epoch": 4.677314211212517, "grad_norm": 14.2573881149292, "learning_rate": 6.416203529656707e-06, "loss": 0.2948, "step": 57400 }, { "epoch": 4.6854628422425035, "grad_norm": 11.03162956237793, "learning_rate": 6.40255818395891e-06, "loss": 0.3095, "step": 57500 }, { "epoch": 4.69361147327249, "grad_norm": 11.995973587036133, "learning_rate": 6.388901490645214e-06, "loss": 0.3099, "step": 57600 }, { "epoch": 4.701760104302477, "grad_norm": 9.43193244934082, "learning_rate": 6.375233560207229e-06, "loss": 0.3276, "step": 57700 }, { "epoch": 4.709908735332464, "grad_norm": 10.617565155029297, "learning_rate": 6.361554503227475e-06, "loss": 0.3149, "step": 57800 }, { "epoch": 4.718057366362451, "grad_norm": 16.004545211791992, "learning_rate": 6.347864430378501e-06, "loss": 0.2907, "step": 57900 }, { "epoch": 4.726205997392438, "grad_norm": 18.075027465820312, "learning_rate": 6.334163452421978e-06, "loss": 0.3168, "step": 58000 }, { "epoch": 4.734354628422425, "grad_norm": 19.736661911010742, "learning_rate": 6.320451680207805e-06, "loss": 0.3077, "step": 58100 }, { "epoch": 4.742503259452412, "grad_norm": 6.202484607696533, "learning_rate": 6.306729224673217e-06, "loss": 0.3022, "step": 58200 }, { "epoch": 4.750651890482399, "grad_norm": 4.973538398742676, "learning_rate": 6.29299619684188e-06, "loss": 0.3032, "step": 58300 }, { "epoch": 4.758800521512386, "grad_norm": 9.67834186553955, "learning_rate": 6.2792527078230024e-06, "loss": 0.2937, "step": 58400 }, { "epoch": 4.766949152542373, "grad_norm": 7.5604777336120605, "learning_rate": 6.265498868810424e-06, "loss": 0.3132, "step": 58500 }, { "epoch": 4.77509778357236, "grad_norm": 11.391521453857422, "learning_rate": 6.251734791081728e-06, "loss": 0.3249, "step": 58600 }, { "epoch": 4.783246414602347, "grad_norm": 16.40961265563965, "learning_rate": 6.237960585997334e-06, "loss": 0.2951, "step": 58700 }, { "epoch": 4.791395045632334, "grad_norm": 4.114518165588379, "learning_rate": 6.224176364999595e-06, "loss": 0.3091, "step": 58800 }, { "epoch": 4.799543676662321, "grad_norm": 9.569024085998535, "learning_rate": 6.210382239611906e-06, "loss": 0.3093, "step": 58900 }, { "epoch": 4.8076923076923075, "grad_norm": 30.753637313842773, "learning_rate": 6.1965783214377895e-06, "loss": 0.2982, "step": 59000 }, { "epoch": 4.815840938722294, "grad_norm": 7.500620365142822, "learning_rate": 6.18276472216e-06, "loss": 0.2956, "step": 59100 }, { "epoch": 4.823989569752282, "grad_norm": 14.710212707519531, "learning_rate": 6.16894155353962e-06, "loss": 0.3078, "step": 59200 }, { "epoch": 4.832138200782269, "grad_norm": 7.550549507141113, "learning_rate": 6.1551089274151525e-06, "loss": 0.3093, "step": 59300 }, { "epoch": 4.840286831812255, "grad_norm": 8.313648223876953, "learning_rate": 6.141266955701616e-06, "loss": 0.2872, "step": 59400 }, { "epoch": 4.848435462842242, "grad_norm": 3.505223274230957, "learning_rate": 6.127415750389645e-06, "loss": 0.2991, "step": 59500 }, { "epoch": 4.85658409387223, "grad_norm": 10.405817985534668, "learning_rate": 6.113555423544576e-06, "loss": 0.3083, "step": 59600 }, { "epoch": 4.8647327249022165, "grad_norm": 9.818922996520996, "learning_rate": 6.0996860873055505e-06, "loss": 0.3131, "step": 59700 }, { "epoch": 4.872881355932203, "grad_norm": 8.345934867858887, "learning_rate": 6.085807853884595e-06, "loss": 0.2963, "step": 59800 }, { "epoch": 4.88102998696219, "grad_norm": 10.804642677307129, "learning_rate": 6.071920835565724e-06, "loss": 0.315, "step": 59900 }, { "epoch": 4.889178617992178, "grad_norm": 10.550320625305176, "learning_rate": 6.058025144704026e-06, "loss": 0.288, "step": 60000 }, { "epoch": 4.897327249022164, "grad_norm": 7.386425018310547, "learning_rate": 6.044120893724758e-06, "loss": 0.3175, "step": 60100 }, { "epoch": 4.905475880052151, "grad_norm": 16.652528762817383, "learning_rate": 6.030208195122433e-06, "loss": 0.3218, "step": 60200 }, { "epoch": 4.913624511082138, "grad_norm": 15.053431510925293, "learning_rate": 6.016287161459907e-06, "loss": 0.2769, "step": 60300 }, { "epoch": 4.921773142112125, "grad_norm": 7.756086349487305, "learning_rate": 6.002357905367481e-06, "loss": 0.289, "step": 60400 }, { "epoch": 4.929921773142112, "grad_norm": 10.426520347595215, "learning_rate": 5.9884205395419725e-06, "loss": 0.3169, "step": 60500 }, { "epoch": 4.938070404172099, "grad_norm": 12.334880828857422, "learning_rate": 5.974475176745813e-06, "loss": 0.3093, "step": 60600 }, { "epoch": 4.946219035202086, "grad_norm": 14.239689826965332, "learning_rate": 5.960521929806141e-06, "loss": 0.3036, "step": 60700 }, { "epoch": 4.9543676662320735, "grad_norm": 12.593892097473145, "learning_rate": 5.946560911613877e-06, "loss": 0.2911, "step": 60800 }, { "epoch": 4.96251629726206, "grad_norm": 4.950251579284668, "learning_rate": 5.9325922351228186e-06, "loss": 0.2942, "step": 60900 }, { "epoch": 4.970664928292047, "grad_norm": 10.60743522644043, "learning_rate": 5.918616013348719e-06, "loss": 0.302, "step": 61000 }, { "epoch": 4.978813559322034, "grad_norm": 18.459735870361328, "learning_rate": 5.904632359368388e-06, "loss": 0.2806, "step": 61100 }, { "epoch": 4.9869621903520205, "grad_norm": 10.454113006591797, "learning_rate": 5.890641386318756e-06, "loss": 0.3009, "step": 61200 }, { "epoch": 4.995110821382008, "grad_norm": 12.8052396774292, "learning_rate": 5.876643207395976e-06, "loss": 0.3122, "step": 61300 }, { "epoch": 5.0, "eval_accuracy": 0.8285140562248996, "eval_loss": 0.5347269773483276, "eval_runtime": 6.815, "eval_samples_per_second": 365.37, "eval_steps_per_second": 45.781, "step": 61360 }, { "epoch": 5.003259452411995, "grad_norm": 3.523259162902832, "learning_rate": 5.862637935854502e-06, "loss": 0.289, "step": 61400 }, { "epoch": 5.011408083441982, "grad_norm": 14.498679161071777, "learning_rate": 5.848625685006164e-06, "loss": 0.2673, "step": 61500 }, { "epoch": 5.019556714471968, "grad_norm": 15.165558815002441, "learning_rate": 5.834606568219269e-06, "loss": 0.2499, "step": 61600 }, { "epoch": 5.027705345501956, "grad_norm": 12.705721855163574, "learning_rate": 5.820580698917666e-06, "loss": 0.2486, "step": 61700 }, { "epoch": 5.035853976531943, "grad_norm": 15.987256050109863, "learning_rate": 5.806548190579842e-06, "loss": 0.2417, "step": 61800 }, { "epoch": 5.0440026075619295, "grad_norm": 8.831116676330566, "learning_rate": 5.792509156737997e-06, "loss": 0.2265, "step": 61900 }, { "epoch": 5.052151238591916, "grad_norm": 12.182964324951172, "learning_rate": 5.7784637109771225e-06, "loss": 0.2538, "step": 62000 }, { "epoch": 5.060299869621904, "grad_norm": 10.809981346130371, "learning_rate": 5.764411966934092e-06, "loss": 0.2603, "step": 62100 }, { "epoch": 5.068448500651891, "grad_norm": 5.705296039581299, "learning_rate": 5.750354038296733e-06, "loss": 0.2438, "step": 62200 }, { "epoch": 5.076597131681877, "grad_norm": 9.95255184173584, "learning_rate": 5.736290038802911e-06, "loss": 0.234, "step": 62300 }, { "epoch": 5.084745762711864, "grad_norm": 7.724064350128174, "learning_rate": 5.722220082239608e-06, "loss": 0.2488, "step": 62400 }, { "epoch": 5.092894393741851, "grad_norm": 10.82822036743164, "learning_rate": 5.708144282442006e-06, "loss": 0.2591, "step": 62500 }, { "epoch": 5.101043024771839, "grad_norm": 8.642077445983887, "learning_rate": 5.694062753292559e-06, "loss": 0.2581, "step": 62600 }, { "epoch": 5.109191655801825, "grad_norm": 10.630475044250488, "learning_rate": 5.679975608720078e-06, "loss": 0.2408, "step": 62700 }, { "epoch": 5.117340286831812, "grad_norm": 10.559286117553711, "learning_rate": 5.665882962698801e-06, "loss": 0.2417, "step": 62800 }, { "epoch": 5.125488917861799, "grad_norm": 6.505354881286621, "learning_rate": 5.651784929247486e-06, "loss": 0.2517, "step": 62900 }, { "epoch": 5.1336375488917865, "grad_norm": 10.710380554199219, "learning_rate": 5.637681622428468e-06, "loss": 0.235, "step": 63000 }, { "epoch": 5.141786179921773, "grad_norm": 4.721646785736084, "learning_rate": 5.6235731563467535e-06, "loss": 0.2577, "step": 63100 }, { "epoch": 5.14993481095176, "grad_norm": 8.588154792785645, "learning_rate": 5.609459645149089e-06, "loss": 0.2728, "step": 63200 }, { "epoch": 5.158083441981747, "grad_norm": 19.248777389526367, "learning_rate": 5.595341203023044e-06, "loss": 0.2371, "step": 63300 }, { "epoch": 5.166232073011734, "grad_norm": 8.45293140411377, "learning_rate": 5.581217944196071e-06, "loss": 0.264, "step": 63400 }, { "epoch": 5.174380704041721, "grad_norm": 8.333393096923828, "learning_rate": 5.567089982934605e-06, "loss": 0.2558, "step": 63500 }, { "epoch": 5.182529335071708, "grad_norm": 14.054290771484375, "learning_rate": 5.552957433543119e-06, "loss": 0.2524, "step": 63600 }, { "epoch": 5.190677966101695, "grad_norm": 12.668076515197754, "learning_rate": 5.538820410363214e-06, "loss": 0.2408, "step": 63700 }, { "epoch": 5.198826597131682, "grad_norm": 9.344785690307617, "learning_rate": 5.524679027772676e-06, "loss": 0.2538, "step": 63800 }, { "epoch": 5.206975228161669, "grad_norm": 9.552376747131348, "learning_rate": 5.510533400184572e-06, "loss": 0.2535, "step": 63900 }, { "epoch": 5.215123859191656, "grad_norm": 10.270748138427734, "learning_rate": 5.496383642046311e-06, "loss": 0.2672, "step": 64000 }, { "epoch": 5.2232724902216425, "grad_norm": 15.067427635192871, "learning_rate": 5.4822298678387174e-06, "loss": 0.2455, "step": 64100 }, { "epoch": 5.23142112125163, "grad_norm": 5.5667948722839355, "learning_rate": 5.468072192075111e-06, "loss": 0.2539, "step": 64200 }, { "epoch": 5.239569752281617, "grad_norm": 11.088788032531738, "learning_rate": 5.453910729300378e-06, "loss": 0.2523, "step": 64300 }, { "epoch": 5.247718383311604, "grad_norm": 24.676876068115234, "learning_rate": 5.439745594090042e-06, "loss": 0.2488, "step": 64400 }, { "epoch": 5.25586701434159, "grad_norm": 9.937374114990234, "learning_rate": 5.425576901049342e-06, "loss": 0.2575, "step": 64500 }, { "epoch": 5.264015645371577, "grad_norm": 13.66021728515625, "learning_rate": 5.411404764812299e-06, "loss": 0.2396, "step": 64600 }, { "epoch": 5.272164276401565, "grad_norm": 11.568852424621582, "learning_rate": 5.3972293000407945e-06, "loss": 0.2398, "step": 64700 }, { "epoch": 5.280312907431552, "grad_norm": 9.292428970336914, "learning_rate": 5.383050621423639e-06, "loss": 0.2696, "step": 64800 }, { "epoch": 5.288461538461538, "grad_norm": 21.01643180847168, "learning_rate": 5.368868843675642e-06, "loss": 0.2522, "step": 64900 }, { "epoch": 5.296610169491525, "grad_norm": 7.557727813720703, "learning_rate": 5.354684081536693e-06, "loss": 0.2709, "step": 65000 }, { "epoch": 5.304758800521513, "grad_norm": 7.703597545623779, "learning_rate": 5.340496449770824e-06, "loss": 0.2561, "step": 65100 }, { "epoch": 5.3129074315514995, "grad_norm": 11.133892059326172, "learning_rate": 5.3263060631652805e-06, "loss": 0.2595, "step": 65200 }, { "epoch": 5.321056062581486, "grad_norm": 15.144754409790039, "learning_rate": 5.312113036529604e-06, "loss": 0.2506, "step": 65300 }, { "epoch": 5.329204693611473, "grad_norm": 7.959693431854248, "learning_rate": 5.297917484694692e-06, "loss": 0.2644, "step": 65400 }, { "epoch": 5.337353324641461, "grad_norm": 15.450654029846191, "learning_rate": 5.28371952251187e-06, "loss": 0.2533, "step": 65500 }, { "epoch": 5.345501955671447, "grad_norm": 6.035745620727539, "learning_rate": 5.269519264851967e-06, "loss": 0.2507, "step": 65600 }, { "epoch": 5.353650586701434, "grad_norm": 8.266439437866211, "learning_rate": 5.255316826604385e-06, "loss": 0.2588, "step": 65700 }, { "epoch": 5.361799217731421, "grad_norm": 9.542835235595703, "learning_rate": 5.24111232267617e-06, "loss": 0.2584, "step": 65800 }, { "epoch": 5.369947848761408, "grad_norm": 11.433173179626465, "learning_rate": 5.2269058679910735e-06, "loss": 0.2451, "step": 65900 }, { "epoch": 5.378096479791395, "grad_norm": 12.72153377532959, "learning_rate": 5.212697577488635e-06, "loss": 0.2496, "step": 66000 }, { "epoch": 5.386245110821382, "grad_norm": 9.416111946105957, "learning_rate": 5.1984875661232495e-06, "loss": 0.2567, "step": 66100 }, { "epoch": 5.394393741851369, "grad_norm": 15.701902389526367, "learning_rate": 5.184275948863231e-06, "loss": 0.2432, "step": 66200 }, { "epoch": 5.4025423728813555, "grad_norm": 7.241784572601318, "learning_rate": 5.1700628406898835e-06, "loss": 0.2441, "step": 66300 }, { "epoch": 5.410691003911343, "grad_norm": 21.102312088012695, "learning_rate": 5.155848356596581e-06, "loss": 0.2695, "step": 66400 }, { "epoch": 5.41883963494133, "grad_norm": 12.834817886352539, "learning_rate": 5.1416326115878255e-06, "loss": 0.2705, "step": 66500 }, { "epoch": 5.426988265971317, "grad_norm": 29.203624725341797, "learning_rate": 5.127415720678319e-06, "loss": 0.2354, "step": 66600 }, { "epoch": 5.435136897001303, "grad_norm": 13.500927925109863, "learning_rate": 5.113197798892038e-06, "loss": 0.2508, "step": 66700 }, { "epoch": 5.443285528031291, "grad_norm": 7.524002552032471, "learning_rate": 5.098978961261296e-06, "loss": 0.2494, "step": 66800 }, { "epoch": 5.451434159061278, "grad_norm": 17.00074577331543, "learning_rate": 5.084759322825821e-06, "loss": 0.241, "step": 66900 }, { "epoch": 5.459582790091265, "grad_norm": 11.755769729614258, "learning_rate": 5.070538998631813e-06, "loss": 0.2658, "step": 67000 }, { "epoch": 5.467731421121251, "grad_norm": 13.64929485321045, "learning_rate": 5.056318103731028e-06, "loss": 0.2515, "step": 67100 }, { "epoch": 5.475880052151239, "grad_norm": 6.673364639282227, "learning_rate": 5.042096753179835e-06, "loss": 0.2505, "step": 67200 }, { "epoch": 5.484028683181226, "grad_norm": 6.764876365661621, "learning_rate": 5.02787506203829e-06, "loss": 0.2584, "step": 67300 }, { "epoch": 5.4921773142112125, "grad_norm": 11.133795738220215, "learning_rate": 5.013653145369204e-06, "loss": 0.2598, "step": 67400 }, { "epoch": 5.500325945241199, "grad_norm": 11.689901351928711, "learning_rate": 4.9994311182372145e-06, "loss": 0.2397, "step": 67500 }, { "epoch": 5.508474576271187, "grad_norm": 18.084266662597656, "learning_rate": 4.985209095707852e-06, "loss": 0.265, "step": 67600 }, { "epoch": 5.516623207301174, "grad_norm": 12.136353492736816, "learning_rate": 4.970987192846609e-06, "loss": 0.2372, "step": 67700 }, { "epoch": 5.52477183833116, "grad_norm": 14.060345649719238, "learning_rate": 4.95676552471801e-06, "loss": 0.2657, "step": 67800 }, { "epoch": 5.532920469361147, "grad_norm": 5.493065357208252, "learning_rate": 4.942544206384682e-06, "loss": 0.2377, "step": 67900 }, { "epoch": 5.541069100391134, "grad_norm": 13.543553352355957, "learning_rate": 4.928323352906421e-06, "loss": 0.2456, "step": 68000 }, { "epoch": 5.5492177314211215, "grad_norm": 12.011448860168457, "learning_rate": 4.9141030793392595e-06, "loss": 0.2695, "step": 68100 }, { "epoch": 5.557366362451108, "grad_norm": 7.862688064575195, "learning_rate": 4.899883500734542e-06, "loss": 0.2668, "step": 68200 }, { "epoch": 5.565514993481095, "grad_norm": 11.895374298095703, "learning_rate": 4.885664732137988e-06, "loss": 0.2581, "step": 68300 }, { "epoch": 5.573663624511082, "grad_norm": 19.049335479736328, "learning_rate": 4.871446888588762e-06, "loss": 0.2581, "step": 68400 }, { "epoch": 5.581812255541069, "grad_norm": 15.173524856567383, "learning_rate": 4.85723008511855e-06, "loss": 0.2374, "step": 68500 }, { "epoch": 5.589960886571056, "grad_norm": 15.82532024383545, "learning_rate": 4.84301443675062e-06, "loss": 0.2548, "step": 68600 }, { "epoch": 5.598109517601043, "grad_norm": 9.289793014526367, "learning_rate": 4.828800058498889e-06, "loss": 0.2585, "step": 68700 }, { "epoch": 5.60625814863103, "grad_norm": 13.010422706604004, "learning_rate": 4.814587065367009e-06, "loss": 0.264, "step": 68800 }, { "epoch": 5.614406779661017, "grad_norm": 10.556730270385742, "learning_rate": 4.800375572347414e-06, "loss": 0.2436, "step": 68900 }, { "epoch": 5.622555410691004, "grad_norm": 13.723767280578613, "learning_rate": 4.786165694420408e-06, "loss": 0.2477, "step": 69000 }, { "epoch": 5.630704041720991, "grad_norm": 11.722618103027344, "learning_rate": 4.771957546553226e-06, "loss": 0.2581, "step": 69100 }, { "epoch": 5.638852672750978, "grad_norm": 10.373120307922363, "learning_rate": 4.757751243699109e-06, "loss": 0.2606, "step": 69200 }, { "epoch": 5.647001303780964, "grad_norm": 15.857172966003418, "learning_rate": 4.743546900796364e-06, "loss": 0.2723, "step": 69300 }, { "epoch": 5.655149934810952, "grad_norm": 22.450532913208008, "learning_rate": 4.729344632767446e-06, "loss": 0.235, "step": 69400 }, { "epoch": 5.663298565840939, "grad_norm": 15.469109535217285, "learning_rate": 4.71514455451802e-06, "loss": 0.2455, "step": 69500 }, { "epoch": 5.6714471968709255, "grad_norm": 21.650880813598633, "learning_rate": 4.7009467809360375e-06, "loss": 0.2597, "step": 69600 }, { "epoch": 5.679595827900913, "grad_norm": 16.47661590576172, "learning_rate": 4.6867514268907995e-06, "loss": 0.2555, "step": 69700 }, { "epoch": 5.6877444589309, "grad_norm": 16.370121002197266, "learning_rate": 4.672558607232033e-06, "loss": 0.2411, "step": 69800 }, { "epoch": 5.695893089960887, "grad_norm": 10.867352485656738, "learning_rate": 4.658368436788963e-06, "loss": 0.2638, "step": 69900 }, { "epoch": 5.704041720990873, "grad_norm": 13.257880210876465, "learning_rate": 4.644181030369378e-06, "loss": 0.233, "step": 70000 }, { "epoch": 5.71219035202086, "grad_norm": 16.66828155517578, "learning_rate": 4.629996502758703e-06, "loss": 0.2549, "step": 70100 }, { "epoch": 5.720338983050848, "grad_norm": 8.0230712890625, "learning_rate": 4.615814968719071e-06, "loss": 0.251, "step": 70200 }, { "epoch": 5.7284876140808345, "grad_norm": 20.61688804626465, "learning_rate": 4.6016365429884e-06, "loss": 0.2617, "step": 70300 }, { "epoch": 5.736636245110821, "grad_norm": 4.916039943695068, "learning_rate": 4.587461340279457e-06, "loss": 0.2772, "step": 70400 }, { "epoch": 5.744784876140808, "grad_norm": 13.59726333618164, "learning_rate": 4.573289475278927e-06, "loss": 0.2654, "step": 70500 }, { "epoch": 5.752933507170796, "grad_norm": 21.178253173828125, "learning_rate": 4.559121062646499e-06, "loss": 0.237, "step": 70600 }, { "epoch": 5.761082138200782, "grad_norm": 15.958664894104004, "learning_rate": 4.544956217013927e-06, "loss": 0.2447, "step": 70700 }, { "epoch": 5.769230769230769, "grad_norm": 7.610626220703125, "learning_rate": 4.530795052984104e-06, "loss": 0.239, "step": 70800 }, { "epoch": 5.777379400260756, "grad_norm": 10.934889793395996, "learning_rate": 4.5166376851301385e-06, "loss": 0.2562, "step": 70900 }, { "epoch": 5.7855280312907436, "grad_norm": 7.9625244140625, "learning_rate": 4.502484227994426e-06, "loss": 0.2606, "step": 71000 }, { "epoch": 5.79367666232073, "grad_norm": 15.313315391540527, "learning_rate": 4.488334796087719e-06, "loss": 0.2454, "step": 71100 }, { "epoch": 5.801825293350717, "grad_norm": 16.183135986328125, "learning_rate": 4.474189503888207e-06, "loss": 0.2591, "step": 71200 }, { "epoch": 5.809973924380704, "grad_norm": 8.89918041229248, "learning_rate": 4.4600484658405815e-06, "loss": 0.2577, "step": 71300 }, { "epoch": 5.818122555410691, "grad_norm": 8.31811237335205, "learning_rate": 4.445911796355119e-06, "loss": 0.2382, "step": 71400 }, { "epoch": 5.826271186440678, "grad_norm": 9.141270637512207, "learning_rate": 4.431779609806751e-06, "loss": 0.2401, "step": 71500 }, { "epoch": 5.834419817470665, "grad_norm": 8.92165756225586, "learning_rate": 4.4176520205341365e-06, "loss": 0.2133, "step": 71600 }, { "epoch": 5.842568448500652, "grad_norm": 14.15666675567627, "learning_rate": 4.403529142838745e-06, "loss": 0.2536, "step": 71700 }, { "epoch": 5.8507170795306385, "grad_norm": 8.742586135864258, "learning_rate": 4.38941109098392e-06, "loss": 0.261, "step": 71800 }, { "epoch": 5.858865710560626, "grad_norm": 8.7103853225708, "learning_rate": 4.375297979193965e-06, "loss": 0.2331, "step": 71900 }, { "epoch": 5.867014341590613, "grad_norm": 13.822142601013184, "learning_rate": 4.361189921653215e-06, "loss": 0.2583, "step": 72000 }, { "epoch": 5.8751629726206, "grad_norm": 9.043753623962402, "learning_rate": 4.3470870325051084e-06, "loss": 0.2635, "step": 72100 }, { "epoch": 5.883311603650586, "grad_norm": 10.288004875183105, "learning_rate": 4.332989425851273e-06, "loss": 0.2644, "step": 72200 }, { "epoch": 5.891460234680574, "grad_norm": 18.826217651367188, "learning_rate": 4.318897215750593e-06, "loss": 0.2515, "step": 72300 }, { "epoch": 5.899608865710561, "grad_norm": 11.778913497924805, "learning_rate": 4.304810516218298e-06, "loss": 0.2628, "step": 72400 }, { "epoch": 5.9077574967405475, "grad_norm": 16.54121971130371, "learning_rate": 4.290729441225027e-06, "loss": 0.2792, "step": 72500 }, { "epoch": 5.915906127770534, "grad_norm": 12.631098747253418, "learning_rate": 4.276654104695915e-06, "loss": 0.2503, "step": 72600 }, { "epoch": 5.924054758800521, "grad_norm": 10.706419944763184, "learning_rate": 4.262584620509669e-06, "loss": 0.2564, "step": 72700 }, { "epoch": 5.932203389830509, "grad_norm": 8.69650650024414, "learning_rate": 4.248521102497649e-06, "loss": 0.2569, "step": 72800 }, { "epoch": 5.940352020860495, "grad_norm": 12.438202857971191, "learning_rate": 4.23446366444294e-06, "loss": 0.2531, "step": 72900 }, { "epoch": 5.948500651890482, "grad_norm": 22.02505874633789, "learning_rate": 4.220412420079438e-06, "loss": 0.2692, "step": 73000 }, { "epoch": 5.95664928292047, "grad_norm": 13.650114059448242, "learning_rate": 4.206367483090931e-06, "loss": 0.2663, "step": 73100 }, { "epoch": 5.9647979139504566, "grad_norm": 13.705251693725586, "learning_rate": 4.192328967110172e-06, "loss": 0.2295, "step": 73200 }, { "epoch": 5.972946544980443, "grad_norm": 7.683305263519287, "learning_rate": 4.178296985717967e-06, "loss": 0.2622, "step": 73300 }, { "epoch": 5.98109517601043, "grad_norm": 7.798497676849365, "learning_rate": 4.16427165244225e-06, "loss": 0.2431, "step": 73400 }, { "epoch": 5.989243807040417, "grad_norm": 8.129569053649902, "learning_rate": 4.150253080757172e-06, "loss": 0.2372, "step": 73500 }, { "epoch": 5.9973924380704045, "grad_norm": 14.516979217529297, "learning_rate": 4.136241384082174e-06, "loss": 0.2801, "step": 73600 }, { "epoch": 6.0, "eval_accuracy": 0.8168674698795181, "eval_loss": 0.6053332686424255, "eval_runtime": 7.0202, "eval_samples_per_second": 354.691, "eval_steps_per_second": 44.443, "step": 73632 }, { "epoch": 6.005541069100391, "grad_norm": 11.174201965332031, "learning_rate": 4.122236675781071e-06, "loss": 0.224, "step": 73700 }, { "epoch": 6.013689700130378, "grad_norm": 24.070091247558594, "learning_rate": 4.108239069161147e-06, "loss": 0.2289, "step": 73800 }, { "epoch": 6.021838331160365, "grad_norm": 14.804594993591309, "learning_rate": 4.09424867747222e-06, "loss": 0.2017, "step": 73900 }, { "epoch": 6.029986962190352, "grad_norm": 20.014951705932617, "learning_rate": 4.0802656139057385e-06, "loss": 0.2203, "step": 74000 }, { "epoch": 6.038135593220339, "grad_norm": 11.608116149902344, "learning_rate": 4.066289991593859e-06, "loss": 0.1983, "step": 74100 }, { "epoch": 6.046284224250326, "grad_norm": 10.88152027130127, "learning_rate": 4.052321923608539e-06, "loss": 0.2167, "step": 74200 }, { "epoch": 6.054432855280313, "grad_norm": 9.91988754272461, "learning_rate": 4.038361522960609e-06, "loss": 0.2114, "step": 74300 }, { "epoch": 6.0625814863103, "grad_norm": 10.7438383102417, "learning_rate": 4.024408902598871e-06, "loss": 0.2126, "step": 74400 }, { "epoch": 6.070730117340287, "grad_norm": 13.341911315917969, "learning_rate": 4.01046417540918e-06, "loss": 0.2099, "step": 74500 }, { "epoch": 6.078878748370274, "grad_norm": 14.30612564086914, "learning_rate": 3.996527454213522e-06, "loss": 0.2159, "step": 74600 }, { "epoch": 6.0870273794002605, "grad_norm": 14.352286338806152, "learning_rate": 3.98259885176912e-06, "loss": 0.2314, "step": 74700 }, { "epoch": 6.095176010430248, "grad_norm": 10.346816062927246, "learning_rate": 3.968678480767503e-06, "loss": 0.2111, "step": 74800 }, { "epoch": 6.103324641460235, "grad_norm": 16.672042846679688, "learning_rate": 3.954766453833608e-06, "loss": 0.199, "step": 74900 }, { "epoch": 6.111473272490222, "grad_norm": 14.719056129455566, "learning_rate": 3.94086288352486e-06, "loss": 0.1996, "step": 75000 }, { "epoch": 6.119621903520208, "grad_norm": 15.159549713134766, "learning_rate": 3.926967882330262e-06, "loss": 0.2246, "step": 75100 }, { "epoch": 6.127770534550195, "grad_norm": 8.278336524963379, "learning_rate": 3.913081562669492e-06, "loss": 0.229, "step": 75200 }, { "epoch": 6.135919165580183, "grad_norm": 17.559757232666016, "learning_rate": 3.899204036891989e-06, "loss": 0.2012, "step": 75300 }, { "epoch": 6.1440677966101696, "grad_norm": 11.502748489379883, "learning_rate": 3.885335417276037e-06, "loss": 0.202, "step": 75400 }, { "epoch": 6.152216427640156, "grad_norm": 10.84666633605957, "learning_rate": 3.871475816027868e-06, "loss": 0.2142, "step": 75500 }, { "epoch": 6.160365058670143, "grad_norm": 15.855389595031738, "learning_rate": 3.857625345280751e-06, "loss": 0.2287, "step": 75600 }, { "epoch": 6.168513689700131, "grad_norm": 12.554780960083008, "learning_rate": 3.843784117094081e-06, "loss": 0.1949, "step": 75700 }, { "epoch": 6.1766623207301175, "grad_norm": 7.536383628845215, "learning_rate": 3.829952243452475e-06, "loss": 0.2062, "step": 75800 }, { "epoch": 6.184810951760104, "grad_norm": 13.602145195007324, "learning_rate": 3.816129836264864e-06, "loss": 0.2211, "step": 75900 }, { "epoch": 6.192959582790091, "grad_norm": 10.88949966430664, "learning_rate": 3.802317007363593e-06, "loss": 0.2141, "step": 76000 }, { "epoch": 6.201108213820079, "grad_norm": 3.1079776287078857, "learning_rate": 3.7885138685035113e-06, "loss": 0.2121, "step": 76100 }, { "epoch": 6.209256844850065, "grad_norm": 10.546631813049316, "learning_rate": 3.774720531361063e-06, "loss": 0.2272, "step": 76200 }, { "epoch": 6.217405475880052, "grad_norm": 22.11454200744629, "learning_rate": 3.7609371075334e-06, "loss": 0.2118, "step": 76300 }, { "epoch": 6.225554106910039, "grad_norm": 16.33343505859375, "learning_rate": 3.7471637085374614e-06, "loss": 0.227, "step": 76400 }, { "epoch": 6.2337027379400265, "grad_norm": 14.43807315826416, "learning_rate": 3.7334004458090833e-06, "loss": 0.2287, "step": 76500 }, { "epoch": 6.241851368970013, "grad_norm": 14.813934326171875, "learning_rate": 3.719647430702089e-06, "loss": 0.2064, "step": 76600 }, { "epoch": 6.25, "grad_norm": 5.587681770324707, "learning_rate": 3.705904774487396e-06, "loss": 0.2051, "step": 76700 }, { "epoch": 6.258148631029987, "grad_norm": 7.330463409423828, "learning_rate": 3.6921725883521087e-06, "loss": 0.2225, "step": 76800 }, { "epoch": 6.2662972620599735, "grad_norm": 19.726444244384766, "learning_rate": 3.678450983398623e-06, "loss": 0.2131, "step": 76900 }, { "epoch": 6.274445893089961, "grad_norm": 15.526715278625488, "learning_rate": 3.664740070643723e-06, "loss": 0.2257, "step": 77000 }, { "epoch": 6.282594524119948, "grad_norm": 9.113424301147461, "learning_rate": 3.6510399610176906e-06, "loss": 0.2075, "step": 77100 }, { "epoch": 6.290743155149935, "grad_norm": 11.527823448181152, "learning_rate": 3.6373507653634e-06, "loss": 0.1921, "step": 77200 }, { "epoch": 6.298891786179921, "grad_norm": 5.839615345001221, "learning_rate": 3.6236725944354245e-06, "loss": 0.2426, "step": 77300 }, { "epoch": 6.307040417209909, "grad_norm": 16.31635284423828, "learning_rate": 3.6100055588991435e-06, "loss": 0.206, "step": 77400 }, { "epoch": 6.315189048239896, "grad_norm": 13.138345718383789, "learning_rate": 3.5963497693298386e-06, "loss": 0.2223, "step": 77500 }, { "epoch": 6.3233376792698825, "grad_norm": 3.202862024307251, "learning_rate": 3.5827053362118085e-06, "loss": 0.2095, "step": 77600 }, { "epoch": 6.331486310299869, "grad_norm": 11.949639320373535, "learning_rate": 3.5690723699374697e-06, "loss": 0.2176, "step": 77700 }, { "epoch": 6.339634941329857, "grad_norm": 17.555377960205078, "learning_rate": 3.5554509808064602e-06, "loss": 0.2204, "step": 77800 }, { "epoch": 6.347783572359844, "grad_norm": 6.945880889892578, "learning_rate": 3.5418412790247575e-06, "loss": 0.2006, "step": 77900 }, { "epoch": 6.3559322033898304, "grad_norm": 29.10856056213379, "learning_rate": 3.528243374703776e-06, "loss": 0.2089, "step": 78000 }, { "epoch": 6.364080834419817, "grad_norm": 21.48233413696289, "learning_rate": 3.5146573778594855e-06, "loss": 0.2091, "step": 78100 }, { "epoch": 6.372229465449805, "grad_norm": 10.77776050567627, "learning_rate": 3.5010833984115135e-06, "loss": 0.1919, "step": 78200 }, { "epoch": 6.380378096479792, "grad_norm": 22.155200958251953, "learning_rate": 3.4875215461822574e-06, "loss": 0.2269, "step": 78300 }, { "epoch": 6.388526727509778, "grad_norm": 12.029594421386719, "learning_rate": 3.473971930896001e-06, "loss": 0.2328, "step": 78400 }, { "epoch": 6.396675358539765, "grad_norm": 8.563623428344727, "learning_rate": 3.460434662178024e-06, "loss": 0.2202, "step": 78500 }, { "epoch": 6.404823989569753, "grad_norm": 6.394750118255615, "learning_rate": 3.4469098495537063e-06, "loss": 0.2324, "step": 78600 }, { "epoch": 6.4129726205997395, "grad_norm": 15.485038757324219, "learning_rate": 3.433397602447659e-06, "loss": 0.2152, "step": 78700 }, { "epoch": 6.421121251629726, "grad_norm": 15.386170387268066, "learning_rate": 3.4198980301828256e-06, "loss": 0.2065, "step": 78800 }, { "epoch": 6.429269882659713, "grad_norm": 11.893247604370117, "learning_rate": 3.406411241979603e-06, "loss": 0.2235, "step": 78900 }, { "epoch": 6.4374185136897, "grad_norm": 12.216060638427734, "learning_rate": 3.3929373469549554e-06, "loss": 0.211, "step": 79000 }, { "epoch": 6.445567144719687, "grad_norm": 9.018731117248535, "learning_rate": 3.379476454121533e-06, "loss": 0.2253, "step": 79100 }, { "epoch": 6.453715775749674, "grad_norm": 18.289003372192383, "learning_rate": 3.366028672386792e-06, "loss": 0.2265, "step": 79200 }, { "epoch": 6.461864406779661, "grad_norm": 6.403520584106445, "learning_rate": 3.35259411055211e-06, "loss": 0.2241, "step": 79300 }, { "epoch": 6.470013037809648, "grad_norm": 6.311509609222412, "learning_rate": 3.3391728773119037e-06, "loss": 0.2204, "step": 79400 }, { "epoch": 6.478161668839635, "grad_norm": 16.21648597717285, "learning_rate": 3.3257650812527566e-06, "loss": 0.2083, "step": 79500 }, { "epoch": 6.486310299869622, "grad_norm": 2.8797686100006104, "learning_rate": 3.3123708308525354e-06, "loss": 0.2152, "step": 79600 }, { "epoch": 6.494458930899609, "grad_norm": 12.40995979309082, "learning_rate": 3.298990234479514e-06, "loss": 0.2061, "step": 79700 }, { "epoch": 6.5026075619295955, "grad_norm": 13.1309814453125, "learning_rate": 3.2856234003914945e-06, "loss": 0.196, "step": 79800 }, { "epoch": 6.510756192959583, "grad_norm": 11.270479202270508, "learning_rate": 3.2722704367349357e-06, "loss": 0.1969, "step": 79900 }, { "epoch": 6.51890482398957, "grad_norm": 5.54075813293457, "learning_rate": 3.258931451544075e-06, "loss": 0.2345, "step": 80000 }, { "epoch": 6.527053455019557, "grad_norm": 9.90404987335205, "learning_rate": 3.245606552740053e-06, "loss": 0.2223, "step": 80100 }, { "epoch": 6.5352020860495434, "grad_norm": 16.18077850341797, "learning_rate": 3.2322958481300426e-06, "loss": 0.2163, "step": 80200 }, { "epoch": 6.54335071707953, "grad_norm": 6.288787841796875, "learning_rate": 3.2189994454063776e-06, "loss": 0.2093, "step": 80300 }, { "epoch": 6.551499348109518, "grad_norm": 21.265981674194336, "learning_rate": 3.205717452145679e-06, "loss": 0.1972, "step": 80400 }, { "epoch": 6.559647979139505, "grad_norm": 14.27213191986084, "learning_rate": 3.1924499758079863e-06, "loss": 0.2211, "step": 80500 }, { "epoch": 6.567796610169491, "grad_norm": 6.663931369781494, "learning_rate": 3.1791971237358893e-06, "loss": 0.2037, "step": 80600 }, { "epoch": 6.575945241199479, "grad_norm": 20.920997619628906, "learning_rate": 3.1659590031536546e-06, "loss": 0.2016, "step": 80700 }, { "epoch": 6.584093872229466, "grad_norm": 5.427749156951904, "learning_rate": 3.1527357211663647e-06, "loss": 0.2145, "step": 80800 }, { "epoch": 6.5922425032594525, "grad_norm": 5.944066524505615, "learning_rate": 3.1395273847590444e-06, "loss": 0.2243, "step": 80900 }, { "epoch": 6.600391134289439, "grad_norm": 4.4831366539001465, "learning_rate": 3.1263341007958015e-06, "loss": 0.2251, "step": 81000 }, { "epoch": 6.608539765319426, "grad_norm": 7.92203950881958, "learning_rate": 3.113155976018959e-06, "loss": 0.2202, "step": 81100 }, { "epoch": 6.616688396349414, "grad_norm": 7.755978584289551, "learning_rate": 3.0999931170481922e-06, "loss": 0.2099, "step": 81200 }, { "epoch": 6.6248370273794, "grad_norm": 12.123492240905762, "learning_rate": 3.086845630379668e-06, "loss": 0.2279, "step": 81300 }, { "epoch": 6.632985658409387, "grad_norm": 8.695425987243652, "learning_rate": 3.073713622385177e-06, "loss": 0.2171, "step": 81400 }, { "epoch": 6.641134289439374, "grad_norm": 12.858569145202637, "learning_rate": 3.0605971993112805e-06, "loss": 0.21, "step": 81500 }, { "epoch": 6.6492829204693615, "grad_norm": 20.741817474365234, "learning_rate": 3.0474964672784456e-06, "loss": 0.2101, "step": 81600 }, { "epoch": 6.657431551499348, "grad_norm": 12.694851875305176, "learning_rate": 3.034411532280193e-06, "loss": 0.2119, "step": 81700 }, { "epoch": 6.665580182529335, "grad_norm": 11.025914192199707, "learning_rate": 3.0213425001822266e-06, "loss": 0.1936, "step": 81800 }, { "epoch": 6.673728813559322, "grad_norm": 8.600627899169922, "learning_rate": 3.008289476721594e-06, "loss": 0.2239, "step": 81900 }, { "epoch": 6.681877444589309, "grad_norm": 5.949343681335449, "learning_rate": 2.9952525675058175e-06, "loss": 0.2103, "step": 82000 }, { "epoch": 6.690026075619296, "grad_norm": 9.281770706176758, "learning_rate": 2.9822318780120463e-06, "loss": 0.2252, "step": 82100 }, { "epoch": 6.698174706649283, "grad_norm": 8.222912788391113, "learning_rate": 2.9692275135862002e-06, "loss": 0.199, "step": 82200 }, { "epoch": 6.70632333767927, "grad_norm": 10.598749160766602, "learning_rate": 2.9562395794421193e-06, "loss": 0.2244, "step": 82300 }, { "epoch": 6.7144719687092564, "grad_norm": 11.608291625976562, "learning_rate": 2.9432681806607145e-06, "loss": 0.2176, "step": 82400 }, { "epoch": 6.722620599739244, "grad_norm": 9.24106216430664, "learning_rate": 2.9303134221891106e-06, "loss": 0.2222, "step": 82500 }, { "epoch": 6.730769230769231, "grad_norm": 2.6706371307373047, "learning_rate": 2.917375408839803e-06, "loss": 0.2159, "step": 82600 }, { "epoch": 6.738917861799218, "grad_norm": 11.834959030151367, "learning_rate": 2.904454245289805e-06, "loss": 0.216, "step": 82700 }, { "epoch": 6.747066492829204, "grad_norm": 3.9120168685913086, "learning_rate": 2.8915500360798117e-06, "loss": 0.2051, "step": 82800 }, { "epoch": 6.755215123859192, "grad_norm": 9.347685813903809, "learning_rate": 2.8786628856133404e-06, "loss": 0.238, "step": 82900 }, { "epoch": 6.763363754889179, "grad_norm": 7.142603874206543, "learning_rate": 2.8657928981558926e-06, "loss": 0.2076, "step": 83000 }, { "epoch": 6.7715123859191655, "grad_norm": 15.814796447753906, "learning_rate": 2.852940177834111e-06, "loss": 0.2018, "step": 83100 }, { "epoch": 6.779661016949152, "grad_norm": 11.722209930419922, "learning_rate": 2.8401048286349353e-06, "loss": 0.2275, "step": 83200 }, { "epoch": 6.78780964797914, "grad_norm": 10.187668800354004, "learning_rate": 2.8272869544047622e-06, "loss": 0.2093, "step": 83300 }, { "epoch": 6.795958279009127, "grad_norm": 15.927581787109375, "learning_rate": 2.814486658848603e-06, "loss": 0.2065, "step": 83400 }, { "epoch": 6.804106910039113, "grad_norm": 12.883095741271973, "learning_rate": 2.8017040455292465e-06, "loss": 0.2108, "step": 83500 }, { "epoch": 6.8122555410691, "grad_norm": 7.530974864959717, "learning_rate": 2.788939217866422e-06, "loss": 0.2139, "step": 83600 }, { "epoch": 6.820404172099087, "grad_norm": 20.07868766784668, "learning_rate": 2.7761922791359596e-06, "loss": 0.2205, "step": 83700 }, { "epoch": 6.8285528031290745, "grad_norm": 7.615067481994629, "learning_rate": 2.7634633324689563e-06, "loss": 0.2067, "step": 83800 }, { "epoch": 6.836701434159061, "grad_norm": 10.10435962677002, "learning_rate": 2.7507524808509416e-06, "loss": 0.2284, "step": 83900 }, { "epoch": 6.844850065189048, "grad_norm": 12.469111442565918, "learning_rate": 2.738059827121046e-06, "loss": 0.2086, "step": 84000 }, { "epoch": 6.852998696219036, "grad_norm": 8.140021324157715, "learning_rate": 2.7253854739711634e-06, "loss": 0.2162, "step": 84100 }, { "epoch": 6.861147327249022, "grad_norm": 14.818914413452148, "learning_rate": 2.7127295239451273e-06, "loss": 0.2153, "step": 84200 }, { "epoch": 6.869295958279009, "grad_norm": 8.947492599487305, "learning_rate": 2.700092079437877e-06, "loss": 0.2073, "step": 84300 }, { "epoch": 6.877444589308996, "grad_norm": 8.173857688903809, "learning_rate": 2.687473242694629e-06, "loss": 0.2136, "step": 84400 }, { "epoch": 6.885593220338983, "grad_norm": 4.175146579742432, "learning_rate": 2.6748731158100528e-06, "loss": 0.2082, "step": 84500 }, { "epoch": 6.89374185136897, "grad_norm": 8.696370124816895, "learning_rate": 2.6622918007274406e-06, "loss": 0.2128, "step": 84600 }, { "epoch": 6.901890482398957, "grad_norm": 8.253527641296387, "learning_rate": 2.649729399237886e-06, "loss": 0.1985, "step": 84700 }, { "epoch": 6.910039113428944, "grad_norm": 9.825946807861328, "learning_rate": 2.6371860129794585e-06, "loss": 0.2084, "step": 84800 }, { "epoch": 6.918187744458931, "grad_norm": 21.79430389404297, "learning_rate": 2.624661743436383e-06, "loss": 0.2154, "step": 84900 }, { "epoch": 6.926336375488918, "grad_norm": 17.554534912109375, "learning_rate": 2.6121566919382168e-06, "loss": 0.2073, "step": 85000 }, { "epoch": 6.934485006518905, "grad_norm": 14.525189399719238, "learning_rate": 2.599670959659032e-06, "loss": 0.2136, "step": 85100 }, { "epoch": 6.942633637548892, "grad_norm": 17.66045570373535, "learning_rate": 2.5872046476165926e-06, "loss": 0.2259, "step": 85200 }, { "epoch": 6.9507822685788785, "grad_norm": 12.12194538116455, "learning_rate": 2.574757856671542e-06, "loss": 0.2303, "step": 85300 }, { "epoch": 6.958930899608866, "grad_norm": 16.121667861938477, "learning_rate": 2.5623306875265865e-06, "loss": 0.209, "step": 85400 }, { "epoch": 6.967079530638853, "grad_norm": 37.0359001159668, "learning_rate": 2.5499232407256764e-06, "loss": 0.2135, "step": 85500 }, { "epoch": 6.97522816166884, "grad_norm": 9.753621101379395, "learning_rate": 2.5375356166531974e-06, "loss": 0.2246, "step": 85600 }, { "epoch": 6.983376792698826, "grad_norm": 11.933328628540039, "learning_rate": 2.525167915533153e-06, "loss": 0.2083, "step": 85700 }, { "epoch": 6.991525423728813, "grad_norm": 11.32873821258545, "learning_rate": 2.512820237428366e-06, "loss": 0.221, "step": 85800 }, { "epoch": 6.999674054758801, "grad_norm": 10.335704803466797, "learning_rate": 2.5004926822396468e-06, "loss": 0.218, "step": 85900 }, { "epoch": 7.0, "eval_accuracy": 0.8200803212851405, "eval_loss": 0.6657418608665466, "eval_runtime": 6.9032, "eval_samples_per_second": 360.703, "eval_steps_per_second": 45.196, "step": 85904 }, { "epoch": 7.0078226857887875, "grad_norm": 13.04452133178711, "learning_rate": 2.4881853497050074e-06, "loss": 0.1828, "step": 86000 }, { "epoch": 7.015971316818774, "grad_norm": 11.350065231323242, "learning_rate": 2.475898339398842e-06, "loss": 0.1981, "step": 86100 }, { "epoch": 7.024119947848761, "grad_norm": 3.5544838905334473, "learning_rate": 2.463631750731125e-06, "loss": 0.1873, "step": 86200 }, { "epoch": 7.032268578878749, "grad_norm": 6.474255084991455, "learning_rate": 2.451385682946606e-06, "loss": 0.205, "step": 86300 }, { "epoch": 7.040417209908735, "grad_norm": 10.676136016845703, "learning_rate": 2.43916023512401e-06, "loss": 0.1702, "step": 86400 }, { "epoch": 7.048565840938722, "grad_norm": 6.142400741577148, "learning_rate": 2.4269555061752303e-06, "loss": 0.2017, "step": 86500 }, { "epoch": 7.056714471968709, "grad_norm": 16.273656845092773, "learning_rate": 2.4147715948445323e-06, "loss": 0.1776, "step": 86600 }, { "epoch": 7.064863102998697, "grad_norm": 22.690208435058594, "learning_rate": 2.4026085997077486e-06, "loss": 0.1762, "step": 86700 }, { "epoch": 7.073011734028683, "grad_norm": 14.49307632446289, "learning_rate": 2.390466619171492e-06, "loss": 0.1664, "step": 86800 }, { "epoch": 7.08116036505867, "grad_norm": 14.948646545410156, "learning_rate": 2.378345751472351e-06, "loss": 0.1953, "step": 86900 }, { "epoch": 7.089308996088657, "grad_norm": 12.674484252929688, "learning_rate": 2.3662460946760962e-06, "loss": 0.1932, "step": 87000 }, { "epoch": 7.0974576271186445, "grad_norm": 14.729815483093262, "learning_rate": 2.354167746676892e-06, "loss": 0.1814, "step": 87100 }, { "epoch": 7.105606258148631, "grad_norm": 16.739356994628906, "learning_rate": 2.3421108051964974e-06, "loss": 0.1761, "step": 87200 }, { "epoch": 7.113754889178618, "grad_norm": 16.266368865966797, "learning_rate": 2.330075367783479e-06, "loss": 0.1947, "step": 87300 }, { "epoch": 7.121903520208605, "grad_norm": 12.137019157409668, "learning_rate": 2.318061531812422e-06, "loss": 0.2017, "step": 87400 }, { "epoch": 7.130052151238592, "grad_norm": 7.073469161987305, "learning_rate": 2.3060693944831404e-06, "loss": 0.1746, "step": 87500 }, { "epoch": 7.138200782268579, "grad_norm": 7.888490200042725, "learning_rate": 2.294099052819893e-06, "loss": 0.1882, "step": 87600 }, { "epoch": 7.146349413298566, "grad_norm": 18.83835792541504, "learning_rate": 2.282150603670596e-06, "loss": 0.182, "step": 87700 }, { "epoch": 7.154498044328553, "grad_norm": 9.491145133972168, "learning_rate": 2.2702241437060463e-06, "loss": 0.1817, "step": 87800 }, { "epoch": 7.162646675358539, "grad_norm": 11.629495620727539, "learning_rate": 2.2583197694191272e-06, "loss": 0.1737, "step": 87900 }, { "epoch": 7.170795306388527, "grad_norm": 3.3986611366271973, "learning_rate": 2.246437577124038e-06, "loss": 0.1839, "step": 88000 }, { "epoch": 7.178943937418514, "grad_norm": 3.2696523666381836, "learning_rate": 2.2345776629555085e-06, "loss": 0.1896, "step": 88100 }, { "epoch": 7.1870925684485005, "grad_norm": 9.869660377502441, "learning_rate": 2.2227401228680275e-06, "loss": 0.2028, "step": 88200 }, { "epoch": 7.195241199478487, "grad_norm": 8.699070930480957, "learning_rate": 2.2109250526350584e-06, "loss": 0.2025, "step": 88300 }, { "epoch": 7.203389830508475, "grad_norm": 3.9306254386901855, "learning_rate": 2.1991325478482695e-06, "loss": 0.1827, "step": 88400 }, { "epoch": 7.211538461538462, "grad_norm": 18.14926528930664, "learning_rate": 2.187362703916766e-06, "loss": 0.1843, "step": 88500 }, { "epoch": 7.219687092568448, "grad_norm": 15.083455085754395, "learning_rate": 2.175615616066305e-06, "loss": 0.1932, "step": 88600 }, { "epoch": 7.227835723598435, "grad_norm": 14.958844184875488, "learning_rate": 2.163891379338535e-06, "loss": 0.1839, "step": 88700 }, { "epoch": 7.235984354628423, "grad_norm": 9.219823837280273, "learning_rate": 2.1521900885902214e-06, "loss": 0.205, "step": 88800 }, { "epoch": 7.24413298565841, "grad_norm": 10.361544609069824, "learning_rate": 2.1405118384924858e-06, "loss": 0.1942, "step": 88900 }, { "epoch": 7.252281616688396, "grad_norm": 7.847745418548584, "learning_rate": 2.128856723530033e-06, "loss": 0.2046, "step": 89000 }, { "epoch": 7.260430247718383, "grad_norm": 8.953947067260742, "learning_rate": 2.1172248380003853e-06, "loss": 0.1903, "step": 89100 }, { "epoch": 7.26857887874837, "grad_norm": 6.825370788574219, "learning_rate": 2.105616276013133e-06, "loss": 0.178, "step": 89200 }, { "epoch": 7.2767275097783575, "grad_norm": 10.48969554901123, "learning_rate": 2.0940311314891574e-06, "loss": 0.1778, "step": 89300 }, { "epoch": 7.284876140808344, "grad_norm": 13.994695663452148, "learning_rate": 2.082469498159879e-06, "loss": 0.1673, "step": 89400 }, { "epoch": 7.293024771838331, "grad_norm": 17.321313858032227, "learning_rate": 2.0709314695664957e-06, "loss": 0.2043, "step": 89500 }, { "epoch": 7.301173402868318, "grad_norm": 10.52856731414795, "learning_rate": 2.0594171390592294e-06, "loss": 0.1942, "step": 89600 }, { "epoch": 7.309322033898305, "grad_norm": 23.261329650878906, "learning_rate": 2.047926599796568e-06, "loss": 0.1816, "step": 89700 }, { "epoch": 7.317470664928292, "grad_norm": 6.534886360168457, "learning_rate": 2.0364599447445126e-06, "loss": 0.1808, "step": 89800 }, { "epoch": 7.325619295958279, "grad_norm": 12.067914962768555, "learning_rate": 2.0250172666758267e-06, "loss": 0.187, "step": 89900 }, { "epoch": 7.333767926988266, "grad_norm": 11.018478393554688, "learning_rate": 2.0135986581692817e-06, "loss": 0.1865, "step": 90000 }, { "epoch": 7.341916558018253, "grad_norm": 9.79710865020752, "learning_rate": 2.002204211608913e-06, "loss": 0.1987, "step": 90100 }, { "epoch": 7.35006518904824, "grad_norm": 15.164643287658691, "learning_rate": 1.990834019183268e-06, "loss": 0.1973, "step": 90200 }, { "epoch": 7.358213820078227, "grad_norm": 22.170740127563477, "learning_rate": 1.9794881728846642e-06, "loss": 0.1702, "step": 90300 }, { "epoch": 7.3663624511082135, "grad_norm": 8.200043678283691, "learning_rate": 1.968166764508442e-06, "loss": 0.183, "step": 90400 }, { "epoch": 7.374511082138201, "grad_norm": 6.145725250244141, "learning_rate": 1.9568698856522215e-06, "loss": 0.1906, "step": 90500 }, { "epoch": 7.382659713168188, "grad_norm": 22.14548683166504, "learning_rate": 1.945597627715166e-06, "loss": 0.1947, "step": 90600 }, { "epoch": 7.390808344198175, "grad_norm": 10.075164794921875, "learning_rate": 1.934350081897237e-06, "loss": 0.171, "step": 90700 }, { "epoch": 7.398956975228161, "grad_norm": 6.933922290802002, "learning_rate": 1.923127339198459e-06, "loss": 0.1845, "step": 90800 }, { "epoch": 7.407105606258149, "grad_norm": 26.223041534423828, "learning_rate": 1.9119294904181847e-06, "loss": 0.1852, "step": 90900 }, { "epoch": 7.415254237288136, "grad_norm": 4.778967380523682, "learning_rate": 1.900756626154356e-06, "loss": 0.1958, "step": 91000 }, { "epoch": 7.423402868318123, "grad_norm": 29.773698806762695, "learning_rate": 1.889608836802776e-06, "loss": 0.1809, "step": 91100 }, { "epoch": 7.431551499348109, "grad_norm": 8.9940767288208, "learning_rate": 1.8784862125563734e-06, "loss": 0.1869, "step": 91200 }, { "epoch": 7.439700130378096, "grad_norm": 15.34753704071045, "learning_rate": 1.8673888434044756e-06, "loss": 0.1863, "step": 91300 }, { "epoch": 7.447848761408084, "grad_norm": 19.44320297241211, "learning_rate": 1.8563168191320823e-06, "loss": 0.1798, "step": 91400 }, { "epoch": 7.4559973924380705, "grad_norm": 12.468984603881836, "learning_rate": 1.8452702293191339e-06, "loss": 0.1808, "step": 91500 }, { "epoch": 7.464146023468057, "grad_norm": 8.79600715637207, "learning_rate": 1.8342491633397863e-06, "loss": 0.1823, "step": 91600 }, { "epoch": 7.472294654498044, "grad_norm": 15.76307487487793, "learning_rate": 1.8232537103616953e-06, "loss": 0.1959, "step": 91700 }, { "epoch": 7.480443285528032, "grad_norm": 9.05780029296875, "learning_rate": 1.8122839593452902e-06, "loss": 0.1797, "step": 91800 }, { "epoch": 7.488591916558018, "grad_norm": 11.826004981994629, "learning_rate": 1.8013399990430525e-06, "loss": 0.1639, "step": 91900 }, { "epoch": 7.496740547588005, "grad_norm": 20.31383514404297, "learning_rate": 1.7904219179988007e-06, "loss": 0.1916, "step": 92000 }, { "epoch": 7.504889178617992, "grad_norm": 18.240629196166992, "learning_rate": 1.7795298045469766e-06, "loss": 0.1791, "step": 92100 }, { "epoch": 7.5130378096479795, "grad_norm": 20.392873764038086, "learning_rate": 1.7686637468119223e-06, "loss": 0.2021, "step": 92200 }, { "epoch": 7.521186440677966, "grad_norm": 9.732405662536621, "learning_rate": 1.757823832707175e-06, "loss": 0.1818, "step": 92300 }, { "epoch": 7.529335071707953, "grad_norm": 21.23190689086914, "learning_rate": 1.7470101499347498e-06, "loss": 0.1692, "step": 92400 }, { "epoch": 7.53748370273794, "grad_norm": 7.4514641761779785, "learning_rate": 1.736222785984435e-06, "loss": 0.2084, "step": 92500 }, { "epoch": 7.5456323337679265, "grad_norm": 13.29001522064209, "learning_rate": 1.7254618281330838e-06, "loss": 0.1897, "step": 92600 }, { "epoch": 7.553780964797914, "grad_norm": 9.683525085449219, "learning_rate": 1.7147273634439021e-06, "loss": 0.156, "step": 92700 }, { "epoch": 7.561929595827901, "grad_norm": 12.022348403930664, "learning_rate": 1.7040194787657566e-06, "loss": 0.2136, "step": 92800 }, { "epoch": 7.570078226857888, "grad_norm": 11.087843894958496, "learning_rate": 1.6933382607324572e-06, "loss": 0.171, "step": 92900 }, { "epoch": 7.578226857887875, "grad_norm": 20.101045608520508, "learning_rate": 1.6826837957620662e-06, "loss": 0.2131, "step": 93000 }, { "epoch": 7.586375488917862, "grad_norm": 13.087589263916016, "learning_rate": 1.672056170056196e-06, "loss": 0.1791, "step": 93100 }, { "epoch": 7.594524119947849, "grad_norm": 9.458551406860352, "learning_rate": 1.6614554695993085e-06, "loss": 0.1746, "step": 93200 }, { "epoch": 7.602672750977836, "grad_norm": 12.884553909301758, "learning_rate": 1.6508817801580268e-06, "loss": 0.1673, "step": 93300 }, { "epoch": 7.610821382007822, "grad_norm": 10.40186595916748, "learning_rate": 1.6403351872804347e-06, "loss": 0.1659, "step": 93400 }, { "epoch": 7.61897001303781, "grad_norm": 12.832286834716797, "learning_rate": 1.6298157762953897e-06, "loss": 0.1693, "step": 93500 }, { "epoch": 7.627118644067797, "grad_norm": 13.989652633666992, "learning_rate": 1.6193236323118283e-06, "loss": 0.203, "step": 93600 }, { "epoch": 7.6352672750977835, "grad_norm": 13.184144020080566, "learning_rate": 1.6088588402180783e-06, "loss": 0.1983, "step": 93700 }, { "epoch": 7.64341590612777, "grad_norm": 50.71080017089844, "learning_rate": 1.5984214846811735e-06, "loss": 0.1837, "step": 93800 }, { "epoch": 7.651564537157758, "grad_norm": 8.608222007751465, "learning_rate": 1.588011650146169e-06, "loss": 0.1786, "step": 93900 }, { "epoch": 7.659713168187745, "grad_norm": 9.973206520080566, "learning_rate": 1.5776294208354537e-06, "loss": 0.1873, "step": 94000 }, { "epoch": 7.667861799217731, "grad_norm": 3.6279351711273193, "learning_rate": 1.5672748807480736e-06, "loss": 0.1754, "step": 94100 }, { "epoch": 7.676010430247718, "grad_norm": 13.710479736328125, "learning_rate": 1.5569481136590554e-06, "loss": 0.1973, "step": 94200 }, { "epoch": 7.684159061277706, "grad_norm": 20.849790573120117, "learning_rate": 1.5466492031187174e-06, "loss": 0.1953, "step": 94300 }, { "epoch": 7.6923076923076925, "grad_norm": 16.05866241455078, "learning_rate": 1.5363782324520033e-06, "loss": 0.1834, "step": 94400 }, { "epoch": 7.700456323337679, "grad_norm": 10.594083786010742, "learning_rate": 1.5261352847578044e-06, "loss": 0.196, "step": 94500 }, { "epoch": 7.708604954367666, "grad_norm": 14.200790405273438, "learning_rate": 1.5159204429082874e-06, "loss": 0.1793, "step": 94600 }, { "epoch": 7.716753585397653, "grad_norm": 3.8873071670532227, "learning_rate": 1.5057337895482255e-06, "loss": 0.1865, "step": 94700 }, { "epoch": 7.72490221642764, "grad_norm": 13.96704387664795, "learning_rate": 1.4955754070943268e-06, "loss": 0.1653, "step": 94800 }, { "epoch": 7.733050847457627, "grad_norm": 23.539247512817383, "learning_rate": 1.48544537773457e-06, "loss": 0.1713, "step": 94900 }, { "epoch": 7.741199478487614, "grad_norm": 14.154293060302734, "learning_rate": 1.4753437834275397e-06, "loss": 0.1894, "step": 95000 }, { "epoch": 7.749348109517601, "grad_norm": 8.608110427856445, "learning_rate": 1.4652707059017607e-06, "loss": 0.1887, "step": 95100 }, { "epoch": 7.757496740547588, "grad_norm": 9.453892707824707, "learning_rate": 1.4552262266550382e-06, "loss": 0.1769, "step": 95200 }, { "epoch": 7.765645371577575, "grad_norm": 12.239083290100098, "learning_rate": 1.4452104269538009e-06, "loss": 0.1699, "step": 95300 }, { "epoch": 7.773794002607562, "grad_norm": 10.937909126281738, "learning_rate": 1.4352233878324384e-06, "loss": 0.1667, "step": 95400 }, { "epoch": 7.781942633637549, "grad_norm": 21.223346710205078, "learning_rate": 1.4252651900926496e-06, "loss": 0.182, "step": 95500 }, { "epoch": 7.790091264667536, "grad_norm": 7.070313453674316, "learning_rate": 1.4153359143027879e-06, "loss": 0.1896, "step": 95600 }, { "epoch": 7.798239895697523, "grad_norm": 14.346339225769043, "learning_rate": 1.4054356407972086e-06, "loss": 0.1743, "step": 95700 }, { "epoch": 7.80638852672751, "grad_norm": 15.966556549072266, "learning_rate": 1.3955644496756199e-06, "loss": 0.1902, "step": 95800 }, { "epoch": 7.8145371577574965, "grad_norm": 16.198644638061523, "learning_rate": 1.3857224208024345e-06, "loss": 0.1945, "step": 95900 }, { "epoch": 7.822685788787483, "grad_norm": 8.803377151489258, "learning_rate": 1.3759096338061222e-06, "loss": 0.1793, "step": 96000 }, { "epoch": 7.830834419817471, "grad_norm": 19.771717071533203, "learning_rate": 1.3661261680785693e-06, "loss": 0.1809, "step": 96100 }, { "epoch": 7.838983050847458, "grad_norm": 11.52552318572998, "learning_rate": 1.3563721027744309e-06, "loss": 0.1887, "step": 96200 }, { "epoch": 7.847131681877444, "grad_norm": 17.998104095458984, "learning_rate": 1.3466475168104953e-06, "loss": 0.2107, "step": 96300 }, { "epoch": 7.855280312907432, "grad_norm": 6.081639289855957, "learning_rate": 1.3369524888650437e-06, "loss": 0.1849, "step": 96400 }, { "epoch": 7.863428943937419, "grad_norm": 6.099484443664551, "learning_rate": 1.3272870973772118e-06, "loss": 0.1847, "step": 96500 }, { "epoch": 7.8715775749674055, "grad_norm": 19.433902740478516, "learning_rate": 1.3176514205463586e-06, "loss": 0.2, "step": 96600 }, { "epoch": 7.879726205997392, "grad_norm": 6.365217208862305, "learning_rate": 1.3080455363314309e-06, "loss": 0.2062, "step": 96700 }, { "epoch": 7.887874837027379, "grad_norm": 9.893994331359863, "learning_rate": 1.2984695224503351e-06, "loss": 0.1721, "step": 96800 }, { "epoch": 7.896023468057367, "grad_norm": 22.75550079345703, "learning_rate": 1.2889234563793058e-06, "loss": 0.204, "step": 96900 }, { "epoch": 7.904172099087353, "grad_norm": 2.8168067932128906, "learning_rate": 1.279407415352279e-06, "loss": 0.1963, "step": 97000 }, { "epoch": 7.91232073011734, "grad_norm": 19.346757888793945, "learning_rate": 1.2699214763602741e-06, "loss": 0.1845, "step": 97100 }, { "epoch": 7.920469361147327, "grad_norm": 13.861513137817383, "learning_rate": 1.2604657161507566e-06, "loss": 0.1934, "step": 97200 }, { "epoch": 7.9286179921773146, "grad_norm": 12.996659278869629, "learning_rate": 1.2510402112270326e-06, "loss": 0.1808, "step": 97300 }, { "epoch": 7.936766623207301, "grad_norm": 16.255569458007812, "learning_rate": 1.2416450378476196e-06, "loss": 0.1919, "step": 97400 }, { "epoch": 7.944915254237288, "grad_norm": 9.47265625, "learning_rate": 1.2322802720256355e-06, "loss": 0.1887, "step": 97500 }, { "epoch": 7.953063885267275, "grad_norm": 13.006512641906738, "learning_rate": 1.2229459895281787e-06, "loss": 0.1927, "step": 97600 }, { "epoch": 7.9612125162972625, "grad_norm": 13.849684715270996, "learning_rate": 1.213642265875718e-06, "loss": 0.1906, "step": 97700 }, { "epoch": 7.969361147327249, "grad_norm": 25.117225646972656, "learning_rate": 1.2043691763414844e-06, "loss": 0.1659, "step": 97800 }, { "epoch": 7.977509778357236, "grad_norm": 9.633444786071777, "learning_rate": 1.1951267959508562e-06, "loss": 0.1923, "step": 97900 }, { "epoch": 7.985658409387223, "grad_norm": 9.853534698486328, "learning_rate": 1.185915199480751e-06, "loss": 0.1969, "step": 98000 }, { "epoch": 7.9938070404172095, "grad_norm": 12.424792289733887, "learning_rate": 1.1767344614590303e-06, "loss": 0.1772, "step": 98100 }, { "epoch": 8.0, "eval_accuracy": 0.8200803212851405, "eval_loss": 0.7215536236763, "eval_runtime": 7.0555, "eval_samples_per_second": 352.917, "eval_steps_per_second": 44.221, "step": 98176 }, { "epoch": 8.001955671447197, "grad_norm": 13.437636375427246, "learning_rate": 1.167584656163887e-06, "loss": 0.1774, "step": 98200 }, { "epoch": 8.010104302477183, "grad_norm": 14.577449798583984, "learning_rate": 1.1584658576232482e-06, "loss": 0.1693, "step": 98300 }, { "epoch": 8.01825293350717, "grad_norm": 18.45952606201172, "learning_rate": 1.1493781396141795e-06, "loss": 0.17, "step": 98400 }, { "epoch": 8.026401564537158, "grad_norm": 18.29120635986328, "learning_rate": 1.1403215756622804e-06, "loss": 0.178, "step": 98500 }, { "epoch": 8.034550195567144, "grad_norm": 11.486896514892578, "learning_rate": 1.1312962390410954e-06, "loss": 0.1815, "step": 98600 }, { "epoch": 8.042698826597132, "grad_norm": 19.90141487121582, "learning_rate": 1.1223022027715197e-06, "loss": 0.1682, "step": 98700 }, { "epoch": 8.05084745762712, "grad_norm": 11.248079299926758, "learning_rate": 1.1133395396212048e-06, "loss": 0.169, "step": 98800 }, { "epoch": 8.058996088657105, "grad_norm": 7.839399814605713, "learning_rate": 1.104408322103978e-06, "loss": 0.1684, "step": 98900 }, { "epoch": 8.067144719687093, "grad_norm": 8.082372665405273, "learning_rate": 1.095508622479247e-06, "loss": 0.1769, "step": 99000 }, { "epoch": 8.075293350717079, "grad_norm": 9.952238082885742, "learning_rate": 1.0866405127514234e-06, "loss": 0.1866, "step": 99100 }, { "epoch": 8.083441981747066, "grad_norm": 5.250309467315674, "learning_rate": 1.0778040646693316e-06, "loss": 0.162, "step": 99200 }, { "epoch": 8.091590612777054, "grad_norm": 9.988779067993164, "learning_rate": 1.0689993497256336e-06, "loss": 0.177, "step": 99300 }, { "epoch": 8.09973924380704, "grad_norm": 8.978513717651367, "learning_rate": 1.0602264391562506e-06, "loss": 0.151, "step": 99400 }, { "epoch": 8.107887874837028, "grad_norm": 23.60556983947754, "learning_rate": 1.051485403939786e-06, "loss": 0.1734, "step": 99500 }, { "epoch": 8.116036505867015, "grad_norm": 10.938061714172363, "learning_rate": 1.0427763147969467e-06, "loss": 0.1733, "step": 99600 }, { "epoch": 8.124185136897001, "grad_norm": 5.527510643005371, "learning_rate": 1.0340992421899776e-06, "loss": 0.1565, "step": 99700 }, { "epoch": 8.132333767926989, "grad_norm": 9.493518829345703, "learning_rate": 1.0254542563220922e-06, "loss": 0.181, "step": 99800 }, { "epoch": 8.140482398956975, "grad_norm": 7.9793548583984375, "learning_rate": 1.0168414271368953e-06, "loss": 0.1837, "step": 99900 }, { "epoch": 8.148631029986962, "grad_norm": 11.252303123474121, "learning_rate": 1.0082608243178276e-06, "loss": 0.1708, "step": 100000 }, { "epoch": 8.15677966101695, "grad_norm": 14.102470397949219, "learning_rate": 9.997125172875943e-07, "loss": 0.1884, "step": 100100 }, { "epoch": 8.164928292046936, "grad_norm": 38.51998519897461, "learning_rate": 9.91196575207608e-07, "loss": 0.184, "step": 100200 }, { "epoch": 8.173076923076923, "grad_norm": 7.0270466804504395, "learning_rate": 9.82713066977427e-07, "loss": 0.1489, "step": 100300 }, { "epoch": 8.18122555410691, "grad_norm": 14.944999694824219, "learning_rate": 9.742620612341992e-07, "loss": 0.1835, "step": 100400 }, { "epoch": 8.189374185136897, "grad_norm": 7.147238731384277, "learning_rate": 9.658436263521048e-07, "loss": 0.1512, "step": 100500 }, { "epoch": 8.197522816166884, "grad_norm": 5.465837001800537, "learning_rate": 9.574578304418063e-07, "loss": 0.1702, "step": 100600 }, { "epoch": 8.20567144719687, "grad_norm": 4.3965630531311035, "learning_rate": 9.491047413498933e-07, "loss": 0.1619, "step": 100700 }, { "epoch": 8.213820078226858, "grad_norm": 21.602157592773438, "learning_rate": 9.407844266583377e-07, "loss": 0.1726, "step": 100800 }, { "epoch": 8.221968709256846, "grad_norm": 16.533201217651367, "learning_rate": 9.324969536839435e-07, "loss": 0.1564, "step": 100900 }, { "epoch": 8.230117340286832, "grad_norm": 17.454898834228516, "learning_rate": 9.242423894778046e-07, "loss": 0.1847, "step": 101000 }, { "epoch": 8.23826597131682, "grad_norm": 17.726686477661133, "learning_rate": 9.160208008247618e-07, "loss": 0.1695, "step": 101100 }, { "epoch": 8.246414602346805, "grad_norm": 31.844257354736328, "learning_rate": 9.078322542428597e-07, "loss": 0.1698, "step": 101200 }, { "epoch": 8.254563233376793, "grad_norm": 9.689949989318848, "learning_rate": 8.99676815982814e-07, "loss": 0.153, "step": 101300 }, { "epoch": 8.26271186440678, "grad_norm": 13.61907958984375, "learning_rate": 8.915545520274699e-07, "loss": 0.177, "step": 101400 }, { "epoch": 8.270860495436766, "grad_norm": 11.14121150970459, "learning_rate": 8.834655280912718e-07, "loss": 0.1674, "step": 101500 }, { "epoch": 8.279009126466754, "grad_norm": 12.197967529296875, "learning_rate": 8.754098096197312e-07, "loss": 0.1787, "step": 101600 }, { "epoch": 8.28715775749674, "grad_norm": 12.565035820007324, "learning_rate": 8.67387461788895e-07, "loss": 0.1679, "step": 101700 }, { "epoch": 8.295306388526727, "grad_norm": 21.256549835205078, "learning_rate": 8.593985495048201e-07, "loss": 0.1695, "step": 101800 }, { "epoch": 8.303455019556715, "grad_norm": 4.485990524291992, "learning_rate": 8.514431374030496e-07, "loss": 0.1654, "step": 101900 }, { "epoch": 8.3116036505867, "grad_norm": 13.213761329650879, "learning_rate": 8.435212898480855e-07, "loss": 0.1626, "step": 102000 }, { "epoch": 8.319752281616688, "grad_norm": 19.035646438598633, "learning_rate": 8.356330709328725e-07, "loss": 0.1611, "step": 102100 }, { "epoch": 8.327900912646676, "grad_norm": 21.1912841796875, "learning_rate": 8.277785444782765e-07, "loss": 0.1607, "step": 102200 }, { "epoch": 8.336049543676662, "grad_norm": 19.324132919311523, "learning_rate": 8.199577740325703e-07, "loss": 0.1741, "step": 102300 }, { "epoch": 8.34419817470665, "grad_norm": 8.325228691101074, "learning_rate": 8.121708228709174e-07, "loss": 0.1808, "step": 102400 }, { "epoch": 8.352346805736635, "grad_norm": 11.028812408447266, "learning_rate": 8.044177539948617e-07, "loss": 0.169, "step": 102500 }, { "epoch": 8.360495436766623, "grad_norm": 20.587303161621094, "learning_rate": 7.966986301318158e-07, "loss": 0.1569, "step": 102600 }, { "epoch": 8.36864406779661, "grad_norm": 8.49282455444336, "learning_rate": 7.890135137345589e-07, "loss": 0.1584, "step": 102700 }, { "epoch": 8.376792698826597, "grad_norm": 14.866241455078125, "learning_rate": 7.813624669807246e-07, "loss": 0.1608, "step": 102800 }, { "epoch": 8.384941329856584, "grad_norm": 3.761150598526001, "learning_rate": 7.73745551772298e-07, "loss": 0.1533, "step": 102900 }, { "epoch": 8.393089960886572, "grad_norm": 17.36056900024414, "learning_rate": 7.66162829735122e-07, "loss": 0.1723, "step": 103000 }, { "epoch": 8.401238591916558, "grad_norm": 14.63774585723877, "learning_rate": 7.586143622183922e-07, "loss": 0.1769, "step": 103100 }, { "epoch": 8.409387222946545, "grad_norm": 15.453008651733398, "learning_rate": 7.511002102941639e-07, "loss": 0.1845, "step": 103200 }, { "epoch": 8.417535853976531, "grad_norm": 23.958969116210938, "learning_rate": 7.436204347568548e-07, "loss": 0.1829, "step": 103300 }, { "epoch": 8.425684485006519, "grad_norm": 22.29449462890625, "learning_rate": 7.361750961227587e-07, "loss": 0.1722, "step": 103400 }, { "epoch": 8.433833116036507, "grad_norm": 12.636420249938965, "learning_rate": 7.287642546295487e-07, "loss": 0.1614, "step": 103500 }, { "epoch": 8.441981747066492, "grad_norm": 12.580671310424805, "learning_rate": 7.213879702357951e-07, "loss": 0.1713, "step": 103600 }, { "epoch": 8.45013037809648, "grad_norm": 9.213543891906738, "learning_rate": 7.140463026204764e-07, "loss": 0.1619, "step": 103700 }, { "epoch": 8.458279009126466, "grad_norm": 15.926830291748047, "learning_rate": 7.067393111825016e-07, "loss": 0.1748, "step": 103800 }, { "epoch": 8.466427640156454, "grad_norm": 22.008920669555664, "learning_rate": 6.994670550402249e-07, "loss": 0.1926, "step": 103900 }, { "epoch": 8.474576271186441, "grad_norm": 4.002703666687012, "learning_rate": 6.922295930309691e-07, "loss": 0.1613, "step": 104000 }, { "epoch": 8.482724902216427, "grad_norm": 10.932751655578613, "learning_rate": 6.850269837105522e-07, "loss": 0.1635, "step": 104100 }, { "epoch": 8.490873533246415, "grad_norm": 20.70867347717285, "learning_rate": 6.778592853528077e-07, "loss": 0.1708, "step": 104200 }, { "epoch": 8.499022164276402, "grad_norm": 9.567403793334961, "learning_rate": 6.707265559491188e-07, "loss": 0.1814, "step": 104300 }, { "epoch": 8.507170795306388, "grad_norm": 24.9285888671875, "learning_rate": 6.63628853207946e-07, "loss": 0.1746, "step": 104400 }, { "epoch": 8.515319426336376, "grad_norm": 12.97628402709961, "learning_rate": 6.565662345543595e-07, "loss": 0.17, "step": 104500 }, { "epoch": 8.523468057366362, "grad_norm": 5.221209526062012, "learning_rate": 6.495387571295785e-07, "loss": 0.1726, "step": 104600 }, { "epoch": 8.53161668839635, "grad_norm": 12.438835144042969, "learning_rate": 6.42546477790506e-07, "loss": 0.1703, "step": 104700 }, { "epoch": 8.539765319426337, "grad_norm": 9.98957633972168, "learning_rate": 6.355894531092705e-07, "loss": 0.1883, "step": 104800 }, { "epoch": 8.547913950456323, "grad_norm": 8.844900131225586, "learning_rate": 6.286677393727653e-07, "loss": 0.1623, "step": 104900 }, { "epoch": 8.55606258148631, "grad_norm": 5.921658039093018, "learning_rate": 6.217813925821958e-07, "loss": 0.16, "step": 105000 }, { "epoch": 8.564211212516298, "grad_norm": 12.132319450378418, "learning_rate": 6.149304684526253e-07, "loss": 0.1843, "step": 105100 }, { "epoch": 8.572359843546284, "grad_norm": 13.31769847869873, "learning_rate": 6.081150224125254e-07, "loss": 0.1586, "step": 105200 }, { "epoch": 8.580508474576272, "grad_norm": 21.240800857543945, "learning_rate": 6.013351096033254e-07, "loss": 0.1783, "step": 105300 }, { "epoch": 8.588657105606258, "grad_norm": 9.178833961486816, "learning_rate": 5.945907848789667e-07, "loss": 0.1847, "step": 105400 }, { "epoch": 8.596805736636245, "grad_norm": 7.893414497375488, "learning_rate": 5.878821028054637e-07, "loss": 0.1474, "step": 105500 }, { "epoch": 8.604954367666233, "grad_norm": 17.363147735595703, "learning_rate": 5.812091176604551e-07, "loss": 0.1567, "step": 105600 }, { "epoch": 8.613102998696219, "grad_norm": 7.612610340118408, "learning_rate": 5.745718834327679e-07, "loss": 0.158, "step": 105700 }, { "epoch": 8.621251629726206, "grad_norm": 12.395828247070312, "learning_rate": 5.679704538219827e-07, "loss": 0.1817, "step": 105800 }, { "epoch": 8.629400260756192, "grad_norm": 2.951467514038086, "learning_rate": 5.614048822379947e-07, "loss": 0.1731, "step": 105900 }, { "epoch": 8.63754889178618, "grad_norm": 14.023295402526855, "learning_rate": 5.548752218005882e-07, "loss": 0.1638, "step": 106000 }, { "epoch": 8.645697522816167, "grad_norm": 21.505937576293945, "learning_rate": 5.483815253389957e-07, "loss": 0.1529, "step": 106100 }, { "epoch": 8.653846153846153, "grad_norm": 8.31225299835205, "learning_rate": 5.41923845391486e-07, "loss": 0.1563, "step": 106200 }, { "epoch": 8.661994784876141, "grad_norm": 9.446884155273438, "learning_rate": 5.355022342049249e-07, "loss": 0.1622, "step": 106300 }, { "epoch": 8.670143415906129, "grad_norm": 21.06761360168457, "learning_rate": 5.291167437343608e-07, "loss": 0.1602, "step": 106400 }, { "epoch": 8.678292046936114, "grad_norm": 13.025223731994629, "learning_rate": 5.227674256426002e-07, "loss": 0.1611, "step": 106500 }, { "epoch": 8.686440677966102, "grad_norm": 6.65778923034668, "learning_rate": 5.164543312997922e-07, "loss": 0.1677, "step": 106600 }, { "epoch": 8.694589308996088, "grad_norm": 25.8751220703125, "learning_rate": 5.101775117830121e-07, "loss": 0.1639, "step": 106700 }, { "epoch": 8.702737940026076, "grad_norm": 18.437524795532227, "learning_rate": 5.039370178758485e-07, "loss": 0.1651, "step": 106800 }, { "epoch": 8.710886571056063, "grad_norm": 31.746627807617188, "learning_rate": 4.977329000679903e-07, "loss": 0.1758, "step": 106900 }, { "epoch": 8.719035202086049, "grad_norm": 12.55679988861084, "learning_rate": 4.915652085548217e-07, "loss": 0.1571, "step": 107000 }, { "epoch": 8.727183833116037, "grad_norm": 1.4074722528457642, "learning_rate": 4.854339932370134e-07, "loss": 0.1526, "step": 107100 }, { "epoch": 8.735332464146023, "grad_norm": 5.811018466949463, "learning_rate": 4.793393037201194e-07, "loss": 0.1745, "step": 107200 }, { "epoch": 8.74348109517601, "grad_norm": 2.8639020919799805, "learning_rate": 4.7328118931417753e-07, "loss": 0.1695, "step": 107300 }, { "epoch": 8.751629726205998, "grad_norm": 20.180130004882812, "learning_rate": 4.672596990333073e-07, "loss": 0.1758, "step": 107400 }, { "epoch": 8.759778357235984, "grad_norm": 19.003700256347656, "learning_rate": 4.6127488159531495e-07, "loss": 0.1669, "step": 107500 }, { "epoch": 8.767926988265971, "grad_norm": 12.393278121948242, "learning_rate": 4.553267854213017e-07, "loss": 0.1827, "step": 107600 }, { "epoch": 8.776075619295959, "grad_norm": 23.79950714111328, "learning_rate": 4.494154586352667e-07, "loss": 0.1571, "step": 107700 }, { "epoch": 8.784224250325945, "grad_norm": 21.107633590698242, "learning_rate": 4.435409490637227e-07, "loss": 0.1744, "step": 107800 }, { "epoch": 8.792372881355933, "grad_norm": 15.573356628417969, "learning_rate": 4.3770330423530626e-07, "loss": 0.1675, "step": 107900 }, { "epoch": 8.800521512385918, "grad_norm": 14.63633918762207, "learning_rate": 4.3190257138039313e-07, "loss": 0.1667, "step": 108000 }, { "epoch": 8.808670143415906, "grad_norm": 15.823701858520508, "learning_rate": 4.2613879743071907e-07, "loss": 0.164, "step": 108100 }, { "epoch": 8.816818774445894, "grad_norm": 7.163984775543213, "learning_rate": 4.204120290189956e-07, "loss": 0.1648, "step": 108200 }, { "epoch": 8.82496740547588, "grad_norm": 10.87267780303955, "learning_rate": 4.147223124785366e-07, "loss": 0.1767, "step": 108300 }, { "epoch": 8.833116036505867, "grad_norm": 13.024577140808105, "learning_rate": 4.0906969384288396e-07, "loss": 0.1561, "step": 108400 }, { "epoch": 8.841264667535853, "grad_norm": 15.831514358520508, "learning_rate": 4.034542188454282e-07, "loss": 0.2002, "step": 108500 }, { "epoch": 8.84941329856584, "grad_norm": 8.199058532714844, "learning_rate": 3.9787593291904793e-07, "loss": 0.1823, "step": 108600 }, { "epoch": 8.857561929595828, "grad_norm": 14.69583511352539, "learning_rate": 3.9233488119573506e-07, "loss": 0.1779, "step": 108700 }, { "epoch": 8.865710560625814, "grad_norm": 12.765257835388184, "learning_rate": 3.868311085062337e-07, "loss": 0.1626, "step": 108800 }, { "epoch": 8.873859191655802, "grad_norm": 31.990026473999023, "learning_rate": 3.8136465937967657e-07, "loss": 0.1856, "step": 108900 }, { "epoch": 8.88200782268579, "grad_norm": 24.627126693725586, "learning_rate": 3.7593557804322167e-07, "loss": 0.1518, "step": 109000 }, { "epoch": 8.890156453715775, "grad_norm": 32.763092041015625, "learning_rate": 3.705439084217016e-07, "loss": 0.1526, "step": 109100 }, { "epoch": 8.898305084745763, "grad_norm": 14.418821334838867, "learning_rate": 3.6518969413725905e-07, "loss": 0.1602, "step": 109200 }, { "epoch": 8.906453715775749, "grad_norm": 9.382340431213379, "learning_rate": 3.5987297850900217e-07, "loss": 0.1742, "step": 109300 }, { "epoch": 8.914602346805736, "grad_norm": 22.482595443725586, "learning_rate": 3.5459380455264594e-07, "loss": 0.1737, "step": 109400 }, { "epoch": 8.922750977835724, "grad_norm": 18.5339412689209, "learning_rate": 3.4935221498017316e-07, "loss": 0.1581, "step": 109500 }, { "epoch": 8.93089960886571, "grad_norm": 21.965267181396484, "learning_rate": 3.4414825219948153e-07, "loss": 0.1597, "step": 109600 }, { "epoch": 8.939048239895698, "grad_norm": 13.353527069091797, "learning_rate": 3.3898195831404354e-07, "loss": 0.1747, "step": 109700 }, { "epoch": 8.947196870925685, "grad_norm": 7.977973461151123, "learning_rate": 3.3385337512256863e-07, "loss": 0.1562, "step": 109800 }, { "epoch": 8.955345501955671, "grad_norm": 9.263310432434082, "learning_rate": 3.287625441186576e-07, "loss": 0.1772, "step": 109900 }, { "epoch": 8.963494132985659, "grad_norm": 13.787714958190918, "learning_rate": 3.2370950649047383e-07, "loss": 0.1976, "step": 110000 }, { "epoch": 8.971642764015645, "grad_norm": 20.066761016845703, "learning_rate": 3.1869430312040816e-07, "loss": 0.1596, "step": 110100 }, { "epoch": 8.979791395045632, "grad_norm": 20.64689826965332, "learning_rate": 3.137169745847435e-07, "loss": 0.1704, "step": 110200 }, { "epoch": 8.98794002607562, "grad_norm": 46.617713928222656, "learning_rate": 3.08777561153335e-07, "loss": 0.1889, "step": 110300 }, { "epoch": 8.996088657105606, "grad_norm": 14.401327133178711, "learning_rate": 3.0387610278927725e-07, "loss": 0.1702, "step": 110400 }, { "epoch": 9.0, "eval_accuracy": 0.8196787148594378, "eval_loss": 0.7465346455574036, "eval_runtime": 7.168, "eval_samples_per_second": 347.378, "eval_steps_per_second": 43.527, "step": 110448 }, { "epoch": 9.004237288135593, "grad_norm": 15.593995094299316, "learning_rate": 2.990126391485848e-07, "loss": 0.1722, "step": 110500 }, { "epoch": 9.01238591916558, "grad_norm": 4.0746636390686035, "learning_rate": 2.941872095798698e-07, "loss": 0.1346, "step": 110600 }, { "epoch": 9.020534550195567, "grad_norm": 6.78621768951416, "learning_rate": 2.893998531240222e-07, "loss": 0.1819, "step": 110700 }, { "epoch": 9.028683181225555, "grad_norm": 16.810945510864258, "learning_rate": 2.8465060851389725e-07, "loss": 0.152, "step": 110800 }, { "epoch": 9.03683181225554, "grad_norm": 2.5170655250549316, "learning_rate": 2.7993951417400025e-07, "loss": 0.1737, "step": 110900 }, { "epoch": 9.044980443285528, "grad_norm": 5.630674362182617, "learning_rate": 2.752666082201727e-07, "loss": 0.1703, "step": 111000 }, { "epoch": 9.053129074315516, "grad_norm": 29.249120712280273, "learning_rate": 2.7063192845929286e-07, "loss": 0.1648, "step": 111100 }, { "epoch": 9.061277705345502, "grad_norm": 7.27542781829834, "learning_rate": 2.660355123889585e-07, "loss": 0.1483, "step": 111200 }, { "epoch": 9.06942633637549, "grad_norm": 27.242809295654297, "learning_rate": 2.614773971971929e-07, "loss": 0.1693, "step": 111300 }, { "epoch": 9.077574967405475, "grad_norm": 15.899724006652832, "learning_rate": 2.5695761976213704e-07, "loss": 0.1562, "step": 111400 }, { "epoch": 9.085723598435463, "grad_norm": 20.975248336791992, "learning_rate": 2.5247621665175636e-07, "loss": 0.1558, "step": 111500 }, { "epoch": 9.09387222946545, "grad_norm": 17.303001403808594, "learning_rate": 2.4803322412354227e-07, "loss": 0.1594, "step": 111600 }, { "epoch": 9.102020860495436, "grad_norm": 14.3364839553833, "learning_rate": 2.436286781242192e-07, "loss": 0.1558, "step": 111700 }, { "epoch": 9.110169491525424, "grad_norm": 18.47357940673828, "learning_rate": 2.3926261428945386e-07, "loss": 0.1713, "step": 111800 }, { "epoch": 9.118318122555412, "grad_norm": 2.021436929702759, "learning_rate": 2.3493506794356745e-07, "loss": 0.1577, "step": 111900 }, { "epoch": 9.126466753585397, "grad_norm": 4.512004852294922, "learning_rate": 2.3064607409924888e-07, "loss": 0.1552, "step": 112000 }, { "epoch": 9.134615384615385, "grad_norm": 21.13969612121582, "learning_rate": 2.2639566745727203e-07, "loss": 0.1504, "step": 112100 }, { "epoch": 9.142764015645371, "grad_norm": 17.030675888061523, "learning_rate": 2.2218388240621558e-07, "loss": 0.1785, "step": 112200 }, { "epoch": 9.150912646675359, "grad_norm": 11.586610794067383, "learning_rate": 2.1801075302218423e-07, "loss": 0.174, "step": 112300 }, { "epoch": 9.159061277705346, "grad_norm": 19.795167922973633, "learning_rate": 2.1387631306853174e-07, "loss": 0.1672, "step": 112400 }, { "epoch": 9.167209908735332, "grad_norm": 23.909713745117188, "learning_rate": 2.0978059599559065e-07, "loss": 0.1684, "step": 112500 }, { "epoch": 9.17535853976532, "grad_norm": 5.545074939727783, "learning_rate": 2.057236349403985e-07, "loss": 0.165, "step": 112600 }, { "epoch": 9.183507170795306, "grad_norm": 12.588091850280762, "learning_rate": 2.0170546272643256e-07, "loss": 0.167, "step": 112700 }, { "epoch": 9.191655801825293, "grad_norm": 12.73204517364502, "learning_rate": 1.9772611186334168e-07, "loss": 0.1535, "step": 112800 }, { "epoch": 9.19980443285528, "grad_norm": 11.712594985961914, "learning_rate": 1.9378561454668598e-07, "loss": 0.1629, "step": 112900 }, { "epoch": 9.207953063885267, "grad_norm": 6.922073841094971, "learning_rate": 1.8988400265767316e-07, "loss": 0.1544, "step": 113000 }, { "epoch": 9.216101694915254, "grad_norm": 14.258295059204102, "learning_rate": 1.8602130776290362e-07, "loss": 0.1575, "step": 113100 }, { "epoch": 9.224250325945242, "grad_norm": 20.113460540771484, "learning_rate": 1.8219756111411357e-07, "loss": 0.151, "step": 113200 }, { "epoch": 9.232398956975228, "grad_norm": 9.496116638183594, "learning_rate": 1.784127936479213e-07, "loss": 0.1791, "step": 113300 }, { "epoch": 9.240547588005215, "grad_norm": 7.643208026885986, "learning_rate": 1.7466703598557898e-07, "loss": 0.1752, "step": 113400 }, { "epoch": 9.248696219035201, "grad_norm": 21.511184692382812, "learning_rate": 1.709603184327241e-07, "loss": 0.1538, "step": 113500 }, { "epoch": 9.256844850065189, "grad_norm": 18.147607803344727, "learning_rate": 1.6729267097913338e-07, "loss": 0.1606, "step": 113600 }, { "epoch": 9.264993481095177, "grad_norm": 13.48155689239502, "learning_rate": 1.6366412329848035e-07, "loss": 0.1661, "step": 113700 }, { "epoch": 9.273142112125162, "grad_norm": 21.713895797729492, "learning_rate": 1.6007470474809772e-07, "loss": 0.157, "step": 113800 }, { "epoch": 9.28129074315515, "grad_norm": 11.30298137664795, "learning_rate": 1.565244443687347e-07, "loss": 0.1802, "step": 113900 }, { "epoch": 9.289439374185136, "grad_norm": 15.809433937072754, "learning_rate": 1.5301337088432787e-07, "loss": 0.1723, "step": 114000 }, { "epoch": 9.297588005215124, "grad_norm": 8.747072219848633, "learning_rate": 1.4954151270176686e-07, "loss": 0.1616, "step": 114100 }, { "epoch": 9.305736636245111, "grad_norm": 1.6549293994903564, "learning_rate": 1.4610889791066008e-07, "loss": 0.1732, "step": 114200 }, { "epoch": 9.313885267275097, "grad_norm": 13.10067367553711, "learning_rate": 1.4271555428311323e-07, "loss": 0.1618, "step": 114300 }, { "epoch": 9.322033898305085, "grad_norm": 13.006690979003906, "learning_rate": 1.39361509273504e-07, "loss": 0.1806, "step": 114400 }, { "epoch": 9.330182529335072, "grad_norm": 23.973905563354492, "learning_rate": 1.3604679001825605e-07, "loss": 0.1678, "step": 114500 }, { "epoch": 9.338331160365058, "grad_norm": 10.249641418457031, "learning_rate": 1.3277142333562253e-07, "loss": 0.1646, "step": 114600 }, { "epoch": 9.346479791395046, "grad_norm": 30.132413864135742, "learning_rate": 1.2953543572546968e-07, "loss": 0.1635, "step": 114700 }, { "epoch": 9.354628422425032, "grad_norm": 13.259139060974121, "learning_rate": 1.2633885336906014e-07, "loss": 0.172, "step": 114800 }, { "epoch": 9.36277705345502, "grad_norm": 19.1724853515625, "learning_rate": 1.2318170212884285e-07, "loss": 0.1633, "step": 114900 }, { "epoch": 9.370925684485007, "grad_norm": 14.311450004577637, "learning_rate": 1.2006400754824177e-07, "loss": 0.1747, "step": 115000 }, { "epoch": 9.379074315514993, "grad_norm": 8.39560317993164, "learning_rate": 1.1698579485145134e-07, "loss": 0.1441, "step": 115100 }, { "epoch": 9.38722294654498, "grad_norm": 10.600957870483398, "learning_rate": 1.1394708894323314e-07, "loss": 0.1923, "step": 115200 }, { "epoch": 9.395371577574968, "grad_norm": 9.45894718170166, "learning_rate": 1.1094791440871e-07, "loss": 0.1476, "step": 115300 }, { "epoch": 9.403520208604954, "grad_norm": 6.497547149658203, "learning_rate": 1.079882955131728e-07, "loss": 0.1621, "step": 115400 }, { "epoch": 9.411668839634942, "grad_norm": 5.700404644012451, "learning_rate": 1.0506825620187954e-07, "loss": 0.1569, "step": 115500 }, { "epoch": 9.419817470664928, "grad_norm": 5.055960655212402, "learning_rate": 1.0218782009986494e-07, "loss": 0.1439, "step": 115600 }, { "epoch": 9.427966101694915, "grad_norm": 0.8036000728607178, "learning_rate": 9.93470105117461e-08, "loss": 0.163, "step": 115700 }, { "epoch": 9.436114732724903, "grad_norm": 21.1984920501709, "learning_rate": 9.654585042153663e-08, "loss": 0.153, "step": 115800 }, { "epoch": 9.444263363754889, "grad_norm": 3.3010308742523193, "learning_rate": 9.378436249245892e-08, "loss": 0.1584, "step": 115900 }, { "epoch": 9.452411994784876, "grad_norm": 9.636171340942383, "learning_rate": 9.106256906676159e-08, "loss": 0.1765, "step": 116000 }, { "epoch": 9.460560625814864, "grad_norm": 1.7043323516845703, "learning_rate": 8.838049216554123e-08, "loss": 0.1604, "step": 116100 }, { "epoch": 9.46870925684485, "grad_norm": 9.73293399810791, "learning_rate": 8.573815348855818e-08, "loss": 0.1703, "step": 116200 }, { "epoch": 9.476857887874838, "grad_norm": 7.777896404266357, "learning_rate": 8.313557441406606e-08, "loss": 0.1632, "step": 116300 }, { "epoch": 9.485006518904823, "grad_norm": 17.46415901184082, "learning_rate": 8.057277599863744e-08, "loss": 0.1536, "step": 116400 }, { "epoch": 9.493155149934811, "grad_norm": 10.912395477294922, "learning_rate": 7.804977897699295e-08, "loss": 0.1611, "step": 116500 }, { "epoch": 9.501303780964799, "grad_norm": 12.858296394348145, "learning_rate": 7.556660376183301e-08, "loss": 0.1458, "step": 116600 }, { "epoch": 9.509452411994785, "grad_norm": 7.577301025390625, "learning_rate": 7.312327044367463e-08, "loss": 0.1408, "step": 116700 }, { "epoch": 9.517601043024772, "grad_norm": 13.470318794250488, "learning_rate": 7.071979879068769e-08, "loss": 0.1568, "step": 116800 }, { "epoch": 9.525749674054758, "grad_norm": 16.199295043945312, "learning_rate": 6.835620824853451e-08, "loss": 0.161, "step": 116900 }, { "epoch": 9.533898305084746, "grad_norm": 15.154216766357422, "learning_rate": 6.603251794021381e-08, "loss": 0.1783, "step": 117000 }, { "epoch": 9.542046936114733, "grad_norm": 9.926989555358887, "learning_rate": 6.374874666590369e-08, "loss": 0.149, "step": 117100 }, { "epoch": 9.55019556714472, "grad_norm": 14.719680786132812, "learning_rate": 6.15049129028128e-08, "loss": 0.1459, "step": 117200 }, { "epoch": 9.558344198174707, "grad_norm": 23.45909881591797, "learning_rate": 5.93010348050288e-08, "loss": 0.1624, "step": 117300 }, { "epoch": 9.566492829204694, "grad_norm": 22.256080627441406, "learning_rate": 5.7137130203370194e-08, "loss": 0.1536, "step": 117400 }, { "epoch": 9.57464146023468, "grad_norm": 5.540316581726074, "learning_rate": 5.501321660524583e-08, "loss": 0.1541, "step": 117500 }, { "epoch": 9.582790091264668, "grad_norm": 3.839772939682007, "learning_rate": 5.292931119451006e-08, "loss": 0.1577, "step": 117600 }, { "epoch": 9.590938722294654, "grad_norm": 4.665050029754639, "learning_rate": 5.088543083132502e-08, "loss": 0.1547, "step": 117700 }, { "epoch": 9.599087353324641, "grad_norm": 18.975759506225586, "learning_rate": 4.888159205202303e-08, "loss": 0.1652, "step": 117800 }, { "epoch": 9.607235984354629, "grad_norm": 13.844809532165527, "learning_rate": 4.691781106897497e-08, "loss": 0.1528, "step": 117900 }, { "epoch": 9.615384615384615, "grad_norm": 5.203334331512451, "learning_rate": 4.499410377045765e-08, "loss": 0.1484, "step": 118000 }, { "epoch": 9.623533246414603, "grad_norm": 17.595108032226562, "learning_rate": 4.311048572052501e-08, "loss": 0.1547, "step": 118100 }, { "epoch": 9.631681877444588, "grad_norm": 10.652242660522461, "learning_rate": 4.1266972158883204e-08, "loss": 0.1658, "step": 118200 }, { "epoch": 9.639830508474576, "grad_norm": 15.711381912231445, "learning_rate": 3.9463578000765724e-08, "loss": 0.1493, "step": 118300 }, { "epoch": 9.647979139504564, "grad_norm": 18.064918518066406, "learning_rate": 3.7700317836814605e-08, "loss": 0.1558, "step": 118400 }, { "epoch": 9.65612777053455, "grad_norm": 11.699357986450195, "learning_rate": 3.5977205932962164e-08, "loss": 0.1465, "step": 118500 }, { "epoch": 9.664276401564537, "grad_norm": 9.775052070617676, "learning_rate": 3.429425623031335e-08, "loss": 0.1456, "step": 118600 }, { "epoch": 9.672425032594525, "grad_norm": 19.886598587036133, "learning_rate": 3.265148234503579e-08, "loss": 0.165, "step": 118700 }, { "epoch": 9.68057366362451, "grad_norm": 13.31386661529541, "learning_rate": 3.104889756824825e-08, "loss": 0.1682, "step": 118800 }, { "epoch": 9.688722294654498, "grad_norm": 16.752405166625977, "learning_rate": 2.9486514865912364e-08, "loss": 0.1498, "step": 118900 }, { "epoch": 9.696870925684484, "grad_norm": 12.920425415039062, "learning_rate": 2.7964346878729952e-08, "loss": 0.1573, "step": 119000 }, { "epoch": 9.705019556714472, "grad_norm": 5.0780110359191895, "learning_rate": 2.64824059220381e-08, "loss": 0.159, "step": 119100 }, { "epoch": 9.71316818774446, "grad_norm": 13.475509643554688, "learning_rate": 2.504070398571201e-08, "loss": 0.1997, "step": 119200 }, { "epoch": 9.721316818774445, "grad_norm": 20.931211471557617, "learning_rate": 2.3639252734065644e-08, "loss": 0.1957, "step": 119300 }, { "epoch": 9.729465449804433, "grad_norm": 20.29063606262207, "learning_rate": 2.227806350575956e-08, "loss": 0.1388, "step": 119400 }, { "epoch": 9.737614080834419, "grad_norm": 0.7664732336997986, "learning_rate": 2.0957147313707127e-08, "loss": 0.166, "step": 119500 }, { "epoch": 9.745762711864407, "grad_norm": 18.868257522583008, "learning_rate": 1.9676514844987338e-08, "loss": 0.1618, "step": 119600 }, { "epoch": 9.753911342894394, "grad_norm": 15.741533279418945, "learning_rate": 1.8436176460756572e-08, "loss": 0.1589, "step": 119700 }, { "epoch": 9.76205997392438, "grad_norm": 11.955362319946289, "learning_rate": 1.723614219616754e-08, "loss": 0.168, "step": 119800 }, { "epoch": 9.770208604954368, "grad_norm": 26.171483993530273, "learning_rate": 1.6076421760283234e-08, "loss": 0.157, "step": 119900 }, { "epoch": 9.778357235984355, "grad_norm": 14.887884140014648, "learning_rate": 1.4957024536003674e-08, "loss": 0.1383, "step": 120000 }, { "epoch": 9.786505867014341, "grad_norm": 9.518312454223633, "learning_rate": 1.3877959579985944e-08, "loss": 0.1385, "step": 120100 }, { "epoch": 9.794654498044329, "grad_norm": 18.155826568603516, "learning_rate": 1.283923562257483e-08, "loss": 0.1623, "step": 120200 }, { "epoch": 9.802803129074315, "grad_norm": 17.2945613861084, "learning_rate": 1.1840861067727306e-08, "loss": 0.1551, "step": 120300 }, { "epoch": 9.810951760104302, "grad_norm": 24.658214569091797, "learning_rate": 1.0882843992949255e-08, "loss": 0.1499, "step": 120400 }, { "epoch": 9.81910039113429, "grad_norm": 6.880736351013184, "learning_rate": 9.9651921492272e-09, "loss": 0.1501, "step": 120500 }, { "epoch": 9.827249022164276, "grad_norm": 25.12505531311035, "learning_rate": 9.087912960967227e-09, "loss": 0.1571, "step": 120600 }, { "epoch": 9.835397653194264, "grad_norm": 28.05438995361328, "learning_rate": 8.251013525932273e-09, "loss": 0.1637, "step": 120700 }, { "epoch": 9.843546284224251, "grad_norm": 10.58689022064209, "learning_rate": 7.454500615188264e-09, "loss": 0.1509, "step": 120800 }, { "epoch": 9.851694915254237, "grad_norm": 24.10919761657715, "learning_rate": 6.698380673048066e-09, "loss": 0.1691, "step": 120900 }, { "epoch": 9.859843546284225, "grad_norm": 0.43672606348991394, "learning_rate": 5.982659817017067e-09, "loss": 0.1746, "step": 121000 }, { "epoch": 9.86799217731421, "grad_norm": 12.899723052978516, "learning_rate": 5.307343837747115e-09, "loss": 0.1497, "step": 121100 }, { "epoch": 9.876140808344198, "grad_norm": 18.292190551757812, "learning_rate": 4.672438198987661e-09, "loss": 0.1594, "step": 121200 }, { "epoch": 9.884289439374186, "grad_norm": 18.396923065185547, "learning_rate": 4.077948037541357e-09, "loss": 0.1574, "step": 121300 }, { "epoch": 9.892438070404172, "grad_norm": 22.605993270874023, "learning_rate": 3.5238781632240813e-09, "loss": 0.1642, "step": 121400 }, { "epoch": 9.90058670143416, "grad_norm": 23.427574157714844, "learning_rate": 3.010233058824419e-09, "loss": 0.1765, "step": 121500 }, { "epoch": 9.908735332464147, "grad_norm": 3.2891268730163574, "learning_rate": 2.5370168800681325e-09, "loss": 0.1743, "step": 121600 }, { "epoch": 9.916883963494133, "grad_norm": 19.58220100402832, "learning_rate": 2.1042334555848585e-09, "loss": 0.1596, "step": 121700 }, { "epoch": 9.92503259452412, "grad_norm": 15.260977745056152, "learning_rate": 1.711886286876463e-09, "loss": 0.1486, "step": 121800 }, { "epoch": 9.933181225554106, "grad_norm": 5.988215446472168, "learning_rate": 1.3599785482881767e-09, "loss": 0.1518, "step": 121900 }, { "epoch": 9.941329856584094, "grad_norm": 5.850574970245361, "learning_rate": 1.0485130869858362e-09, "loss": 0.1588, "step": 122000 }, { "epoch": 9.949478487614082, "grad_norm": 11.288055419921875, "learning_rate": 7.774924229281278e-10, "loss": 0.1734, "step": 122100 }, { "epoch": 9.957627118644067, "grad_norm": 10.142143249511719, "learning_rate": 5.469187488510441e-10, "loss": 0.173, "step": 122200 }, { "epoch": 9.965775749674055, "grad_norm": 14.460721015930176, "learning_rate": 3.5679393024623533e-10, "loss": 0.1603, "step": 122300 }, { "epoch": 9.973924380704041, "grad_norm": 23.698572158813477, "learning_rate": 2.071195053482411e-10, "loss": 0.1616, "step": 122400 }, { "epoch": 9.982073011734029, "grad_norm": 9.158120155334473, "learning_rate": 9.789668512116823e-11, "loss": 0.1702, "step": 122500 }, { "epoch": 9.990221642764016, "grad_norm": 32.79683303833008, "learning_rate": 2.912635325036384e-11, "loss": 0.1718, "step": 122600 }, { "epoch": 9.998370273794002, "grad_norm": 17.385313034057617, "learning_rate": 8.090661318682636e-13, "loss": 0.1656, "step": 122700 }, { "epoch": 10.0, "eval_accuracy": 0.8172690763052208, "eval_loss": 0.7754501700401306, "eval_runtime": 7.1115, "eval_samples_per_second": 350.138, "eval_steps_per_second": 43.873, "step": 122720 }, { "epoch": 10.0, "step": 122720, "total_flos": 1.617427903829713e+17, "train_loss": 0.3309724763735177, "train_runtime": 41426.0671, "train_samples_per_second": 94.796, "train_steps_per_second": 2.962 } ], "logging_steps": 100, "max_steps": 122720, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.617427903829713e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }