{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 1.9420166015625, "accuracy": 0.46875, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 9.666593551635742, "learning_rate": 2.5000000000000004e-07, "loss": 0.7076, "step": 1 }, { "Batch Mean": 1.9312744140625, "accuracy": 0.4375, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 7.076696395874023, "learning_rate": 5.000000000000001e-07, "loss": 0.7073, "step": 2 }, { "Batch Mean": 1.9347686767578125, "accuracy": 0.5, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 6.997435092926025, "learning_rate": 7.5e-07, "loss": 0.6918, "step": 3 }, { "Batch Mean": 1.94329833984375, "accuracy": 0.53125, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 7.863523006439209, "learning_rate": 1.0000000000000002e-06, "loss": 0.6877, "step": 4 }, { "Batch Mean": 1.94287109375, "accuracy": 0.5703125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 6.760534286499023, "learning_rate": 1.25e-06, "loss": 0.6861, "step": 5 }, { "Batch Mean": 1.97332763671875, "accuracy": 0.53125, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 8.08924674987793, "learning_rate": 1.5e-06, "loss": 0.6869, "step": 6 }, { "Batch Mean": 2.018096923828125, "accuracy": 0.46875, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 6.236896991729736, "learning_rate": 1.75e-06, "loss": 0.7053, "step": 7 }, { "Batch Mean": 2.14398193359375, "accuracy": 0.5390625, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 6.7978620529174805, "learning_rate": 2.0000000000000003e-06, "loss": 0.6925, "step": 8 }, { "Batch Mean": 2.214202880859375, "accuracy": 0.4453125, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 7.25023889541626, "learning_rate": 2.25e-06, "loss": 0.7048, "step": 9 }, { "Batch Mean": 2.3272705078125, "accuracy": 0.5859375, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 6.383249759674072, "learning_rate": 2.5e-06, "loss": 0.6815, "step": 10 }, { "Batch Mean": 2.49114990234375, "accuracy": 0.6640625, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 9.219279289245605, "learning_rate": 2.7500000000000004e-06, "loss": 0.6547, "step": 11 }, { "Batch Mean": 2.550567626953125, "accuracy": 0.6328125, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 7.704563617706299, "learning_rate": 3e-06, "loss": 0.6432, "step": 12 }, { "Batch Mean": 2.645782470703125, "accuracy": 0.671875, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 7.639222145080566, "learning_rate": 3.2500000000000002e-06, "loss": 0.6365, "step": 13 }, { "Batch Mean": 2.8451995849609375, "accuracy": 0.6171875, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 13.006075859069824, "learning_rate": 3.5e-06, "loss": 0.659, "step": 14 }, { "Batch Mean": 3.0160064697265625, "accuracy": 0.609375, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 11.644806861877441, "learning_rate": 3.7500000000000005e-06, "loss": 0.6574, "step": 15 }, { "Batch Mean": 3.0444812774658203, "accuracy": 0.671875, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 12.809576034545898, "learning_rate": 4.000000000000001e-06, "loss": 0.6138, "step": 16 }, { "Batch Mean": 3.1425399780273438, "accuracy": 0.6875, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 10.398422241210938, "learning_rate": 4.25e-06, "loss": 0.5373, "step": 17 }, { "Batch Mean": 2.997763156890869, "accuracy": 0.6015625, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 9.611076354980469, "learning_rate": 4.5e-06, "loss": 0.5884, "step": 18 }, { "Batch Mean": 2.822673797607422, "accuracy": 0.6953125, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 11.101300239562988, "learning_rate": 4.75e-06, "loss": 0.5819, "step": 19 }, { "Batch Mean": 2.9509963989257812, "accuracy": 0.6796875, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 11.16808032989502, "learning_rate": 5e-06, "loss": 0.5849, "step": 20 }, { "Batch Mean": 2.7339065074920654, "accuracy": 0.609375, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 13.900754928588867, "learning_rate": 4.986842105263158e-06, "loss": 0.6193, "step": 21 }, { "Batch Mean": 2.5777530670166016, "accuracy": 0.71875, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 9.619379997253418, "learning_rate": 4.973684210526316e-06, "loss": 0.599, "step": 22 }, { "Batch Mean": 2.6994237899780273, "accuracy": 0.65625, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 9.464532852172852, "learning_rate": 4.960526315789474e-06, "loss": 0.6118, "step": 23 }, { "Batch Mean": 2.70928955078125, "accuracy": 0.6484375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 8.05068588256836, "learning_rate": 4.947368421052632e-06, "loss": 0.6214, "step": 24 }, { "Batch Mean": 2.8170089721679688, "accuracy": 0.6796875, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 9.21177864074707, "learning_rate": 4.9342105263157895e-06, "loss": 0.6165, "step": 25 }, { "Batch Mean": 2.7590713500976562, "accuracy": 0.6171875, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 6.078210353851318, "learning_rate": 4.921052631578948e-06, "loss": 0.6027, "step": 26 }, { "Batch Mean": 2.8484420776367188, "accuracy": 0.6796875, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 5.933135032653809, "learning_rate": 4.907894736842106e-06, "loss": 0.5721, "step": 27 }, { "Batch Mean": 2.9572601318359375, "accuracy": 0.6796875, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.4003143310546875, "learning_rate": 4.894736842105264e-06, "loss": 0.5626, "step": 28 }, { "Batch Mean": 2.9868011474609375, "accuracy": 0.6796875, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 4.7936859130859375, "learning_rate": 4.881578947368422e-06, "loss": 0.5809, "step": 29 }, { "Batch Mean": 3.106109619140625, "accuracy": 0.65625, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 5.339725017547607, "learning_rate": 4.8684210526315795e-06, "loss": 0.6132, "step": 30 }, { "Batch Mean": 3.29296875, "accuracy": 0.734375, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 5.850327491760254, "learning_rate": 4.855263157894737e-06, "loss": 0.584, "step": 31 }, { "Batch Mean": 3.51275634765625, "accuracy": 0.65625, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 6.158403396606445, "learning_rate": 4.842105263157895e-06, "loss": 0.5911, "step": 32 }, { "Batch Mean": 3.4871044158935547, "accuracy": 0.71875, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 5.687869071960449, "learning_rate": 4.828947368421053e-06, "loss": 0.5243, "step": 33 }, { "Batch Mean": 3.6311264038085938, "accuracy": 0.8203125, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 6.883860111236572, "learning_rate": 4.815789473684211e-06, "loss": 0.4525, "step": 34 }, { "Batch Mean": 3.747730255126953, "accuracy": 0.6875, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 8.108345031738281, "learning_rate": 4.802631578947369e-06, "loss": 0.5392, "step": 35 }, { "Batch Mean": 4.052231788635254, "accuracy": 0.671875, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 8.877079963684082, "learning_rate": 4.789473684210527e-06, "loss": 0.5759, "step": 36 }, { "Batch Mean": 4.003006458282471, "accuracy": 0.7421875, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 9.152173042297363, "learning_rate": 4.7763157894736844e-06, "loss": 0.5118, "step": 37 }, { "Batch Mean": 4.063720703125, "accuracy": 0.6171875, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 10.134416580200195, "learning_rate": 4.763157894736842e-06, "loss": 0.6653, "step": 38 }, { "Batch Mean": 4.406402587890625, "accuracy": 0.703125, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 7.997857570648193, "learning_rate": 4.75e-06, "loss": 0.525, "step": 39 }, { "Batch Mean": 4.049163818359375, "accuracy": 0.7578125, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 8.24782657623291, "learning_rate": 4.736842105263158e-06, "loss": 0.4568, "step": 40 }, { "Batch Mean": 4.293998718261719, "accuracy": 0.7890625, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 7.328802108764648, "learning_rate": 4.723684210526316e-06, "loss": 0.4618, "step": 41 }, { "Batch Mean": 4.420158386230469, "accuracy": 0.75, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 7.795598030090332, "learning_rate": 4.710526315789474e-06, "loss": 0.5121, "step": 42 }, { "Batch Mean": 4.374538421630859, "accuracy": 0.6953125, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 9.923321723937988, "learning_rate": 4.697368421052632e-06, "loss": 0.5919, "step": 43 }, { "Batch Mean": 4.3918304443359375, "accuracy": 0.78125, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 6.950074672698975, "learning_rate": 4.68421052631579e-06, "loss": 0.499, "step": 44 }, { "Batch Mean": 4.343505859375, "accuracy": 0.7578125, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 7.051249980926514, "learning_rate": 4.671052631578948e-06, "loss": 0.5362, "step": 45 }, { "Batch Mean": 4.268529891967773, "accuracy": 0.7265625, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 7.27433443069458, "learning_rate": 4.657894736842106e-06, "loss": 0.527, "step": 46 }, { "Batch Mean": 4.0211334228515625, "accuracy": 0.796875, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 7.680591106414795, "learning_rate": 4.6447368421052635e-06, "loss": 0.5004, "step": 47 }, { "Batch Mean": 4.125762939453125, "accuracy": 0.7578125, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 6.570550441741943, "learning_rate": 4.631578947368421e-06, "loss": 0.4887, "step": 48 }, { "Batch Mean": 3.82733154296875, "accuracy": 0.6953125, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 6.966435432434082, "learning_rate": 4.618421052631579e-06, "loss": 0.5351, "step": 49 }, { "Batch Mean": 3.548530101776123, "accuracy": 0.6796875, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 9.178398132324219, "learning_rate": 4.605263157894737e-06, "loss": 0.6348, "step": 50 }, { "Batch Mean": 3.5271453857421875, "accuracy": 0.7421875, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 7.21920108795166, "learning_rate": 4.592105263157895e-06, "loss": 0.5138, "step": 51 }, { "Batch Mean": 3.3803443908691406, "accuracy": 0.75, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 6.866484642028809, "learning_rate": 4.578947368421053e-06, "loss": 0.5163, "step": 52 }, { "Batch Mean": 3.42218017578125, "accuracy": 0.734375, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 5.488794803619385, "learning_rate": 4.565789473684211e-06, "loss": 0.5135, "step": 53 }, { "Batch Mean": 3.271286964416504, "accuracy": 0.6484375, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 6.306389808654785, "learning_rate": 4.552631578947369e-06, "loss": 0.5649, "step": 54 }, { "Batch Mean": 3.0698814392089844, "accuracy": 0.7578125, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 6.536716461181641, "learning_rate": 4.539473684210527e-06, "loss": 0.5415, "step": 55 }, { "Batch Mean": 3.088502883911133, "accuracy": 0.75, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 7.343382358551025, "learning_rate": 4.526315789473685e-06, "loss": 0.5206, "step": 56 }, { "Batch Mean": 3.2023324966430664, "accuracy": 0.6796875, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 7.171923637390137, "learning_rate": 4.513157894736843e-06, "loss": 0.5765, "step": 57 }, { "Batch Mean": 2.9620323181152344, "accuracy": 0.75, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 7.703383922576904, "learning_rate": 4.5e-06, "loss": 0.5144, "step": 58 }, { "Batch Mean": 2.995695114135742, "accuracy": 0.734375, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 8.559469223022461, "learning_rate": 4.4868421052631584e-06, "loss": 0.4926, "step": 59 }, { "Batch Mean": 3.2689762115478516, "accuracy": 0.7421875, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 6.663382530212402, "learning_rate": 4.473684210526316e-06, "loss": 0.4705, "step": 60 }, { "Batch Mean": 2.9887285232543945, "accuracy": 0.7265625, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 7.883590221405029, "learning_rate": 4.460526315789474e-06, "loss": 0.5447, "step": 61 }, { "Batch Mean": 2.901198387145996, "accuracy": 0.78125, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 7.087621688842773, "learning_rate": 4.447368421052632e-06, "loss": 0.4382, "step": 62 }, { "Batch Mean": 2.978053569793701, "accuracy": 0.7265625, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 8.361074447631836, "learning_rate": 4.43421052631579e-06, "loss": 0.5379, "step": 63 }, { "Batch Mean": 2.9929721355438232, "accuracy": 0.765625, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 10.397054672241211, "learning_rate": 4.4210526315789476e-06, "loss": 0.489, "step": 64 }, { "Batch Mean": 2.9623823165893555, "accuracy": 0.734375, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 11.552237510681152, "learning_rate": 4.407894736842105e-06, "loss": 0.5394, "step": 65 }, { "Batch Mean": 2.897951602935791, "accuracy": 0.734375, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 9.772311210632324, "learning_rate": 4.394736842105263e-06, "loss": 0.5274, "step": 66 }, { "Batch Mean": 2.8228421211242676, "accuracy": 0.75, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 8.786587715148926, "learning_rate": 4.381578947368421e-06, "loss": 0.5151, "step": 67 }, { "Batch Mean": 2.93172550201416, "accuracy": 0.8125, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 7.272209167480469, "learning_rate": 4.368421052631579e-06, "loss": 0.4259, "step": 68 }, { "Batch Mean": 3.0047380924224854, "accuracy": 0.6953125, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 10.469968795776367, "learning_rate": 4.3552631578947375e-06, "loss": 0.5409, "step": 69 }, { "Batch Mean": 3.2963790893554688, "accuracy": 0.75, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 8.683908462524414, "learning_rate": 4.342105263157895e-06, "loss": 0.4782, "step": 70 }, { "Batch Mean": 3.195692300796509, "accuracy": 0.7734375, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 7.932145118713379, "learning_rate": 4.328947368421053e-06, "loss": 0.4693, "step": 71 }, { "Batch Mean": 3.188863754272461, "accuracy": 0.8125, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 6.82938814163208, "learning_rate": 4.315789473684211e-06, "loss": 0.3917, "step": 72 }, { "Batch Mean": 3.1435837745666504, "accuracy": 0.7421875, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 6.974360942840576, "learning_rate": 4.302631578947369e-06, "loss": 0.49, "step": 73 }, { "Batch Mean": 3.5599288940429688, "accuracy": 0.78125, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 7.391234397888184, "learning_rate": 4.289473684210527e-06, "loss": 0.507, "step": 74 }, { "Batch Mean": 3.1217479705810547, "accuracy": 0.7421875, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 6.925199508666992, "learning_rate": 4.276315789473684e-06, "loss": 0.5203, "step": 75 }, { "Batch Mean": 3.3439712524414062, "accuracy": 0.7109375, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 8.055448532104492, "learning_rate": 4.2631578947368425e-06, "loss": 0.5173, "step": 76 }, { "Batch Mean": 3.372138023376465, "accuracy": 0.7734375, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 7.226714611053467, "learning_rate": 4.25e-06, "loss": 0.487, "step": 77 }, { "Batch Mean": 3.52044677734375, "accuracy": 0.8359375, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 7.280280113220215, "learning_rate": 4.236842105263158e-06, "loss": 0.4553, "step": 78 }, { "Batch Mean": 3.2147364616394043, "accuracy": 0.8359375, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 6.3655781745910645, "learning_rate": 4.223684210526316e-06, "loss": 0.4328, "step": 79 }, { "Batch Mean": 3.4875216484069824, "accuracy": 0.8203125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 6.6272759437561035, "learning_rate": 4.210526315789474e-06, "loss": 0.4119, "step": 80 }, { "Batch Mean": 3.427578926086426, "accuracy": 0.78125, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 8.57567024230957, "learning_rate": 4.197368421052632e-06, "loss": 0.4891, "step": 81 }, { "Batch Mean": 3.7187185287475586, "accuracy": 0.7578125, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.171870708465576, "learning_rate": 4.18421052631579e-06, "loss": 0.439, "step": 82 }, { "Batch Mean": 3.990264892578125, "accuracy": 0.7734375, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 8.19045639038086, "learning_rate": 4.171052631578948e-06, "loss": 0.4541, "step": 83 }, { "Batch Mean": 4.509843826293945, "accuracy": 0.7265625, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 9.934694290161133, "learning_rate": 4.157894736842106e-06, "loss": 0.5258, "step": 84 }, { "Batch Mean": 4.694008827209473, "accuracy": 0.8125, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 7.037469863891602, "learning_rate": 4.144736842105263e-06, "loss": 0.4123, "step": 85 }, { "Batch Mean": 4.4774580001831055, "accuracy": 0.71875, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 9.059208869934082, "learning_rate": 4.1315789473684216e-06, "loss": 0.541, "step": 86 }, { "Batch Mean": 4.323337078094482, "accuracy": 0.7734375, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 7.794407367706299, "learning_rate": 4.118421052631579e-06, "loss": 0.4797, "step": 87 }, { "Batch Mean": 4.874683380126953, "accuracy": 0.7578125, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 9.554245948791504, "learning_rate": 4.105263157894737e-06, "loss": 0.5024, "step": 88 }, { "Batch Mean": 4.5128173828125, "accuracy": 0.8046875, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 7.580309867858887, "learning_rate": 4.092105263157895e-06, "loss": 0.4127, "step": 89 }, { "Batch Mean": 4.122764587402344, "accuracy": 0.7890625, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 7.205588340759277, "learning_rate": 4.078947368421053e-06, "loss": 0.4207, "step": 90 }, { "Batch Mean": 4.157896041870117, "accuracy": 0.765625, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 9.179079055786133, "learning_rate": 4.065789473684211e-06, "loss": 0.505, "step": 91 }, { "Batch Mean": 4.137635231018066, "accuracy": 0.7890625, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 6.633444309234619, "learning_rate": 4.052631578947368e-06, "loss": 0.3927, "step": 92 }, { "Batch Mean": 3.7938804626464844, "accuracy": 0.828125, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 6.795446395874023, "learning_rate": 4.0394736842105265e-06, "loss": 0.4159, "step": 93 }, { "Batch Mean": 3.357052803039551, "accuracy": 0.8046875, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 7.158699989318848, "learning_rate": 4.026315789473684e-06, "loss": 0.4018, "step": 94 }, { "Batch Mean": 3.3779516220092773, "accuracy": 0.75, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 7.249587059020996, "learning_rate": 4.013157894736842e-06, "loss": 0.4374, "step": 95 }, { "Batch Mean": 3.2399396896362305, "accuracy": 0.8046875, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 7.230793476104736, "learning_rate": 4.000000000000001e-06, "loss": 0.4345, "step": 96 }, { "Batch Mean": 3.1136474609375, "accuracy": 0.8046875, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 8.101882934570312, "learning_rate": 3.986842105263158e-06, "loss": 0.4576, "step": 97 }, { "Batch Mean": 3.214869260787964, "accuracy": 0.8125, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 8.142618179321289, "learning_rate": 3.9736842105263165e-06, "loss": 0.4481, "step": 98 }, { "Batch Mean": 3.2699995040893555, "accuracy": 0.8046875, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 8.672248840332031, "learning_rate": 3.960526315789474e-06, "loss": 0.4518, "step": 99 }, { "Batch Mean": 3.287128448486328, "accuracy": 0.7578125, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 9.054008483886719, "learning_rate": 3.947368421052632e-06, "loss": 0.4934, "step": 100 }, { "Batch Mean": 2.6553401947021484, "accuracy": 0.7734375, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 9.374197006225586, "learning_rate": 3.93421052631579e-06, "loss": 0.5046, "step": 101 }, { "Batch Mean": 2.436018943786621, "accuracy": 0.78125, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 9.066594123840332, "learning_rate": 3.921052631578947e-06, "loss": 0.5216, "step": 102 }, { "Batch Mean": 2.2069954872131348, "accuracy": 0.7734375, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 6.94660758972168, "learning_rate": 3.907894736842106e-06, "loss": 0.4278, "step": 103 }, { "Batch Mean": 2.225872755050659, "accuracy": 0.7734375, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 6.518649101257324, "learning_rate": 3.894736842105263e-06, "loss": 0.4473, "step": 104 }, { "Batch Mean": 2.217318058013916, "accuracy": 0.796875, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 6.582048416137695, "learning_rate": 3.8815789473684214e-06, "loss": 0.4356, "step": 105 }, { "Batch Mean": 1.917119026184082, "accuracy": 0.828125, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 7.497337818145752, "learning_rate": 3.868421052631579e-06, "loss": 0.3825, "step": 106 }, { "Batch Mean": 1.5780539512634277, "accuracy": 0.7578125, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 8.559232711791992, "learning_rate": 3.855263157894737e-06, "loss": 0.4984, "step": 107 }, { "Batch Mean": 1.9931042194366455, "accuracy": 0.8203125, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 6.91691780090332, "learning_rate": 3.842105263157895e-06, "loss": 0.401, "step": 108 }, { "Batch Mean": 2.131106376647949, "accuracy": 0.8046875, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 6.739565849304199, "learning_rate": 3.828947368421053e-06, "loss": 0.3582, "step": 109 }, { "Batch Mean": 2.5897467136383057, "accuracy": 0.8671875, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 8.209200859069824, "learning_rate": 3.815789473684211e-06, "loss": 0.3216, "step": 110 }, { "Batch Mean": 2.6740541458129883, "accuracy": 0.75, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 10.885510444641113, "learning_rate": 3.802631578947369e-06, "loss": 0.5196, "step": 111 }, { "Batch Mean": 2.9480109214782715, "accuracy": 0.703125, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 13.55850887298584, "learning_rate": 3.789473684210527e-06, "loss": 0.5931, "step": 112 }, { "Batch Mean": 3.3308558464050293, "accuracy": 0.8515625, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 9.246851921081543, "learning_rate": 3.7763157894736847e-06, "loss": 0.3901, "step": 113 }, { "Batch Mean": 2.8297948837280273, "accuracy": 0.7890625, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 9.644515037536621, "learning_rate": 3.7631578947368426e-06, "loss": 0.4573, "step": 114 }, { "Batch Mean": 3.7451376914978027, "accuracy": 0.7265625, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 12.58398151397705, "learning_rate": 3.7500000000000005e-06, "loss": 0.5545, "step": 115 }, { "Batch Mean": 3.6171021461486816, "accuracy": 0.7890625, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 9.254029273986816, "learning_rate": 3.736842105263158e-06, "loss": 0.4333, "step": 116 }, { "Batch Mean": 2.7716548442840576, "accuracy": 0.8046875, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 10.669205665588379, "learning_rate": 3.723684210526316e-06, "loss": 0.4602, "step": 117 }, { "Batch Mean": 2.781661033630371, "accuracy": 0.7734375, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 10.094223976135254, "learning_rate": 3.710526315789474e-06, "loss": 0.528, "step": 118 }, { "Batch Mean": 2.198807716369629, "accuracy": 0.7578125, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 7.635989189147949, "learning_rate": 3.6973684210526317e-06, "loss": 0.4268, "step": 119 }, { "Batch Mean": 2.0033488273620605, "accuracy": 0.7734375, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 7.30623197555542, "learning_rate": 3.6842105263157896e-06, "loss": 0.4528, "step": 120 }, { "Batch Mean": 1.8473689556121826, "accuracy": 0.859375, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 6.51129674911499, "learning_rate": 3.6710526315789476e-06, "loss": 0.4022, "step": 121 }, { "Batch Mean": 1.650212287902832, "accuracy": 0.78125, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 7.191035270690918, "learning_rate": 3.657894736842106e-06, "loss": 0.4487, "step": 122 }, { "Batch Mean": 1.7975679636001587, "accuracy": 0.7890625, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 6.7035651206970215, "learning_rate": 3.644736842105264e-06, "loss": 0.4338, "step": 123 }, { "Batch Mean": 1.5073213577270508, "accuracy": 0.8125, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 6.955989837646484, "learning_rate": 3.6315789473684217e-06, "loss": 0.4144, "step": 124 }, { "Batch Mean": 1.7353585958480835, "accuracy": 0.75, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 8.312121391296387, "learning_rate": 3.618421052631579e-06, "loss": 0.5017, "step": 125 }, { "Batch Mean": 1.718948245048523, "accuracy": 0.734375, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 8.334877967834473, "learning_rate": 3.605263157894737e-06, "loss": 0.4731, "step": 126 }, { "Batch Mean": 1.7114816904067993, "accuracy": 0.8359375, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 7.221029281616211, "learning_rate": 3.592105263157895e-06, "loss": 0.3843, "step": 127 }, { "Batch Mean": 1.466775894165039, "accuracy": 0.7578125, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 9.671870231628418, "learning_rate": 3.578947368421053e-06, "loss": 0.517, "step": 128 }, { "Batch Mean": 2.0674805641174316, "accuracy": 0.8125, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 7.535254955291748, "learning_rate": 3.565789473684211e-06, "loss": 0.4002, "step": 129 }, { "Batch Mean": 1.9499692916870117, "accuracy": 0.8046875, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.19841480255127, "learning_rate": 3.5526315789473687e-06, "loss": 0.4101, "step": 130 }, { "Batch Mean": 1.8141679763793945, "accuracy": 0.7890625, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 8.877778053283691, "learning_rate": 3.5394736842105266e-06, "loss": 0.458, "step": 131 }, { "Batch Mean": 2.349834680557251, "accuracy": 0.8046875, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 7.818430423736572, "learning_rate": 3.5263157894736846e-06, "loss": 0.3948, "step": 132 }, { "Batch Mean": 2.4295411109924316, "accuracy": 0.765625, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 10.082527160644531, "learning_rate": 3.513157894736842e-06, "loss": 0.4616, "step": 133 }, { "Batch Mean": 2.609513282775879, "accuracy": 0.8359375, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 8.784564971923828, "learning_rate": 3.5e-06, "loss": 0.3703, "step": 134 }, { "Batch Mean": 2.7128183841705322, "accuracy": 0.7890625, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 9.766796112060547, "learning_rate": 3.486842105263158e-06, "loss": 0.3847, "step": 135 }, { "Batch Mean": 3.1417746543884277, "accuracy": 0.7265625, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 11.59825325012207, "learning_rate": 3.473684210526316e-06, "loss": 0.5143, "step": 136 }, { "Batch Mean": 2.894944190979004, "accuracy": 0.7421875, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 12.137927055358887, "learning_rate": 3.460526315789474e-06, "loss": 0.5355, "step": 137 }, { "Batch Mean": 2.7786073684692383, "accuracy": 0.7890625, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 9.075066566467285, "learning_rate": 3.447368421052632e-06, "loss": 0.4115, "step": 138 }, { "Batch Mean": 3.0002739429473877, "accuracy": 0.734375, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 10.028230667114258, "learning_rate": 3.43421052631579e-06, "loss": 0.4722, "step": 139 }, { "Batch Mean": 2.4786624908447266, "accuracy": 0.765625, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 9.266149520874023, "learning_rate": 3.421052631578948e-06, "loss": 0.4656, "step": 140 }, { "Batch Mean": 2.1704983711242676, "accuracy": 0.8125, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 6.836582660675049, "learning_rate": 3.4078947368421057e-06, "loss": 0.3686, "step": 141 }, { "Batch Mean": 2.178023338317871, "accuracy": 0.828125, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 7.482564449310303, "learning_rate": 3.3947368421052636e-06, "loss": 0.4434, "step": 142 }, { "Batch Mean": 2.207672595977783, "accuracy": 0.8359375, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 9.649080276489258, "learning_rate": 3.381578947368421e-06, "loss": 0.4042, "step": 143 }, { "Batch Mean": 2.477532386779785, "accuracy": 0.8046875, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 6.769596576690674, "learning_rate": 3.368421052631579e-06, "loss": 0.3742, "step": 144 }, { "Batch Mean": 2.3647336959838867, "accuracy": 0.7578125, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 9.769651412963867, "learning_rate": 3.355263157894737e-06, "loss": 0.4972, "step": 145 }, { "Batch Mean": 2.282806396484375, "accuracy": 0.765625, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 9.153217315673828, "learning_rate": 3.342105263157895e-06, "loss": 0.4581, "step": 146 }, { "Batch Mean": 2.2191848754882812, "accuracy": 0.7421875, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 7.448163032531738, "learning_rate": 3.3289473684210528e-06, "loss": 0.4751, "step": 147 }, { "Batch Mean": 2.034508228302002, "accuracy": 0.7109375, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 8.64769458770752, "learning_rate": 3.3157894736842107e-06, "loss": 0.55, "step": 148 }, { "Batch Mean": 1.7862319946289062, "accuracy": 0.859375, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 6.058819770812988, "learning_rate": 3.302631578947369e-06, "loss": 0.3536, "step": 149 }, { "Batch Mean": 1.8278677463531494, "accuracy": 0.8046875, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 7.812012672424316, "learning_rate": 3.289473684210527e-06, "loss": 0.4398, "step": 150 }, { "Batch Mean": 1.9891557693481445, "accuracy": 0.8203125, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 7.331231117248535, "learning_rate": 3.276315789473685e-06, "loss": 0.4227, "step": 151 }, { "Batch Mean": 1.683842658996582, "accuracy": 0.7578125, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 6.605899333953857, "learning_rate": 3.2631578947368423e-06, "loss": 0.4956, "step": 152 }, { "Batch Mean": 1.8491768836975098, "accuracy": 0.8203125, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 6.61679744720459, "learning_rate": 3.2500000000000002e-06, "loss": 0.3815, "step": 153 }, { "Batch Mean": 1.5500710010528564, "accuracy": 0.8203125, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 7.707108497619629, "learning_rate": 3.236842105263158e-06, "loss": 0.4151, "step": 154 }, { "Batch Mean": 1.4066014289855957, "accuracy": 0.7890625, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 7.169893741607666, "learning_rate": 3.223684210526316e-06, "loss": 0.4474, "step": 155 }, { "Batch Mean": 1.7346997261047363, "accuracy": 0.7734375, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 10.779695510864258, "learning_rate": 3.210526315789474e-06, "loss": 0.4299, "step": 156 }, { "Batch Mean": 1.810314655303955, "accuracy": 0.796875, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 7.573057174682617, "learning_rate": 3.197368421052632e-06, "loss": 0.4399, "step": 157 }, { "Batch Mean": 1.7872644662857056, "accuracy": 0.7734375, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 7.816164493560791, "learning_rate": 3.1842105263157898e-06, "loss": 0.4131, "step": 158 }, { "Batch Mean": 1.804029941558838, "accuracy": 0.765625, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 8.655269622802734, "learning_rate": 3.1710526315789477e-06, "loss": 0.4737, "step": 159 }, { "Batch Mean": 1.565956711769104, "accuracy": 0.796875, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 7.699905872344971, "learning_rate": 3.157894736842105e-06, "loss": 0.4022, "step": 160 }, { "Batch Mean": 1.9926738739013672, "accuracy": 0.7734375, "epoch": 0.4, "step": 160 }, { "epoch": 0.4025, "grad_norm": 8.933215141296387, "learning_rate": 3.144736842105263e-06, "loss": 0.4674, "step": 161 }, { "Batch Mean": 1.7887498140335083, "accuracy": 0.7421875, "epoch": 0.4025, "step": 161 }, { "epoch": 0.405, "grad_norm": 9.933899879455566, "learning_rate": 3.131578947368421e-06, "loss": 0.459, "step": 162 }, { "Batch Mean": 1.8328676223754883, "accuracy": 0.828125, "epoch": 0.405, "step": 162 }, { "epoch": 0.4075, "grad_norm": 7.8370561599731445, "learning_rate": 3.1184210526315793e-06, "loss": 0.413, "step": 163 }, { "Batch Mean": 1.4974079132080078, "accuracy": 0.828125, "epoch": 0.4075, "step": 163 }, { "epoch": 0.41, "grad_norm": 9.378034591674805, "learning_rate": 3.1052631578947372e-06, "loss": 0.3895, "step": 164 }, { "Batch Mean": 1.8256797790527344, "accuracy": 0.75, "epoch": 0.41, "step": 164 }, { "epoch": 0.4125, "grad_norm": 9.905291557312012, "learning_rate": 3.092105263157895e-06, "loss": 0.4858, "step": 165 }, { "Batch Mean": 1.5948219299316406, "accuracy": 0.8046875, "epoch": 0.4125, "step": 165 }, { "epoch": 0.415, "grad_norm": 7.982060432434082, "learning_rate": 3.078947368421053e-06, "loss": 0.3709, "step": 166 }, { "Batch Mean": 1.8082324266433716, "accuracy": 0.8125, "epoch": 0.415, "step": 166 }, { "epoch": 0.4175, "grad_norm": 10.082322120666504, "learning_rate": 3.065789473684211e-06, "loss": 0.4202, "step": 167 }, { "Batch Mean": 1.3404994010925293, "accuracy": 0.8046875, "epoch": 0.4175, "step": 167 }, { "epoch": 0.42, "grad_norm": 10.225940704345703, "learning_rate": 3.052631578947369e-06, "loss": 0.5083, "step": 168 }, { "Batch Mean": 1.620276927947998, "accuracy": 0.796875, "epoch": 0.42, "step": 168 }, { "epoch": 0.4225, "grad_norm": 9.665302276611328, "learning_rate": 3.0394736842105268e-06, "loss": 0.4014, "step": 169 }, { "Batch Mean": 1.4625792503356934, "accuracy": 0.7890625, "epoch": 0.4225, "step": 169 }, { "epoch": 0.425, "grad_norm": 10.56104850769043, "learning_rate": 3.0263157894736843e-06, "loss": 0.5039, "step": 170 }, { "Batch Mean": 1.3835270404815674, "accuracy": 0.8671875, "epoch": 0.425, "step": 170 }, { "epoch": 0.4275, "grad_norm": 6.6405487060546875, "learning_rate": 3.013157894736842e-06, "loss": 0.2855, "step": 171 }, { "Batch Mean": 1.3700923919677734, "accuracy": 0.8125, "epoch": 0.4275, "step": 171 }, { "epoch": 0.43, "grad_norm": 7.546975612640381, "learning_rate": 3e-06, "loss": 0.3914, "step": 172 }, { "Batch Mean": 1.0351073741912842, "accuracy": 0.75, "epoch": 0.43, "step": 172 }, { "epoch": 0.4325, "grad_norm": 7.8717851638793945, "learning_rate": 2.986842105263158e-06, "loss": 0.4362, "step": 173 }, { "Batch Mean": 1.3376412391662598, "accuracy": 0.8359375, "epoch": 0.4325, "step": 173 }, { "epoch": 0.435, "grad_norm": 8.62287425994873, "learning_rate": 2.973684210526316e-06, "loss": 0.4212, "step": 174 }, { "Batch Mean": 1.219254493713379, "accuracy": 0.71875, "epoch": 0.435, "step": 174 }, { "epoch": 0.4375, "grad_norm": 11.215203285217285, "learning_rate": 2.960526315789474e-06, "loss": 0.5043, "step": 175 }, { "Batch Mean": 0.7928104400634766, "accuracy": 0.7578125, "epoch": 0.4375, "step": 175 }, { "epoch": 0.44, "grad_norm": 8.945316314697266, "learning_rate": 2.9473684210526317e-06, "loss": 0.4956, "step": 176 }, { "Batch Mean": 0.7014296650886536, "accuracy": 0.7578125, "epoch": 0.44, "step": 176 }, { "epoch": 0.4425, "grad_norm": 9.637972831726074, "learning_rate": 2.93421052631579e-06, "loss": 0.4785, "step": 177 }, { "Batch Mean": 1.184436321258545, "accuracy": 0.8046875, "epoch": 0.4425, "step": 177 }, { "epoch": 0.445, "grad_norm": 7.308509826660156, "learning_rate": 2.921052631578948e-06, "loss": 0.419, "step": 178 }, { "Batch Mean": 0.6965446472167969, "accuracy": 0.8125, "epoch": 0.445, "step": 178 }, { "epoch": 0.4475, "grad_norm": 8.029718399047852, "learning_rate": 2.907894736842106e-06, "loss": 0.4325, "step": 179 }, { "Batch Mean": 0.39546746015548706, "accuracy": 0.796875, "epoch": 0.4475, "step": 179 }, { "epoch": 0.45, "grad_norm": 7.985748291015625, "learning_rate": 2.8947368421052634e-06, "loss": 0.4038, "step": 180 }, { "Batch Mean": 0.5008649826049805, "accuracy": 0.8203125, "epoch": 0.45, "step": 180 }, { "epoch": 0.4525, "grad_norm": 7.97329044342041, "learning_rate": 2.8815789473684213e-06, "loss": 0.3997, "step": 181 }, { "Batch Mean": 0.5764102935791016, "accuracy": 0.8125, "epoch": 0.4525, "step": 181 }, { "epoch": 0.455, "grad_norm": 8.872234344482422, "learning_rate": 2.868421052631579e-06, "loss": 0.454, "step": 182 }, { "Batch Mean": 1.0936000347137451, "accuracy": 0.828125, "epoch": 0.455, "step": 182 }, { "epoch": 0.4575, "grad_norm": 8.255775451660156, "learning_rate": 2.855263157894737e-06, "loss": 0.4011, "step": 183 }, { "Batch Mean": 1.0273089408874512, "accuracy": 0.78125, "epoch": 0.4575, "step": 183 }, { "epoch": 0.46, "grad_norm": 9.933390617370605, "learning_rate": 2.842105263157895e-06, "loss": 0.4856, "step": 184 }, { "Batch Mean": 1.2797164916992188, "accuracy": 0.8359375, "epoch": 0.46, "step": 184 }, { "epoch": 0.4625, "grad_norm": 8.756768226623535, "learning_rate": 2.828947368421053e-06, "loss": 0.3929, "step": 185 }, { "Batch Mean": 1.3405977487564087, "accuracy": 0.8046875, "epoch": 0.4625, "step": 185 }, { "epoch": 0.465, "grad_norm": 8.689950942993164, "learning_rate": 2.815789473684211e-06, "loss": 0.4513, "step": 186 }, { "Batch Mean": 1.369067668914795, "accuracy": 0.7890625, "epoch": 0.465, "step": 186 }, { "epoch": 0.4675, "grad_norm": 8.023719787597656, "learning_rate": 2.8026315789473683e-06, "loss": 0.4026, "step": 187 }, { "Batch Mean": 2.006643295288086, "accuracy": 0.7578125, "epoch": 0.4675, "step": 187 }, { "epoch": 0.47, "grad_norm": 10.34017276763916, "learning_rate": 2.789473684210526e-06, "loss": 0.4653, "step": 188 }, { "Batch Mean": 2.0173683166503906, "accuracy": 0.8203125, "epoch": 0.47, "step": 188 }, { "epoch": 0.4725, "grad_norm": 9.120015144348145, "learning_rate": 2.776315789473684e-06, "loss": 0.4107, "step": 189 }, { "Batch Mean": 1.916172981262207, "accuracy": 0.8515625, "epoch": 0.4725, "step": 189 }, { "epoch": 0.475, "grad_norm": 7.870621681213379, "learning_rate": 2.7631578947368424e-06, "loss": 0.4053, "step": 190 }, { "Batch Mean": 2.2058982849121094, "accuracy": 0.765625, "epoch": 0.475, "step": 190 }, { "epoch": 0.4775, "grad_norm": 11.424184799194336, "learning_rate": 2.7500000000000004e-06, "loss": 0.512, "step": 191 }, { "Batch Mean": 2.4413208961486816, "accuracy": 0.84375, "epoch": 0.4775, "step": 191 }, { "epoch": 0.48, "grad_norm": 7.611173152923584, "learning_rate": 2.7368421052631583e-06, "loss": 0.3401, "step": 192 }, { "Batch Mean": 2.9837169647216797, "accuracy": 0.8125, "epoch": 0.48, "step": 192 }, { "epoch": 0.4825, "grad_norm": 7.9044413566589355, "learning_rate": 2.723684210526316e-06, "loss": 0.3885, "step": 193 }, { "Batch Mean": 2.2724387645721436, "accuracy": 0.8046875, "epoch": 0.4825, "step": 193 }, { "epoch": 0.485, "grad_norm": 7.795435905456543, "learning_rate": 2.710526315789474e-06, "loss": 0.4036, "step": 194 }, { "Batch Mean": 2.2937026023864746, "accuracy": 0.71875, "epoch": 0.485, "step": 194 }, { "epoch": 0.4875, "grad_norm": 8.691864013671875, "learning_rate": 2.697368421052632e-06, "loss": 0.4933, "step": 195 }, { "Batch Mean": 2.3943445682525635, "accuracy": 0.8359375, "epoch": 0.4875, "step": 195 }, { "epoch": 0.49, "grad_norm": 7.490312576293945, "learning_rate": 2.68421052631579e-06, "loss": 0.4102, "step": 196 }, { "Batch Mean": 1.93037748336792, "accuracy": 0.7578125, "epoch": 0.49, "step": 196 }, { "epoch": 0.4925, "grad_norm": 7.2265729904174805, "learning_rate": 2.6710526315789474e-06, "loss": 0.4356, "step": 197 }, { "Batch Mean": 2.346465587615967, "accuracy": 0.7578125, "epoch": 0.4925, "step": 197 }, { "epoch": 0.495, "grad_norm": 8.016980171203613, "learning_rate": 2.6578947368421053e-06, "loss": 0.4563, "step": 198 }, { "Batch Mean": 2.213311195373535, "accuracy": 0.7421875, "epoch": 0.495, "step": 198 }, { "epoch": 0.4975, "grad_norm": 7.577280521392822, "learning_rate": 2.644736842105263e-06, "loss": 0.4736, "step": 199 }, { "Batch Mean": 1.7689447402954102, "accuracy": 0.75, "epoch": 0.4975, "step": 199 }, { "epoch": 0.5, "grad_norm": 6.788050651550293, "learning_rate": 2.631578947368421e-06, "loss": 0.4655, "step": 200 }, { "Batch Mean": 1.7899178266525269, "accuracy": 0.8203125, "epoch": 0.5, "step": 200 }, { "epoch": 0.5025, "grad_norm": 6.991423606872559, "learning_rate": 2.618421052631579e-06, "loss": 0.4277, "step": 201 }, { "Batch Mean": 1.6013970375061035, "accuracy": 0.7734375, "epoch": 0.5025, "step": 201 }, { "epoch": 0.505, "grad_norm": 6.626009464263916, "learning_rate": 2.605263157894737e-06, "loss": 0.4654, "step": 202 }, { "Batch Mean": 1.734081745147705, "accuracy": 0.828125, "epoch": 0.505, "step": 202 }, { "epoch": 0.5075, "grad_norm": 6.0885725021362305, "learning_rate": 2.592105263157895e-06, "loss": 0.3811, "step": 203 }, { "Batch Mean": 1.5769119262695312, "accuracy": 0.765625, "epoch": 0.5075, "step": 203 }, { "epoch": 0.51, "grad_norm": 7.119544982910156, "learning_rate": 2.578947368421053e-06, "loss": 0.4682, "step": 204 }, { "Batch Mean": 1.2697408199310303, "accuracy": 0.7890625, "epoch": 0.51, "step": 204 }, { "epoch": 0.5125, "grad_norm": 7.171177387237549, "learning_rate": 2.565789473684211e-06, "loss": 0.4633, "step": 205 }, { "Batch Mean": 1.5697135925292969, "accuracy": 0.8203125, "epoch": 0.5125, "step": 205 }, { "epoch": 0.515, "grad_norm": 7.186983585357666, "learning_rate": 2.552631578947369e-06, "loss": 0.4177, "step": 206 }, { "Batch Mean": 1.5726741552352905, "accuracy": 0.78125, "epoch": 0.515, "step": 206 }, { "epoch": 0.5175, "grad_norm": 7.522491931915283, "learning_rate": 2.5394736842105265e-06, "loss": 0.4629, "step": 207 }, { "Batch Mean": 1.6065094470977783, "accuracy": 0.828125, "epoch": 0.5175, "step": 207 }, { "epoch": 0.52, "grad_norm": 7.612408638000488, "learning_rate": 2.5263157894736844e-06, "loss": 0.3536, "step": 208 }, { "Batch Mean": 2.1261379718780518, "accuracy": 0.8125, "epoch": 0.52, "step": 208 }, { "epoch": 0.5225, "grad_norm": 7.919214725494385, "learning_rate": 2.5131578947368423e-06, "loss": 0.4265, "step": 209 }, { "Batch Mean": 2.403383493423462, "accuracy": 0.7578125, "epoch": 0.5225, "step": 209 }, { "epoch": 0.525, "grad_norm": 8.452061653137207, "learning_rate": 2.5e-06, "loss": 0.4476, "step": 210 }, { "Batch Mean": 2.177506446838379, "accuracy": 0.7734375, "epoch": 0.525, "step": 210 }, { "epoch": 0.5275, "grad_norm": 10.394681930541992, "learning_rate": 2.486842105263158e-06, "loss": 0.5057, "step": 211 }, { "Batch Mean": 2.585557222366333, "accuracy": 0.8125, "epoch": 0.5275, "step": 211 }, { "epoch": 0.53, "grad_norm": 8.910456657409668, "learning_rate": 2.473684210526316e-06, "loss": 0.4257, "step": 212 }, { "Batch Mean": 2.618654251098633, "accuracy": 0.7890625, "epoch": 0.53, "step": 212 }, { "epoch": 0.5325, "grad_norm": 8.461559295654297, "learning_rate": 2.460526315789474e-06, "loss": 0.4123, "step": 213 }, { "Batch Mean": 2.872316360473633, "accuracy": 0.734375, "epoch": 0.5325, "step": 213 }, { "epoch": 0.535, "grad_norm": 12.725778579711914, "learning_rate": 2.447368421052632e-06, "loss": 0.5599, "step": 214 }, { "Batch Mean": 3.0026845932006836, "accuracy": 0.8515625, "epoch": 0.535, "step": 214 }, { "epoch": 0.5375, "grad_norm": 9.596014976501465, "learning_rate": 2.4342105263157898e-06, "loss": 0.4411, "step": 215 }, { "Batch Mean": 2.897749662399292, "accuracy": 0.78125, "epoch": 0.5375, "step": 215 }, { "epoch": 0.54, "grad_norm": 8.899314880371094, "learning_rate": 2.4210526315789477e-06, "loss": 0.3853, "step": 216 }, { "Batch Mean": 2.9906558990478516, "accuracy": 0.765625, "epoch": 0.54, "step": 216 }, { "epoch": 0.5425, "grad_norm": 10.041382789611816, "learning_rate": 2.4078947368421056e-06, "loss": 0.4807, "step": 217 }, { "Batch Mean": 3.049136161804199, "accuracy": 0.8359375, "epoch": 0.5425, "step": 217 }, { "epoch": 0.545, "grad_norm": 9.229241371154785, "learning_rate": 2.3947368421052635e-06, "loss": 0.3642, "step": 218 }, { "Batch Mean": 2.9170212745666504, "accuracy": 0.8046875, "epoch": 0.545, "step": 218 }, { "epoch": 0.5475, "grad_norm": 9.190288543701172, "learning_rate": 2.381578947368421e-06, "loss": 0.3924, "step": 219 }, { "Batch Mean": 2.6925058364868164, "accuracy": 0.7890625, "epoch": 0.5475, "step": 219 }, { "epoch": 0.55, "grad_norm": 10.271293640136719, "learning_rate": 2.368421052631579e-06, "loss": 0.4699, "step": 220 }, { "Batch Mean": 2.789365530014038, "accuracy": 0.78125, "epoch": 0.55, "step": 220 }, { "epoch": 0.5525, "grad_norm": 9.16843032836914, "learning_rate": 2.355263157894737e-06, "loss": 0.3856, "step": 221 }, { "Batch Mean": 2.4267354011535645, "accuracy": 0.8125, "epoch": 0.5525, "step": 221 }, { "epoch": 0.555, "grad_norm": 9.069971084594727, "learning_rate": 2.342105263157895e-06, "loss": 0.4526, "step": 222 }, { "Batch Mean": 2.2059507369995117, "accuracy": 0.8046875, "epoch": 0.555, "step": 222 }, { "epoch": 0.5575, "grad_norm": 9.028841018676758, "learning_rate": 2.328947368421053e-06, "loss": 0.41, "step": 223 }, { "Batch Mean": 1.7158894538879395, "accuracy": 0.8125, "epoch": 0.5575, "step": 223 }, { "epoch": 0.56, "grad_norm": 8.238944053649902, "learning_rate": 2.3157894736842105e-06, "loss": 0.4087, "step": 224 }, { "Batch Mean": 1.857762336730957, "accuracy": 0.796875, "epoch": 0.56, "step": 224 }, { "epoch": 0.5625, "grad_norm": 7.938877105712891, "learning_rate": 2.3026315789473684e-06, "loss": 0.418, "step": 225 }, { "Batch Mean": 2.1304166316986084, "accuracy": 0.734375, "epoch": 0.5625, "step": 225 }, { "epoch": 0.565, "grad_norm": 10.606386184692383, "learning_rate": 2.2894736842105263e-06, "loss": 0.4973, "step": 226 }, { "Batch Mean": 2.0900015830993652, "accuracy": 0.734375, "epoch": 0.565, "step": 226 }, { "epoch": 0.5675, "grad_norm": 9.365285873413086, "learning_rate": 2.2763157894736847e-06, "loss": 0.49, "step": 227 }, { "Batch Mean": 1.7396864891052246, "accuracy": 0.7890625, "epoch": 0.5675, "step": 227 }, { "epoch": 0.57, "grad_norm": 9.352794647216797, "learning_rate": 2.2631578947368426e-06, "loss": 0.4665, "step": 228 }, { "Batch Mean": 1.4743752479553223, "accuracy": 0.8046875, "epoch": 0.57, "step": 228 }, { "epoch": 0.5725, "grad_norm": 8.720419883728027, "learning_rate": 2.25e-06, "loss": 0.433, "step": 229 }, { "Batch Mean": 1.5897538661956787, "accuracy": 0.8046875, "epoch": 0.5725, "step": 229 }, { "epoch": 0.575, "grad_norm": 8.863330841064453, "learning_rate": 2.236842105263158e-06, "loss": 0.4247, "step": 230 }, { "Batch Mean": 1.2378873825073242, "accuracy": 0.7578125, "epoch": 0.575, "step": 230 }, { "epoch": 0.5775, "grad_norm": 8.975724220275879, "learning_rate": 2.223684210526316e-06, "loss": 0.4741, "step": 231 }, { "Batch Mean": 1.3916802406311035, "accuracy": 0.7578125, "epoch": 0.5775, "step": 231 }, { "epoch": 0.58, "grad_norm": 8.523025512695312, "learning_rate": 2.2105263157894738e-06, "loss": 0.4634, "step": 232 }, { "Batch Mean": 1.4983201026916504, "accuracy": 0.7421875, "epoch": 0.58, "step": 232 }, { "epoch": 0.5825, "grad_norm": 9.57751178741455, "learning_rate": 2.1973684210526317e-06, "loss": 0.5449, "step": 233 }, { "Batch Mean": 1.2628620862960815, "accuracy": 0.765625, "epoch": 0.5825, "step": 233 }, { "epoch": 0.585, "grad_norm": 8.061960220336914, "learning_rate": 2.1842105263157896e-06, "loss": 0.4441, "step": 234 }, { "Batch Mean": 1.1907455921173096, "accuracy": 0.8515625, "epoch": 0.585, "step": 234 }, { "epoch": 0.5875, "grad_norm": 6.615776538848877, "learning_rate": 2.1710526315789475e-06, "loss": 0.3421, "step": 235 }, { "Batch Mean": 1.1369013786315918, "accuracy": 0.8515625, "epoch": 0.5875, "step": 235 }, { "epoch": 0.59, "grad_norm": 7.37549352645874, "learning_rate": 2.1578947368421054e-06, "loss": 0.3825, "step": 236 }, { "Batch Mean": 1.1984944343566895, "accuracy": 0.7890625, "epoch": 0.59, "step": 236 }, { "epoch": 0.5925, "grad_norm": 7.898306846618652, "learning_rate": 2.1447368421052633e-06, "loss": 0.4196, "step": 237 }, { "Batch Mean": 0.9305081367492676, "accuracy": 0.796875, "epoch": 0.5925, "step": 237 }, { "epoch": 0.595, "grad_norm": 7.740181922912598, "learning_rate": 2.1315789473684212e-06, "loss": 0.4401, "step": 238 }, { "Batch Mean": 1.1609792709350586, "accuracy": 0.7578125, "epoch": 0.595, "step": 238 }, { "epoch": 0.5975, "grad_norm": 7.328685283660889, "learning_rate": 2.118421052631579e-06, "loss": 0.3894, "step": 239 }, { "Batch Mean": 1.2955622673034668, "accuracy": 0.7890625, "epoch": 0.5975, "step": 239 }, { "epoch": 0.6, "grad_norm": 7.802572250366211, "learning_rate": 2.105263157894737e-06, "loss": 0.3676, "step": 240 }, { "Batch Mean": 1.6187195777893066, "accuracy": 0.8046875, "epoch": 0.6, "step": 240 }, { "epoch": 0.6025, "grad_norm": 9.184107780456543, "learning_rate": 2.092105263157895e-06, "loss": 0.4378, "step": 241 }, { "Batch Mean": 1.7035493850708008, "accuracy": 0.765625, "epoch": 0.6025, "step": 241 }, { "epoch": 0.605, "grad_norm": 8.614889144897461, "learning_rate": 2.078947368421053e-06, "loss": 0.4155, "step": 242 }, { "Batch Mean": 1.7622737884521484, "accuracy": 0.78125, "epoch": 0.605, "step": 242 }, { "epoch": 0.6075, "grad_norm": 9.022790908813477, "learning_rate": 2.0657894736842108e-06, "loss": 0.435, "step": 243 }, { "Batch Mean": 1.6522586345672607, "accuracy": 0.7578125, "epoch": 0.6075, "step": 243 }, { "epoch": 0.61, "grad_norm": 9.872020721435547, "learning_rate": 2.0526315789473687e-06, "loss": 0.4509, "step": 244 }, { "Batch Mean": 1.6742737293243408, "accuracy": 0.796875, "epoch": 0.61, "step": 244 }, { "epoch": 0.6125, "grad_norm": 9.214981079101562, "learning_rate": 2.0394736842105266e-06, "loss": 0.4094, "step": 245 }, { "Batch Mean": 1.6322050094604492, "accuracy": 0.7890625, "epoch": 0.6125, "step": 245 }, { "epoch": 0.615, "grad_norm": 9.1509428024292, "learning_rate": 2.026315789473684e-06, "loss": 0.4213, "step": 246 }, { "Batch Mean": 1.2047111988067627, "accuracy": 0.8046875, "epoch": 0.615, "step": 246 }, { "epoch": 0.6175, "grad_norm": 10.241280555725098, "learning_rate": 2.013157894736842e-06, "loss": 0.4428, "step": 247 }, { "Batch Mean": 1.6580332517623901, "accuracy": 0.78125, "epoch": 0.6175, "step": 247 }, { "epoch": 0.62, "grad_norm": 9.183330535888672, "learning_rate": 2.0000000000000003e-06, "loss": 0.4128, "step": 248 }, { "Batch Mean": 1.211564064025879, "accuracy": 0.859375, "epoch": 0.62, "step": 248 }, { "epoch": 0.6225, "grad_norm": 7.7444915771484375, "learning_rate": 1.9868421052631582e-06, "loss": 0.3432, "step": 249 }, { "Batch Mean": 1.4931328296661377, "accuracy": 0.734375, "epoch": 0.6225, "step": 249 }, { "epoch": 0.625, "grad_norm": 9.798284530639648, "learning_rate": 1.973684210526316e-06, "loss": 0.467, "step": 250 }, { "Batch Mean": 1.1091797351837158, "accuracy": 0.8125, "epoch": 0.625, "step": 250 }, { "epoch": 0.6275, "grad_norm": 8.74545955657959, "learning_rate": 1.9605263157894736e-06, "loss": 0.4159, "step": 251 }, { "Batch Mean": 1.0245903730392456, "accuracy": 0.8046875, "epoch": 0.6275, "step": 251 }, { "epoch": 0.63, "grad_norm": 8.791230201721191, "learning_rate": 1.9473684210526315e-06, "loss": 0.4036, "step": 252 }, { "Batch Mean": 1.796776294708252, "accuracy": 0.828125, "epoch": 0.63, "step": 252 }, { "epoch": 0.6325, "grad_norm": 9.18313980102539, "learning_rate": 1.9342105263157895e-06, "loss": 0.4212, "step": 253 }, { "Batch Mean": 1.1153364181518555, "accuracy": 0.796875, "epoch": 0.6325, "step": 253 }, { "epoch": 0.635, "grad_norm": 9.925728797912598, "learning_rate": 1.9210526315789474e-06, "loss": 0.4159, "step": 254 }, { "Batch Mean": 1.2809149026870728, "accuracy": 0.796875, "epoch": 0.635, "step": 254 }, { "epoch": 0.6375, "grad_norm": 8.616195678710938, "learning_rate": 1.9078947368421057e-06, "loss": 0.4015, "step": 255 }, { "Batch Mean": 1.6480159759521484, "accuracy": 0.765625, "epoch": 0.6375, "step": 255 }, { "epoch": 0.64, "grad_norm": 10.048429489135742, "learning_rate": 1.8947368421052634e-06, "loss": 0.4418, "step": 256 }, { "Batch Mean": 1.6273455619812012, "accuracy": 0.7265625, "epoch": 0.64, "step": 256 }, { "epoch": 0.6425, "grad_norm": 9.716386795043945, "learning_rate": 1.8815789473684213e-06, "loss": 0.4678, "step": 257 }, { "Batch Mean": 1.7655820846557617, "accuracy": 0.7734375, "epoch": 0.6425, "step": 257 }, { "epoch": 0.645, "grad_norm": 8.57561206817627, "learning_rate": 1.868421052631579e-06, "loss": 0.4057, "step": 258 }, { "Batch Mean": 2.1368942260742188, "accuracy": 0.828125, "epoch": 0.645, "step": 258 }, { "epoch": 0.6475, "grad_norm": 9.588972091674805, "learning_rate": 1.855263157894737e-06, "loss": 0.3979, "step": 259 }, { "Batch Mean": 2.052062511444092, "accuracy": 0.8125, "epoch": 0.6475, "step": 259 }, { "epoch": 0.65, "grad_norm": 9.272086143493652, "learning_rate": 1.8421052631578948e-06, "loss": 0.4115, "step": 260 }, { "Batch Mean": 1.610414981842041, "accuracy": 0.796875, "epoch": 0.65, "step": 260 }, { "epoch": 0.6525, "grad_norm": 8.309432029724121, "learning_rate": 1.828947368421053e-06, "loss": 0.3825, "step": 261 }, { "Batch Mean": 1.932809829711914, "accuracy": 0.7578125, "epoch": 0.6525, "step": 261 }, { "epoch": 0.655, "grad_norm": 9.389504432678223, "learning_rate": 1.8157894736842109e-06, "loss": 0.4142, "step": 262 }, { "Batch Mean": 1.9246654510498047, "accuracy": 0.7734375, "epoch": 0.655, "step": 262 }, { "epoch": 0.6575, "grad_norm": 9.920475006103516, "learning_rate": 1.8026315789473685e-06, "loss": 0.4758, "step": 263 }, { "Batch Mean": 1.93519926071167, "accuracy": 0.796875, "epoch": 0.6575, "step": 263 }, { "epoch": 0.66, "grad_norm": 9.552947044372559, "learning_rate": 1.7894736842105265e-06, "loss": 0.4303, "step": 264 }, { "Batch Mean": 2.076547861099243, "accuracy": 0.8203125, "epoch": 0.66, "step": 264 }, { "epoch": 0.6625, "grad_norm": 9.472382545471191, "learning_rate": 1.7763157894736844e-06, "loss": 0.3981, "step": 265 }, { "Batch Mean": 1.9459846019744873, "accuracy": 0.90625, "epoch": 0.6625, "step": 265 }, { "epoch": 0.665, "grad_norm": 7.733583450317383, "learning_rate": 1.7631578947368423e-06, "loss": 0.3092, "step": 266 }, { "Batch Mean": 1.9703218936920166, "accuracy": 0.8515625, "epoch": 0.665, "step": 266 }, { "epoch": 0.6675, "grad_norm": 9.029424667358398, "learning_rate": 1.75e-06, "loss": 0.3787, "step": 267 }, { "Batch Mean": 2.1942596435546875, "accuracy": 0.8046875, "epoch": 0.6675, "step": 267 }, { "epoch": 0.67, "grad_norm": 9.263646125793457, "learning_rate": 1.736842105263158e-06, "loss": 0.4368, "step": 268 }, { "Batch Mean": 2.0240073204040527, "accuracy": 0.828125, "epoch": 0.67, "step": 268 }, { "epoch": 0.6725, "grad_norm": 8.806449890136719, "learning_rate": 1.723684210526316e-06, "loss": 0.379, "step": 269 }, { "Batch Mean": 2.138075828552246, "accuracy": 0.7734375, "epoch": 0.6725, "step": 269 }, { "epoch": 0.675, "grad_norm": 10.799676895141602, "learning_rate": 1.710526315789474e-06, "loss": 0.4921, "step": 270 }, { "Batch Mean": 1.9924249649047852, "accuracy": 0.8203125, "epoch": 0.675, "step": 270 }, { "epoch": 0.6775, "grad_norm": 10.256522178649902, "learning_rate": 1.6973684210526318e-06, "loss": 0.4736, "step": 271 }, { "Batch Mean": 2.115017890930176, "accuracy": 0.796875, "epoch": 0.6775, "step": 271 }, { "epoch": 0.68, "grad_norm": 8.95787239074707, "learning_rate": 1.6842105263157895e-06, "loss": 0.3887, "step": 272 }, { "Batch Mean": 2.272857666015625, "accuracy": 0.765625, "epoch": 0.68, "step": 272 }, { "epoch": 0.6825, "grad_norm": 10.41603946685791, "learning_rate": 1.6710526315789474e-06, "loss": 0.452, "step": 273 }, { "Batch Mean": 2.3034958839416504, "accuracy": 0.8125, "epoch": 0.6825, "step": 273 }, { "epoch": 0.685, "grad_norm": 9.792698860168457, "learning_rate": 1.6578947368421053e-06, "loss": 0.4109, "step": 274 }, { "Batch Mean": 2.2534396648406982, "accuracy": 0.734375, "epoch": 0.685, "step": 274 }, { "epoch": 0.6875, "grad_norm": 10.466201782226562, "learning_rate": 1.6447368421052635e-06, "loss": 0.52, "step": 275 }, { "Batch Mean": 2.1817493438720703, "accuracy": 0.78125, "epoch": 0.6875, "step": 275 }, { "epoch": 0.69, "grad_norm": 8.804483413696289, "learning_rate": 1.6315789473684212e-06, "loss": 0.4599, "step": 276 }, { "Batch Mean": 2.727586507797241, "accuracy": 0.84375, "epoch": 0.69, "step": 276 }, { "epoch": 0.6925, "grad_norm": 9.147492408752441, "learning_rate": 1.618421052631579e-06, "loss": 0.4186, "step": 277 }, { "Batch Mean": 2.453937530517578, "accuracy": 0.828125, "epoch": 0.6925, "step": 277 }, { "epoch": 0.695, "grad_norm": 8.768532752990723, "learning_rate": 1.605263157894737e-06, "loss": 0.4019, "step": 278 }, { "Batch Mean": 2.7397398948669434, "accuracy": 0.7734375, "epoch": 0.695, "step": 278 }, { "epoch": 0.6975, "grad_norm": 8.36785888671875, "learning_rate": 1.5921052631578949e-06, "loss": 0.4471, "step": 279 }, { "Batch Mean": 2.2289133071899414, "accuracy": 0.75, "epoch": 0.6975, "step": 279 }, { "epoch": 0.7, "grad_norm": 12.092652320861816, "learning_rate": 1.5789473684210526e-06, "loss": 0.5102, "step": 280 }, { "Batch Mean": 2.356043577194214, "accuracy": 0.8359375, "epoch": 0.7, "step": 280 }, { "epoch": 0.7025, "grad_norm": 8.004008293151855, "learning_rate": 1.5657894736842105e-06, "loss": 0.3634, "step": 281 }, { "Batch Mean": 2.659639596939087, "accuracy": 0.7734375, "epoch": 0.7025, "step": 281 }, { "epoch": 0.705, "grad_norm": 8.374983787536621, "learning_rate": 1.5526315789473686e-06, "loss": 0.4304, "step": 282 }, { "Batch Mean": 2.38856840133667, "accuracy": 0.7421875, "epoch": 0.705, "step": 282 }, { "epoch": 0.7075, "grad_norm": 9.618534088134766, "learning_rate": 1.5394736842105265e-06, "loss": 0.4379, "step": 283 }, { "Batch Mean": 2.196455955505371, "accuracy": 0.796875, "epoch": 0.7075, "step": 283 }, { "epoch": 0.71, "grad_norm": 8.226493835449219, "learning_rate": 1.5263157894736844e-06, "loss": 0.4122, "step": 284 }, { "Batch Mean": 2.7598533630371094, "accuracy": 0.765625, "epoch": 0.71, "step": 284 }, { "epoch": 0.7125, "grad_norm": 8.447372436523438, "learning_rate": 1.5131578947368421e-06, "loss": 0.451, "step": 285 }, { "Batch Mean": 2.280998468399048, "accuracy": 0.75, "epoch": 0.7125, "step": 285 }, { "epoch": 0.715, "grad_norm": 9.170207977294922, "learning_rate": 1.5e-06, "loss": 0.4512, "step": 286 }, { "Batch Mean": 2.208192825317383, "accuracy": 0.8203125, "epoch": 0.715, "step": 286 }, { "epoch": 0.7175, "grad_norm": 8.343831062316895, "learning_rate": 1.486842105263158e-06, "loss": 0.3791, "step": 287 }, { "Batch Mean": 2.5024871826171875, "accuracy": 0.8515625, "epoch": 0.7175, "step": 287 }, { "epoch": 0.72, "grad_norm": 7.829848289489746, "learning_rate": 1.4736842105263159e-06, "loss": 0.3538, "step": 288 }, { "Batch Mean": 2.4838662147521973, "accuracy": 0.7734375, "epoch": 0.72, "step": 288 }, { "epoch": 0.7225, "grad_norm": 8.380528450012207, "learning_rate": 1.460526315789474e-06, "loss": 0.4281, "step": 289 }, { "Batch Mean": 2.4715757369995117, "accuracy": 0.8046875, "epoch": 0.7225, "step": 289 }, { "epoch": 0.725, "grad_norm": 9.04012680053711, "learning_rate": 1.4473684210526317e-06, "loss": 0.4141, "step": 290 }, { "Batch Mean": 2.498483657836914, "accuracy": 0.84375, "epoch": 0.725, "step": 290 }, { "epoch": 0.7275, "grad_norm": 7.590081691741943, "learning_rate": 1.4342105263157896e-06, "loss": 0.33, "step": 291 }, { "Batch Mean": 2.728640556335449, "accuracy": 0.875, "epoch": 0.7275, "step": 291 }, { "epoch": 0.73, "grad_norm": 7.328402996063232, "learning_rate": 1.4210526315789475e-06, "loss": 0.3089, "step": 292 }, { "Batch Mean": 2.8817148208618164, "accuracy": 0.796875, "epoch": 0.73, "step": 292 }, { "epoch": 0.7325, "grad_norm": 9.085417747497559, "learning_rate": 1.4078947368421054e-06, "loss": 0.438, "step": 293 }, { "Batch Mean": 3.2919604778289795, "accuracy": 0.7890625, "epoch": 0.7325, "step": 293 }, { "epoch": 0.735, "grad_norm": 9.366758346557617, "learning_rate": 1.394736842105263e-06, "loss": 0.3997, "step": 294 }, { "Batch Mean": 3.028531551361084, "accuracy": 0.828125, "epoch": 0.735, "step": 294 }, { "epoch": 0.7375, "grad_norm": 9.361329078674316, "learning_rate": 1.3815789473684212e-06, "loss": 0.4117, "step": 295 }, { "Batch Mean": 3.166205406188965, "accuracy": 0.7890625, "epoch": 0.7375, "step": 295 }, { "epoch": 0.74, "grad_norm": 8.724553108215332, "learning_rate": 1.3684210526315791e-06, "loss": 0.3743, "step": 296 }, { "Batch Mean": 3.3602304458618164, "accuracy": 0.8125, "epoch": 0.74, "step": 296 }, { "epoch": 0.7425, "grad_norm": 9.30712604522705, "learning_rate": 1.355263157894737e-06, "loss": 0.4148, "step": 297 }, { "Batch Mean": 3.169215202331543, "accuracy": 0.828125, "epoch": 0.7425, "step": 297 }, { "epoch": 0.745, "grad_norm": 9.28420352935791, "learning_rate": 1.342105263157895e-06, "loss": 0.3247, "step": 298 }, { "Batch Mean": 3.769045829772949, "accuracy": 0.8125, "epoch": 0.745, "step": 298 }, { "epoch": 0.7475, "grad_norm": 9.424715995788574, "learning_rate": 1.3289473684210526e-06, "loss": 0.3602, "step": 299 }, { "Batch Mean": 3.6478939056396484, "accuracy": 0.859375, "epoch": 0.7475, "step": 299 }, { "epoch": 0.75, "grad_norm": 9.039104461669922, "learning_rate": 1.3157894736842106e-06, "loss": 0.3256, "step": 300 }, { "Batch Mean": 3.6405322551727295, "accuracy": 0.8125, "epoch": 0.75, "step": 300 }, { "epoch": 0.7525, "grad_norm": 9.519824981689453, "learning_rate": 1.3026315789473685e-06, "loss": 0.4135, "step": 301 }, { "Batch Mean": 3.9676380157470703, "accuracy": 0.828125, "epoch": 0.7525, "step": 301 }, { "epoch": 0.755, "grad_norm": 9.68959903717041, "learning_rate": 1.2894736842105266e-06, "loss": 0.3233, "step": 302 }, { "Batch Mean": 4.056262969970703, "accuracy": 0.8203125, "epoch": 0.755, "step": 302 }, { "epoch": 0.7575, "grad_norm": 11.665163040161133, "learning_rate": 1.2763157894736845e-06, "loss": 0.4197, "step": 303 }, { "Batch Mean": 3.99052095413208, "accuracy": 0.84375, "epoch": 0.7575, "step": 303 }, { "epoch": 0.76, "grad_norm": 9.797762870788574, "learning_rate": 1.2631578947368422e-06, "loss": 0.3341, "step": 304 }, { "Batch Mean": 4.704170227050781, "accuracy": 0.890625, "epoch": 0.76, "step": 304 }, { "epoch": 0.7625, "grad_norm": 10.098402976989746, "learning_rate": 1.25e-06, "loss": 0.3156, "step": 305 }, { "Batch Mean": 4.484651565551758, "accuracy": 0.796875, "epoch": 0.7625, "step": 305 }, { "epoch": 0.765, "grad_norm": 13.691995620727539, "learning_rate": 1.236842105263158e-06, "loss": 0.4064, "step": 306 }, { "Batch Mean": 3.8555450439453125, "accuracy": 0.8203125, "epoch": 0.765, "step": 306 }, { "epoch": 0.7675, "grad_norm": 11.670147895812988, "learning_rate": 1.223684210526316e-06, "loss": 0.4056, "step": 307 }, { "Batch Mean": 4.468677043914795, "accuracy": 0.84375, "epoch": 0.7675, "step": 307 }, { "epoch": 0.77, "grad_norm": 11.534550666809082, "learning_rate": 1.2105263157894738e-06, "loss": 0.4006, "step": 308 }, { "Batch Mean": 3.834693431854248, "accuracy": 0.7734375, "epoch": 0.77, "step": 308 }, { "epoch": 0.7725, "grad_norm": 11.872078895568848, "learning_rate": 1.1973684210526317e-06, "loss": 0.4734, "step": 309 }, { "Batch Mean": 3.850661277770996, "accuracy": 0.7734375, "epoch": 0.7725, "step": 309 }, { "epoch": 0.775, "grad_norm": 12.335707664489746, "learning_rate": 1.1842105263157894e-06, "loss": 0.4272, "step": 310 }, { "Batch Mean": 3.788878917694092, "accuracy": 0.734375, "epoch": 0.775, "step": 310 }, { "epoch": 0.7775, "grad_norm": 13.264825820922852, "learning_rate": 1.1710526315789476e-06, "loss": 0.4977, "step": 311 }, { "Batch Mean": 3.859971046447754, "accuracy": 0.78125, "epoch": 0.7775, "step": 311 }, { "epoch": 0.78, "grad_norm": 10.746715545654297, "learning_rate": 1.1578947368421053e-06, "loss": 0.4457, "step": 312 }, { "Batch Mean": 3.103464126586914, "accuracy": 0.7890625, "epoch": 0.78, "step": 312 }, { "epoch": 0.7825, "grad_norm": 10.573457717895508, "learning_rate": 1.1447368421052632e-06, "loss": 0.4286, "step": 313 }, { "Batch Mean": 3.299260139465332, "accuracy": 0.7890625, "epoch": 0.7825, "step": 313 }, { "epoch": 0.785, "grad_norm": 10.1765775680542, "learning_rate": 1.1315789473684213e-06, "loss": 0.4275, "step": 314 }, { "Batch Mean": 2.910822868347168, "accuracy": 0.734375, "epoch": 0.785, "step": 314 }, { "epoch": 0.7875, "grad_norm": 11.196432113647461, "learning_rate": 1.118421052631579e-06, "loss": 0.5264, "step": 315 }, { "Batch Mean": 3.1886849403381348, "accuracy": 0.8203125, "epoch": 0.7875, "step": 315 }, { "epoch": 0.79, "grad_norm": 9.3234281539917, "learning_rate": 1.1052631578947369e-06, "loss": 0.3845, "step": 316 }, { "Batch Mean": 2.9300460815429688, "accuracy": 0.7578125, "epoch": 0.79, "step": 316 }, { "epoch": 0.7925, "grad_norm": 10.788275718688965, "learning_rate": 1.0921052631578948e-06, "loss": 0.4715, "step": 317 }, { "Batch Mean": 2.96604061126709, "accuracy": 0.8203125, "epoch": 0.7925, "step": 317 }, { "epoch": 0.795, "grad_norm": 9.273138999938965, "learning_rate": 1.0789473684210527e-06, "loss": 0.4013, "step": 318 }, { "Batch Mean": 2.7190191745758057, "accuracy": 0.8359375, "epoch": 0.795, "step": 318 }, { "epoch": 0.7975, "grad_norm": 8.655777931213379, "learning_rate": 1.0657894736842106e-06, "loss": 0.3833, "step": 319 }, { "Batch Mean": 2.6934633255004883, "accuracy": 0.7734375, "epoch": 0.7975, "step": 319 }, { "epoch": 0.8, "grad_norm": 8.541173934936523, "learning_rate": 1.0526315789473685e-06, "loss": 0.4374, "step": 320 }, { "Batch Mean": 2.920466423034668, "accuracy": 0.734375, "epoch": 0.8, "step": 320 }, { "epoch": 0.8025, "grad_norm": 10.408062934875488, "learning_rate": 1.0394736842105264e-06, "loss": 0.5146, "step": 321 }, { "Batch Mean": 3.0105621814727783, "accuracy": 0.828125, "epoch": 0.8025, "step": 321 }, { "epoch": 0.805, "grad_norm": 8.558656692504883, "learning_rate": 1.0263157894736843e-06, "loss": 0.4025, "step": 322 }, { "Batch Mean": 2.5406768321990967, "accuracy": 0.7109375, "epoch": 0.805, "step": 322 }, { "epoch": 0.8075, "grad_norm": 10.365202903747559, "learning_rate": 1.013157894736842e-06, "loss": 0.5076, "step": 323 }, { "Batch Mean": 2.724036693572998, "accuracy": 0.8125, "epoch": 0.8075, "step": 323 }, { "epoch": 0.81, "grad_norm": 9.693260192871094, "learning_rate": 1.0000000000000002e-06, "loss": 0.4517, "step": 324 }, { "Batch Mean": 2.643817901611328, "accuracy": 0.7734375, "epoch": 0.81, "step": 324 }, { "epoch": 0.8125, "grad_norm": 9.147430419921875, "learning_rate": 9.86842105263158e-07, "loss": 0.4092, "step": 325 }, { "Batch Mean": 2.660895347595215, "accuracy": 0.8125, "epoch": 0.8125, "step": 325 }, { "epoch": 0.815, "grad_norm": 8.100640296936035, "learning_rate": 9.736842105263158e-07, "loss": 0.4067, "step": 326 }, { "Batch Mean": 2.5972213745117188, "accuracy": 0.7734375, "epoch": 0.815, "step": 326 }, { "epoch": 0.8175, "grad_norm": 8.55834674835205, "learning_rate": 9.605263157894737e-07, "loss": 0.4489, "step": 327 }, { "Batch Mean": 2.7025046348571777, "accuracy": 0.8125, "epoch": 0.8175, "step": 327 }, { "epoch": 0.82, "grad_norm": 7.632991313934326, "learning_rate": 9.473684210526317e-07, "loss": 0.4292, "step": 328 }, { "Batch Mean": 2.485902786254883, "accuracy": 0.8046875, "epoch": 0.82, "step": 328 }, { "epoch": 0.8225, "grad_norm": 8.363178253173828, "learning_rate": 9.342105263157895e-07, "loss": 0.3919, "step": 329 }, { "Batch Mean": 2.6269283294677734, "accuracy": 0.8203125, "epoch": 0.8225, "step": 329 }, { "epoch": 0.825, "grad_norm": 8.078348159790039, "learning_rate": 9.210526315789474e-07, "loss": 0.4515, "step": 330 }, { "Batch Mean": 2.464303970336914, "accuracy": 0.8203125, "epoch": 0.825, "step": 330 }, { "epoch": 0.8275, "grad_norm": 8.487800598144531, "learning_rate": 9.078947368421054e-07, "loss": 0.4312, "step": 331 }, { "Batch Mean": 2.7176826000213623, "accuracy": 0.84375, "epoch": 0.8275, "step": 331 }, { "epoch": 0.83, "grad_norm": 7.366917133331299, "learning_rate": 8.947368421052632e-07, "loss": 0.3685, "step": 332 }, { "Batch Mean": 2.4911954402923584, "accuracy": 0.828125, "epoch": 0.83, "step": 332 }, { "epoch": 0.8325, "grad_norm": 7.673061370849609, "learning_rate": 8.815789473684211e-07, "loss": 0.3904, "step": 333 }, { "Batch Mean": 2.8516788482666016, "accuracy": 0.8125, "epoch": 0.8325, "step": 333 }, { "epoch": 0.835, "grad_norm": 7.1761088371276855, "learning_rate": 8.68421052631579e-07, "loss": 0.3968, "step": 334 }, { "Batch Mean": 3.2786450386047363, "accuracy": 0.8125, "epoch": 0.835, "step": 334 }, { "epoch": 0.8375, "grad_norm": 6.7936177253723145, "learning_rate": 8.55263157894737e-07, "loss": 0.3717, "step": 335 }, { "Batch Mean": 3.0723023414611816, "accuracy": 0.828125, "epoch": 0.8375, "step": 335 }, { "epoch": 0.84, "grad_norm": 6.839074611663818, "learning_rate": 8.421052631578948e-07, "loss": 0.3472, "step": 336 }, { "Batch Mean": 2.7969651222229004, "accuracy": 0.890625, "epoch": 0.84, "step": 336 }, { "epoch": 0.8425, "grad_norm": 7.81821870803833, "learning_rate": 8.289473684210527e-07, "loss": 0.3287, "step": 337 }, { "Batch Mean": 3.1232268810272217, "accuracy": 0.7890625, "epoch": 0.8425, "step": 337 }, { "epoch": 0.845, "grad_norm": 9.474512100219727, "learning_rate": 8.157894736842106e-07, "loss": 0.4474, "step": 338 }, { "Batch Mean": 3.809293746948242, "accuracy": 0.78125, "epoch": 0.845, "step": 338 }, { "epoch": 0.8475, "grad_norm": 8.809016227722168, "learning_rate": 8.026315789473685e-07, "loss": 0.4153, "step": 339 }, { "Batch Mean": 3.514373302459717, "accuracy": 0.8203125, "epoch": 0.8475, "step": 339 }, { "epoch": 0.85, "grad_norm": 8.725719451904297, "learning_rate": 7.894736842105263e-07, "loss": 0.3813, "step": 340 }, { "Batch Mean": 3.5422677993774414, "accuracy": 0.78125, "epoch": 0.85, "step": 340 }, { "epoch": 0.8525, "grad_norm": 8.447900772094727, "learning_rate": 7.763157894736843e-07, "loss": 0.4199, "step": 341 }, { "Batch Mean": 3.5171260833740234, "accuracy": 0.8359375, "epoch": 0.8525, "step": 341 }, { "epoch": 0.855, "grad_norm": 8.496635437011719, "learning_rate": 7.631578947368422e-07, "loss": 0.3542, "step": 342 }, { "Batch Mean": 3.378173828125, "accuracy": 0.875, "epoch": 0.855, "step": 342 }, { "epoch": 0.8575, "grad_norm": 7.566522121429443, "learning_rate": 7.5e-07, "loss": 0.2996, "step": 343 }, { "Batch Mean": 3.8742589950561523, "accuracy": 0.8203125, "epoch": 0.8575, "step": 343 }, { "epoch": 0.86, "grad_norm": 8.366487503051758, "learning_rate": 7.368421052631579e-07, "loss": 0.3492, "step": 344 }, { "Batch Mean": 4.238893508911133, "accuracy": 0.7734375, "epoch": 0.86, "step": 344 }, { "epoch": 0.8625, "grad_norm": 10.260600090026855, "learning_rate": 7.236842105263158e-07, "loss": 0.4601, "step": 345 }, { "Batch Mean": 3.79160213470459, "accuracy": 0.828125, "epoch": 0.8625, "step": 345 }, { "epoch": 0.865, "grad_norm": 9.80223274230957, "learning_rate": 7.105263157894737e-07, "loss": 0.407, "step": 346 }, { "Batch Mean": 3.4316647052764893, "accuracy": 0.78125, "epoch": 0.865, "step": 346 }, { "epoch": 0.8675, "grad_norm": 9.012909889221191, "learning_rate": 6.973684210526316e-07, "loss": 0.4111, "step": 347 }, { "Batch Mean": 3.169463872909546, "accuracy": 0.8203125, "epoch": 0.8675, "step": 347 }, { "epoch": 0.87, "grad_norm": 7.685094356536865, "learning_rate": 6.842105263157896e-07, "loss": 0.368, "step": 348 }, { "Batch Mean": 3.5023093223571777, "accuracy": 0.8203125, "epoch": 0.87, "step": 348 }, { "epoch": 0.8725, "grad_norm": 8.767780303955078, "learning_rate": 6.710526315789475e-07, "loss": 0.3806, "step": 349 }, { "Batch Mean": 3.8760428428649902, "accuracy": 0.8046875, "epoch": 0.8725, "step": 349 }, { "epoch": 0.875, "grad_norm": 10.924077987670898, "learning_rate": 6.578947368421053e-07, "loss": 0.4436, "step": 350 }, { "Batch Mean": 3.915463447570801, "accuracy": 0.828125, "epoch": 0.875, "step": 350 }, { "epoch": 0.8775, "grad_norm": 11.166847229003906, "learning_rate": 6.447368421052633e-07, "loss": 0.4323, "step": 351 }, { "Batch Mean": 3.3088676929473877, "accuracy": 0.8359375, "epoch": 0.8775, "step": 351 }, { "epoch": 0.88, "grad_norm": 7.589693546295166, "learning_rate": 6.315789473684211e-07, "loss": 0.3519, "step": 352 }, { "Batch Mean": 2.881253957748413, "accuracy": 0.7890625, "epoch": 0.88, "step": 352 }, { "epoch": 0.8825, "grad_norm": 9.315062522888184, "learning_rate": 6.18421052631579e-07, "loss": 0.4301, "step": 353 }, { "Batch Mean": 2.906712055206299, "accuracy": 0.765625, "epoch": 0.8825, "step": 353 }, { "epoch": 0.885, "grad_norm": 11.210281372070312, "learning_rate": 6.052631578947369e-07, "loss": 0.4919, "step": 354 }, { "Batch Mean": 3.3116188049316406, "accuracy": 0.828125, "epoch": 0.885, "step": 354 }, { "epoch": 0.8875, "grad_norm": 8.310154914855957, "learning_rate": 5.921052631578947e-07, "loss": 0.3854, "step": 355 }, { "Batch Mean": 3.0483903884887695, "accuracy": 0.796875, "epoch": 0.8875, "step": 355 }, { "epoch": 0.89, "grad_norm": 9.462825775146484, "learning_rate": 5.789473684210526e-07, "loss": 0.4246, "step": 356 }, { "Batch Mean": 3.21954345703125, "accuracy": 0.8046875, "epoch": 0.89, "step": 356 }, { "epoch": 0.8925, "grad_norm": 8.243721008300781, "learning_rate": 5.657894736842106e-07, "loss": 0.3305, "step": 357 }, { "Batch Mean": 3.143207311630249, "accuracy": 0.7578125, "epoch": 0.8925, "step": 357 }, { "epoch": 0.895, "grad_norm": 8.959394454956055, "learning_rate": 5.526315789473684e-07, "loss": 0.4026, "step": 358 }, { "Batch Mean": 3.196836471557617, "accuracy": 0.7890625, "epoch": 0.895, "step": 358 }, { "epoch": 0.8975, "grad_norm": 9.289172172546387, "learning_rate": 5.394736842105264e-07, "loss": 0.4167, "step": 359 }, { "Batch Mean": 2.7712502479553223, "accuracy": 0.8359375, "epoch": 0.8975, "step": 359 }, { "epoch": 0.9, "grad_norm": 8.765880584716797, "learning_rate": 5.263157894736843e-07, "loss": 0.3261, "step": 360 }, { "Batch Mean": 3.0266494750976562, "accuracy": 0.8125, "epoch": 0.9, "step": 360 }, { "epoch": 0.9025, "grad_norm": 8.701887130737305, "learning_rate": 5.131578947368422e-07, "loss": 0.4457, "step": 361 }, { "Batch Mean": 3.071992874145508, "accuracy": 0.796875, "epoch": 0.9025, "step": 361 }, { "epoch": 0.905, "grad_norm": 9.102984428405762, "learning_rate": 5.000000000000001e-07, "loss": 0.4204, "step": 362 }, { "Batch Mean": 2.5766148567199707, "accuracy": 0.84375, "epoch": 0.905, "step": 362 }, { "epoch": 0.9075, "grad_norm": 7.755275249481201, "learning_rate": 4.868421052631579e-07, "loss": 0.372, "step": 363 }, { "Batch Mean": 3.2133255004882812, "accuracy": 0.78125, "epoch": 0.9075, "step": 363 }, { "epoch": 0.91, "grad_norm": 8.099285125732422, "learning_rate": 4.7368421052631585e-07, "loss": 0.4035, "step": 364 }, { "Batch Mean": 2.7393627166748047, "accuracy": 0.796875, "epoch": 0.91, "step": 364 }, { "epoch": 0.9125, "grad_norm": 8.071907043457031, "learning_rate": 4.605263157894737e-07, "loss": 0.4003, "step": 365 }, { "Batch Mean": 2.6384525299072266, "accuracy": 0.7890625, "epoch": 0.9125, "step": 365 }, { "epoch": 0.915, "grad_norm": 8.659152030944824, "learning_rate": 4.473684210526316e-07, "loss": 0.4324, "step": 366 }, { "Batch Mean": 3.1158814430236816, "accuracy": 0.8359375, "epoch": 0.915, "step": 366 }, { "epoch": 0.9175, "grad_norm": 7.912168979644775, "learning_rate": 4.342105263157895e-07, "loss": 0.363, "step": 367 }, { "Batch Mean": 2.7819793224334717, "accuracy": 0.8203125, "epoch": 0.9175, "step": 367 }, { "epoch": 0.92, "grad_norm": 9.396142959594727, "learning_rate": 4.210526315789474e-07, "loss": 0.436, "step": 368 }, { "Batch Mean": 2.770251750946045, "accuracy": 0.8046875, "epoch": 0.92, "step": 368 }, { "epoch": 0.9225, "grad_norm": 8.482458114624023, "learning_rate": 4.078947368421053e-07, "loss": 0.4448, "step": 369 }, { "Batch Mean": 2.6800403594970703, "accuracy": 0.828125, "epoch": 0.9225, "step": 369 }, { "epoch": 0.925, "grad_norm": 7.598038673400879, "learning_rate": 3.9473684210526315e-07, "loss": 0.3877, "step": 370 }, { "Batch Mean": 2.463380813598633, "accuracy": 0.7734375, "epoch": 0.925, "step": 370 }, { "epoch": 0.9275, "grad_norm": 9.041579246520996, "learning_rate": 3.815789473684211e-07, "loss": 0.4531, "step": 371 }, { "Batch Mean": 2.664177894592285, "accuracy": 0.84375, "epoch": 0.9275, "step": 371 }, { "epoch": 0.93, "grad_norm": 8.277569770812988, "learning_rate": 3.6842105263157896e-07, "loss": 0.3945, "step": 372 }, { "Batch Mean": 2.9936976432800293, "accuracy": 0.859375, "epoch": 0.93, "step": 372 }, { "epoch": 0.9325, "grad_norm": 8.517739295959473, "learning_rate": 3.5526315789473687e-07, "loss": 0.3599, "step": 373 }, { "Batch Mean": 2.63564395904541, "accuracy": 0.875, "epoch": 0.9325, "step": 373 }, { "epoch": 0.935, "grad_norm": 7.75486421585083, "learning_rate": 3.421052631578948e-07, "loss": 0.3049, "step": 374 }, { "Batch Mean": 2.2013025283813477, "accuracy": 0.8125, "epoch": 0.935, "step": 374 }, { "epoch": 0.9375, "grad_norm": 8.340994834899902, "learning_rate": 3.2894736842105264e-07, "loss": 0.3908, "step": 375 }, { "Batch Mean": 2.306203842163086, "accuracy": 0.84375, "epoch": 0.9375, "step": 375 }, { "epoch": 0.94, "grad_norm": 8.02235221862793, "learning_rate": 3.1578947368421055e-07, "loss": 0.3408, "step": 376 }, { "Batch Mean": 2.780491590499878, "accuracy": 0.75, "epoch": 0.94, "step": 376 }, { "epoch": 0.9425, "grad_norm": 11.336405754089355, "learning_rate": 3.0263157894736846e-07, "loss": 0.5006, "step": 377 }, { "Batch Mean": 2.413102388381958, "accuracy": 0.828125, "epoch": 0.9425, "step": 377 }, { "epoch": 0.945, "grad_norm": 8.683170318603516, "learning_rate": 2.894736842105263e-07, "loss": 0.374, "step": 378 }, { "Batch Mean": 2.5188517570495605, "accuracy": 0.8203125, "epoch": 0.945, "step": 378 }, { "epoch": 0.9475, "grad_norm": 8.190141677856445, "learning_rate": 2.763157894736842e-07, "loss": 0.4097, "step": 379 }, { "Batch Mean": 2.7684450149536133, "accuracy": 0.78125, "epoch": 0.9475, "step": 379 }, { "epoch": 0.95, "grad_norm": 8.226295471191406, "learning_rate": 2.6315789473684213e-07, "loss": 0.3831, "step": 380 }, { "Batch Mean": 2.796868324279785, "accuracy": 0.796875, "epoch": 0.95, "step": 380 }, { "epoch": 0.9525, "grad_norm": 9.279502868652344, "learning_rate": 2.5000000000000004e-07, "loss": 0.4314, "step": 381 }, { "Batch Mean": 2.3730950355529785, "accuracy": 0.78125, "epoch": 0.9525, "step": 381 }, { "epoch": 0.955, "grad_norm": 8.747193336486816, "learning_rate": 2.3684210526315792e-07, "loss": 0.4125, "step": 382 }, { "Batch Mean": 2.7471389770507812, "accuracy": 0.7734375, "epoch": 0.955, "step": 382 }, { "epoch": 0.9575, "grad_norm": 9.065400123596191, "learning_rate": 2.236842105263158e-07, "loss": 0.4453, "step": 383 }, { "Batch Mean": 2.758594274520874, "accuracy": 0.84375, "epoch": 0.9575, "step": 383 }, { "epoch": 0.96, "grad_norm": 8.173842430114746, "learning_rate": 2.105263157894737e-07, "loss": 0.3516, "step": 384 }, { "Batch Mean": 2.677278518676758, "accuracy": 0.8515625, "epoch": 0.96, "step": 384 }, { "epoch": 0.9625, "grad_norm": 7.212657451629639, "learning_rate": 1.9736842105263157e-07, "loss": 0.3345, "step": 385 }, { "Batch Mean": 3.0301685333251953, "accuracy": 0.859375, "epoch": 0.9625, "step": 385 }, { "epoch": 0.965, "grad_norm": 7.157632827758789, "learning_rate": 1.8421052631578948e-07, "loss": 0.325, "step": 386 }, { "Batch Mean": 2.6034302711486816, "accuracy": 0.8046875, "epoch": 0.965, "step": 386 }, { "epoch": 0.9675, "grad_norm": 8.646699905395508, "learning_rate": 1.710526315789474e-07, "loss": 0.364, "step": 387 }, { "Batch Mean": 2.835425853729248, "accuracy": 0.8359375, "epoch": 0.9675, "step": 387 }, { "epoch": 0.97, "grad_norm": 8.081703186035156, "learning_rate": 1.5789473684210527e-07, "loss": 0.3987, "step": 388 }, { "Batch Mean": 2.878167152404785, "accuracy": 0.828125, "epoch": 0.97, "step": 388 }, { "epoch": 0.9725, "grad_norm": 8.71593189239502, "learning_rate": 1.4473684210526316e-07, "loss": 0.3909, "step": 389 }, { "Batch Mean": 2.9658942222595215, "accuracy": 0.8671875, "epoch": 0.9725, "step": 389 }, { "epoch": 0.975, "grad_norm": 7.501869201660156, "learning_rate": 1.3157894736842107e-07, "loss": 0.3144, "step": 390 }, { "Batch Mean": 3.0065078735351562, "accuracy": 0.8125, "epoch": 0.975, "step": 390 }, { "epoch": 0.9775, "grad_norm": 9.999802589416504, "learning_rate": 1.1842105263157896e-07, "loss": 0.4321, "step": 391 }, { "Batch Mean": 3.0289103984832764, "accuracy": 0.84375, "epoch": 0.9775, "step": 391 }, { "epoch": 0.98, "grad_norm": 7.703149318695068, "learning_rate": 1.0526315789473685e-07, "loss": 0.3455, "step": 392 }, { "Batch Mean": 2.7720205783843994, "accuracy": 0.828125, "epoch": 0.98, "step": 392 }, { "epoch": 0.9825, "grad_norm": 8.7222261428833, "learning_rate": 9.210526315789474e-08, "loss": 0.3681, "step": 393 }, { "Batch Mean": 2.607419490814209, "accuracy": 0.8203125, "epoch": 0.9825, "step": 393 }, { "epoch": 0.985, "grad_norm": 8.8170747756958, "learning_rate": 7.894736842105264e-08, "loss": 0.3779, "step": 394 }, { "Batch Mean": 3.094576835632324, "accuracy": 0.84375, "epoch": 0.985, "step": 394 }, { "epoch": 0.9875, "grad_norm": 8.42007827758789, "learning_rate": 6.578947368421053e-08, "loss": 0.3663, "step": 395 }, { "Batch Mean": 2.809164524078369, "accuracy": 0.84375, "epoch": 0.9875, "step": 395 }, { "epoch": 0.99, "grad_norm": 7.778329849243164, "learning_rate": 5.263157894736842e-08, "loss": 0.3257, "step": 396 }, { "Batch Mean": 2.802082061767578, "accuracy": 0.7734375, "epoch": 0.99, "step": 396 }, { "epoch": 0.9925, "grad_norm": 10.571059226989746, "learning_rate": 3.947368421052632e-08, "loss": 0.4216, "step": 397 }, { "Batch Mean": 3.107623815536499, "accuracy": 0.8046875, "epoch": 0.9925, "step": 397 }, { "epoch": 0.995, "grad_norm": 8.333806991577148, "learning_rate": 2.631578947368421e-08, "loss": 0.4045, "step": 398 }, { "Batch Mean": 2.785451889038086, "accuracy": 0.8125, "epoch": 0.995, "step": 398 }, { "epoch": 0.9975, "grad_norm": 10.335576057434082, "learning_rate": 1.3157894736842106e-08, "loss": 0.4548, "step": 399 }, { "Batch Mean": 2.530060052871704, "accuracy": 0.765625, "epoch": 0.9975, "step": 399 }, { "epoch": 1.0, "grad_norm": 10.57154369354248, "learning_rate": 0.0, "loss": 0.4866, "step": 400 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }